In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from IPython.display import display

sns.set_theme()
plt.rcParams.update({'figure.facecolor':'white'})

# Load data and do basic formatting

In [None]:
df = pd.read_csv("../data/processed_sensor_dwd_train.csv", index_col=0)

In [None]:
# convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# convert pressure to hPa
# NOTE: this cell is not idempotent - every execution divides the pressure columns
# by 100 again, so reload the CSV (previous cell) before re-running it
df['pressure_sensors'] = df['pressure_sensors'] / 100
df['pressure_std'] = df['pressure_std'] / 100

# add sensor IDs: every distinct (lat, lon) pair gets a 1-based location_id
# (groupby sorts its keys, so IDs follow ascending lat, then lon)
df_location = df.groupby(['lat', 'lon']).count().reset_index()[['lat', 'lon']]
df_location['location_id'] = df_location.index+1
df = df.merge(df_location, on=['lat', 'lon'], how='left')

# define lists with columns
# no_data_cols: identifiers / metadata, sc_cols: sensor columns incl. std. dev.,
# dwd_cols: every remaining column (presumably the DWD weather data - verify against the CSV)
no_data_cols = ['location_id', 'timestamp', 'city', 'lat', 'lon']
sc_cols = sorted(['PM10', 'PM2p5', 'PM10_std', 'PM2p5_std', 'pressure_sensors', 'temperature_sensors', 'humidity_sensors', 'pressure_std', 'temperature_std', 'humidity_std'])
sc_cols_wo_std = [col for col in sc_cols if 'std' not in col]
dwd_cols = sorted([col for col in df.columns if (col not in no_data_cols and col not in sc_cols)])
std_cols = [col for col in sc_cols if 'std' in col]
data_cols_wo_std = sc_cols_wo_std + dwd_cols
data_cols = sc_cols + dwd_cols

# reorganize columns: first non-data columns, then sorted data columns
df = df.reindex(columns=no_data_cols + sc_cols + dwd_cols)
df

In [None]:
# save assignment of location_id to coordinates: one row per location_id holding
# the coordinates of the first observation of that location.
# drop_duplicates keeps the first occurrence, which is what the former row-wise
# lookup (`.iloc[0]` per location) returned, without rescanning df per location.
location_id_assignment = (
    df[['location_id', 'lat', 'lon']]
    .drop_duplicates(subset='location_id')
    .reset_index(drop=True)
)


In [None]:
df.info()

# Investigation of missing values, zeros and outliers

In [None]:
# Basic statistics of the whole sc dataset
df[sc_cols_wo_std].describe().T.round(1)

In [None]:
# Basic statistics of the sc dataset for Frankfurt
df[df['city']=='Frankfurt'][sc_cols_wo_std].describe().T.round(1)

In [None]:
# Basic statistics of the sc dataset for Bremen
df[df['city']=='Bremen'][sc_cols_wo_std].describe().T.round(1)

PM10: Mean is almost double the 75th percentile -> outliers raise the mean extremely </br>
PM2.5: similar to PM10, but less extreme </br>
humidity: all values (mean, 25th, 50th and 75th percentile) seem to be very large, and the max value is above 100, which doesn't make any sense </br>
pressure: assuming the units are Pa (1 bar = 100,000 Pa): min value is below 100 -> unrealistic, max value is also unrealistic (more than 60 bar) </br>
temperature: std seems very high (54 °C), min and max values are unrealistic </br>
 </br>
 Bremen vs. Frankfurt </br>
 PM10 and PM2.5: std for Bremen is double the std for Frankfurt </br>
 humidity: the 50th percentile for Bremen is already 99.9 %, which seems quite high </br>
 pressure and temperature: no obvious unrealistic observations besides the min and max values

In [None]:
print("missing values in each column")
for col in df.columns:
    print(f"{col}: {df[col].isna().sum()} ({round(df[col].isna().sum() / df.shape[0] * 100, 1)} %)")

In [None]:
print("value '0' in each column")
for col in df.columns:
    print(f"{col}: {df[df[col]==0][col].count()} ({round(df[df[col]==0][col].count() / df.shape[0] * 100, 1)} %)")

In [None]:
def count_nan_and_0s(df: pd.DataFrame, cols: list = None) -> pd.DataFrame:
    """Counts zeros and nans per column.

    Args:
        df (pd.DataFrame): Dataframe to search for zeros and nans.
        cols (list, optional): Columns to analyse. Any list-like of column names is
            accepted (e.g. ``df.columns``). If None, all columns are used. Defaults to None.

    Returns:
        pd.DataFrame: One row per metric ('missing_values', 'missing_values_share',
            '0_values', '0_values_share'; the shares are percentages of all rows of df).
            The first column 'metric' holds the metric name, every further column
            holds the values of one analysed column.
    """
    # `is None` instead of `== None`: `==` on an Index/array compares element-wise
    # and cannot be used as a condition
    cols = list(df.columns) if cols is None else list(cols)
    n_rows = df.shape[0]

    # count once, derive the shares from the counts
    missing = [df[col].isna().sum() for col in cols]
    zeros = [(df[col] == 0).sum() for col in cols]

    stats = pd.DataFrame(
        {
            'missing_values': missing,
            'missing_values_share': [count / n_rows * 100 for count in missing],
            '0_values': zeros,
            '0_values_share': [count / n_rows * 100 for count in zeros],
        },
        index=cols,
    )

    # transpose: metrics become rows, the analysed columns become columns
    df_nan_0 = stats.T.reset_index()
    df_nan_0.columns = ['metric'] + cols
    return df_nan_0


# find missing values and zeros in the sc dataset
df_data_analysis = count_nan_and_0s(df, data_cols)
df_data_analysis.round(1)

In [None]:
# define metrics and columns to plot
metrics = ["missing_values_share", "0_values_share"]
ys = [name for name in df_data_analysis.columns if name != 'metric']

# define size of subplot
columns = 4
rows = int(np.ceil(len(ys) / columns))

# plot
# squeeze=False keeps `ax` two-dimensional even if all subplots fit into one row
fig, ax = plt.subplots(rows, columns, figsize=(20,20), squeeze=False) # create subplots
plt.suptitle("Data analysis of missing values and zeros", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .5, wspace = .2, top = .93) # adjusts the space between the single subplots

# only the rows of the selected metrics are plotted
df_metrics = df_data_analysis[df_data_analysis['metric'].isin(metrics)]

# ax.flat walks the grid row by row, i.e. in the same order as the entries of ys
for position, axis in enumerate(ax.flat):
    if position >= len(ys):
        # delete not needed subplots
        fig.delaxes(axis)
        continue

    y = ys[position]
    # create a bar for each metric defined above for a column of ys list
    sns.barplot(data=df_metrics, x='metric', y=y, ax=axis)
    # set ylim to [0, 100] as we are plotting percentages
    axis.set_ylim([0, 100])
    # put the percentage above each plotted bar
    axis.bar_label(axis.containers[0], fmt='%.1f')
    # set the x, y and x-tick labels
    axis.set_xlabel("")
    axis.set_ylabel("Share of values in %")
    axis.set_xticklabels(labels=["Missing values", "Zeros"])
    # use the column name with slight changes as subplot name
    title = f"{y}".replace('_', ' ').replace('std', 'std. dev.').replace('2p5', '2.5').capitalize()
    axis.set_title(title, fontsize = 15)


In [None]:
# columns to plot
ys = data_cols_wo_std

# define size of subplot
columns = 3
rows = int(np.ceil((len(ys)) / columns))

# plot
# squeeze=False keeps `ax` two-dimensional even if all subplots fit into one row
fig, ax = plt.subplots(rows, columns, figsize=(20,20), squeeze=False) # create subplots
plt.suptitle("Outlier analysis", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .7, wspace = .2, top = .93) # adjusts the space between the single subplots

# ax.flat walks the grid row by row, i.e. in the same order as the entries of ys
for position, axis in enumerate(ax.flat):
    if position >= len(ys):
        # delete not needed subplots
        fig.delaxes(axis)
        continue

    y = ys[position]
    # scatter every observation of the column against its timestamp
    sns.scatterplot(data=df, x='timestamp', y=y, ax=axis, alpha=.3)
    # capitalize the x and y labels
    axis.set_xlabel(axis.get_xlabel().capitalize())
    axis.set_ylabel(axis.get_ylabel().capitalize())
    # use the column name with slight changes as subplot name
    title = f"{y}".replace('_', ' ').replace('std', 'std. dev.').replace('2p5', '2.5').capitalize()
    axis.set_title(title, fontsize = 15)
    axis.tick_params(labelrotation=90)


There are a few outliers in humidity, pressure and temperature which can be dropped by setting thresholds. </br>
For PM10 and PM2.5 this is less obvious, as the data is scattered over the whole possible range. 

# Delete unrealistic values and outliers for environmental variables

## hard thresholds based on physical estimations
We can first have a look at the extreme values measured by Deutscher Wetterdienst to get an impression what range of values is realistic.

In [None]:
print(df['humidity_dwd'].max())
print(df.query("city == 'Frankfurt'")['humidity_dwd'].min())
print(df.query("city == 'Bremen'")['humidity_dwd'].min())

In [None]:
print(df['pressure_dwd'].max())
print(df.query("city == 'Frankfurt'")['pressure_dwd'].min())
print(df.query("city == 'Bremen'")['pressure_dwd'].min())

In [None]:
print(df['temperature_dwd'].max())
print(df.query("city == 'Frankfurt'")['temperature_dwd'].min())
print(df.query("city == 'Bremen'")['temperature_dwd'].min())

In [None]:
# lower and upper plausibility limit per environmental sensor column
thresholds_env = {
    'humidity_sensors': (15, 100),
    'pressure_sensors': (960, 1050),
    'temperature_sensors': (-20, 60),
}

def del_hard_thresholds_env(df, thresholds_env=thresholds_env):
    """Replaces implausible environmental sensor values with NaN (in place).

    A value is replaced if it is less than or equal to the lower limit or greater
    than or equal to the upper limit of its column, i.e. the limits themselves
    are treated as implausible, too.

    Args:
        df (pd.DataFrame): Dataframe containing the columns named in thresholds_env.
            It is modified in place.
        thresholds_env (dict, optional): Maps a column name to a tuple
            (lower limit, upper limit). Defaults to the module level thresholds_env.

    Returns:
        None. The number of NaNs added per column is printed.
    """
    for col, (lower, upper) in thresholds_env.items():
        nan_before = df[col].isna().sum()
        # label-based boolean mask; NaNs compare False and stay untouched
        out_of_range = (df[col] <= lower) | (df[col] >= upper)
        df.loc[out_of_range, col] = np.nan
        print(f"added {df[col].isna().sum() - nan_before} nans in {col}")

print(df['temperature_sensors'].isna().sum())
del_hard_thresholds_env(df)
print(df['temperature_sensors'].isna().sum())


## values with std. dev. 'nan' or zero
If the standard deviation is 'nan', there was no or only one observation. If the standard deviation is zero, there was no fluctuation in the measured value, which can be assumed to be a measurement error.

In [None]:
# delete values for the defined columns if the standard deviation is zero or 'nan'
cols_env = [
    'temperature_sensors',
    'humidity_sensors',
    'pressure_sensors',
]

def del_std_nan_env(df, cols=cols_env):
    """Replaces values whose standard deviation is zero or missing with NaN (in place).

    The standard deviation column belonging to a value column is derived from the
    part of its name before the first underscore, e.g. 'temperature_sensors' ->
    'temperature_std'.

    Args:
        df (pd.DataFrame): Dataframe containing the value columns and their
            '<name>_std' columns. It is modified in place.
        cols (list, optional): Value columns to clean. Defaults to cols_env.

    Returns:
        None. The number of NaNs added per column is printed.
    """
    for col in cols:
        std_col = col.split('_')[0] + '_std'
        nan_before = df[col].isna().sum()
        # NaN never compares equal to anything (not even to np.nan), so missing
        # standard deviations have to be detected with isna()
        unreliable = (df[std_col] == 0) | df[std_col].isna()
        df.loc[unreliable, col] = np.nan
        print(f"added {df[col].isna().sum() - nan_before} nans in {col}")

print(df['temperature_sensors'].isna().sum())
del_std_nan_env(df)
print(df['temperature_sensors'].isna().sum())


## dynamic thresholds based on quantiles

In [None]:
# define quantiles as threshold: (lower quantile, upper quantile) per quantity
thresh = {
    'temperature': (.01, .85),
    'humidity': (.05, .95),
    'pressure': (.05, .95),
}


def del_dynamic_threshold_env(df, thresh=thresh):
    """Replaces values outside a per-city, per-timestamp quantile band with NaN (in place).

    For every key of thresh the column '<key>_sensors' is cleaned: within each
    group of rows sharing 'city' and 'timestamp' the lower and the upper quantile
    of the column are calculated, and values below the lower or above the upper
    quantile are replaced with NaN.

    Like the other cleaning functions of this notebook, df itself is modified.
    (An earlier version merged the thresholds into a new local dataframe and
    cleaned only that copy, so the caller never saw the result.)

    Args:
        df (pd.DataFrame): Dataframe containing 'city', 'timestamp' and the
            '<key>_sensors' columns. It is modified in place; no helper columns are added.
        thresh (dict, optional): Maps a quantity (e.g. 'temperature') to a tuple
            (lower quantile, upper quantile). Defaults to the module level thresh.

    Returns:
        None. The number of NaNs added per column is printed.
    """
    group_keys = ['city', 'timestamp']

    for quantity, (q_lower, q_upper) in thresh.items():
        col = f"{quantity}_sensors"
        nan_before = df[col].isna().sum()

        # transform broadcasts the quantile of a group back to every row of that
        # group, so the limits are aligned with df and no merge is needed
        per_group = df.groupby(group_keys)[col]
        lower = per_group.transform(lambda values: values.quantile(q=q_lower))
        upper = per_group.transform(lambda values: values.quantile(q=q_upper))

        # replace values below lower threshold and above upper threshold with 'nan'
        outside = (df[col] < lower) | (df[col] > upper)
        df.loc[outside, col] = np.nan
        print(f"{df[col].isna().sum() - nan_before} nans added in {col}")

print(df['temperature_sensors'].isna().sum())
del_dynamic_threshold_env(df)
print(df['temperature_sensors'].isna().sum())

In [None]:
# df_thresholds.columns

In [None]:
# # replace values below lower threshold and above upper threshold with 'nan'
# for col, thresholds in {
#     'temperature_sensors': ['temp_lower', 'temp_upper'],
#     'humidity_sensors': ['hum_lower', 'hum_upper'],
#     'pressure_sensors': ['pres_lower','pres_upper'],
# }.items():
#     nan_before = df[col].isna().sum()
#     df.loc[(df[col] < df[thresholds[0]]) | (df[col] > df[thresholds[1]]), col] = np.nan
#     print(f"{df[col].isna().sum() - nan_before} nans added in {col}")


In [None]:
# # drop columns used for dynamic thresholding
# df.drop([col for col in df_thresholds.columns if not col in no_data_cols], axis=1, inplace=True)

# Visualization of cleaned data and comparison with dwd data

In [None]:
def plot_sc_vs_dwd(city, columns=1, reduction=1):
    """Plots humidity, pressure and temperature of the sensors against the DWD data of one city.

    Reads the notebook level dataframe `df`. Every quantity gets its own subplot:
    the sensor values ('<quantity>_sensors') as scatter plot and the DWD values
    ('<quantity>_dwd') as red line, both over 'timestamp'.

    Args:
        city (str): Value of the 'city' column to plot.
        columns (int, optional): Number of subplot columns. Defaults to 1.
        reduction (int, optional): Only every reduction-th sensor observation
            is drawn. Defaults to 1 (all observations).
    """
    # quantity (column prefix) and the y-axis label of its subplot
    panels = [
        ('humidity', 'Relative Humidity in %'),
        ('pressure', 'Pressure in hPa'),
        ('temperature', 'Temperature in °C'),
    ]

    # define size of subplot
    rows = int(np.ceil(len(panels) / columns))

    # squeeze=False + ravel give a flat list of axes for every grid layout,
    # so the function also works with more than one column
    fig, ax = plt.subplots(rows, columns, figsize=(20,20), squeeze=False) # create subplots
    axes = ax.ravel()
    plt.suptitle(f"Comparison sensor data vs. dwd in {city}", fontsize=20) # title of plot
    fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
    plt.subplots_adjust(hspace = .2, wspace = .2, top = .95) # adjusts the space between the single subplots

    df_city = df[df['city'] == city]
    xlim_left = df['timestamp'].min()
    xlim_right = df['timestamp'].max()

    for axis, (quantity, ylabel) in zip(axes, panels):
        sensor_col = f"{quantity}_sensors"
        dwd_col = f"{quantity}_dwd"

        # plot the quantity from both datasets vs time
        sns.scatterplot(data=df_city[df_city[sensor_col].notna()][::reduction], x='timestamp', y=sensor_col, ax=axis, label='Sensor Community')
        sns.lineplot(data=df_city[df_city[dwd_col].notna()], x='timestamp', y=dwd_col, color='red', alpha=.5, ax=axis, label='Deutscher Wetterdienst')
        axis.set_ylabel(ylabel)

        # capitalize axis titles and add legend
        axis.legend(loc='lower right')
        axis.set_xlabel(axis.get_xlabel().capitalize())
        axis.set_xlim(xlim_left, xlim_right)

    # delete not needed subplots (only present if the grid has more cells than panels)
    for unused in axes[len(panels):]:
        fig.delaxes(unused)
    
    


In [None]:
# # Plot comparison of data from both sources for Frankfurt
# plot_sc_vs_dwd('Frankfurt')
# plt.savefig("../figures/EDA_sc_vs_dwd_Frankfurt.png", bbox_inches='tight')
# plt.close()
# ;

![EDA_sc_vs_dwd_Frankfurt.png](../figures/EDA_sc_vs_dwd_Frankfurt.png)


In [None]:
# # Plot comparison of data from both sources for Bremen
# plot_sc_vs_dwd('Bremen')
# plt.savefig("../figures/EDA_sc_vs_dwd_Bremen.png", bbox_inches='tight')
# plt.close()
# ;

![EDA_sc_vs_dwd_Bremen.png](../figures/EDA_sc_vs_dwd_Bremen.png)


In [None]:
# Example of the distribution of measured temperatures in one day
sns.histplot(data=df[(df['timestamp'] > '2020-07-01') & (df['timestamp'] < '2020-07-15')], x='temperature_sensors', bins=20);

# Investigation of single locations

In [None]:
# group by location_id and calculate the total number of hours with measurements, date of the first and of the last measurement
location_grouped = df[(df['PM10'].notna()) & (df['PM2p5'].notna())][['location_id', 'timestamp']].\
    groupby(['location_id']).\
        agg(
                hours = pd.NamedAgg(column='timestamp', aggfunc='count'), 
                date_min = pd.NamedAgg(column='timestamp', aggfunc='min'),
                date_max = pd.NamedAgg(column='timestamp', aggfunc='max')
            ).\
            reset_index().\
                sort_values('hours', ascending=False)

location_grouped['date_min'] = pd.to_datetime(location_grouped['date_min'])
location_grouped['date_max'] = pd.to_datetime(location_grouped['date_max'])
location_grouped['period_length'] = location_grouped['date_max'] - location_grouped['date_min'] + pd.Timedelta(days=1)
location_grouped['hours_per_day'] = location_grouped['hours'] / location_grouped['period_length'].dt.days
location_grouped.head(5)

In [None]:
# plot the number of hours that were measured at each location
plt.figure(figsize=(25, 10))
g = sns.barplot(data=location_grouped, x='location_id', y='hours', order=location_grouped.sort_values('hours', ascending=False)['location_id'])
g.set_xlabel(g.get_xlabel().capitalize().replace('_', ' '))
g.set_ylabel(g.get_ylabel().capitalize())
plt.xticks(rotation=90);

In [None]:
# plot the number of hours per day measured per location
plt.figure(figsize=(25, 10))
g = sns.barplot(data=location_grouped.sort_values('hours_per_day', ascending=False), x='location_id', y='hours_per_day', order=location_grouped.sort_values('hours_per_day', ascending=False)['location_id'])
g.set_xlabel(g.get_xlabel().capitalize().replace('_', ' '))
g.set_ylabel(g.get_ylabel().capitalize().replace('_', ' '))
plt.xticks(rotation=90);

In [None]:
print(f"Total number of locations: {location_grouped.shape[0]}")
print('Locations with the least hours of measurement:')
location_grouped.tail(20)

In [None]:
location_grouped[['hours', 'hours_per_day']].describe().T.round(1)

There are some sensor locations which delivered data for only a few hours.

In [None]:
def plot_all_PM(df):
    """Plots PM10 and PM2.5 over time for Frankfurt and Bremen, one line per location.

    Four subplots are stacked vertically: Frankfurt PM10, Frankfurt PM2.5,
    Bremen PM10 and Bremen PM2.5. Only every 10th row of a city is drawn.
    Lines are colored by 'location_id'; no legend is shown.

    Args:
        df (pd.DataFrame): Dataframe with the columns 'city', 'timestamp',
            'location_id', 'PM10' and 'PM2p5'.
    """
    fig, axes = plt.subplots(4, 1, figsize=(20, 20))
    plt.suptitle("Sensors per City", fontsize=20) # title of plot
    fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
    plt.subplots_adjust(hspace = .5, wspace = .2, top = .9) # adjusts the space between the single subplots

    # (city, data column, name of the data column in the title), one entry per subplot
    panels = [
        ('Frankfurt', 'PM10', 'PM10'),
        ('Frankfurt', 'PM2p5', 'PM2.5'),
        ('Bremen', 'PM10', 'PM10'),
        ('Bremen', 'PM2p5', 'PM2.5'),
    ]

    for axis, (city, col, label) in zip(axes, panels):
        sns.lineplot(data=df[df['city']==city][::10], x='timestamp', y=col, hue='location_id', ax=axis, legend=False)
        axis.set_title(f'{city} - {label}', fontsize = 15) # set title and font size
        axis.legend([], [], frameon=False) # hide legend

plot_all_PM(df)

# Example location (location_id=2)

In [None]:
# get location_id's occuring in Frankfurt
ids_frankfurt = df.query("city=='Frankfurt'")['location_id'].unique()

# plot PM10, PM2.5 and humidity of one location
plt.figure(figsize=(15, 8))
ax = sns.lineplot(data=df[df['location_id']==ids_frankfurt[0]], x='timestamp', y='PM10', color='b', alpha=.5)
sns.lineplot(data=df[df['location_id']==ids_frankfurt[0]], x='timestamp', y='PM2p5', color="r", alpha=.5, ax=ax)
ax2 = ax.twinx() # add second y-axis
sns.lineplot(data=df[df['location_id']==ids_frankfurt[0]], x='timestamp', y='humidity_sensors', color="g", alpha=.5, ax=ax2)



In [None]:
# Plot correlation heatmap for one single location
sns.heatmap(df[df['location_id']==ids_frankfurt[0]][sc_cols_wo_std].corr(), annot=True)

In [None]:
# make a dataframe containing timestamps of one year with resolution of one hour
one_year_full = pd.DataFrame()
one_year_full['timestamp'] = pd.date_range("2021-03-01", "2022-02-28 23:00:00", freq="H")

In [None]:
# add observations of one location to that dataframe
one_year_full_2 = pd.merge(one_year_full, df[df['location_id']==ids_frankfurt[0]], how='left', on='timestamp')
print(f"{one_year_full_2['PM10'].isna().sum()} missing values in PM10")
print(f"{one_year_full_2['PM2p5'].isna().sum()} missing values in PM2.5")

In [None]:
# get indices of observations where PM10 value is 'NaN'
missing_index = one_year_full_2.index[one_year_full_2['PM10'].isna()].tolist()

# list of (start, end, duration) for every period of consecutive missing values
missing_periods = []

# without any missing value there is no period to report
if missing_index:
    positions = np.asarray(missing_index)
    # a period ends wherever the next missing index is not the direct successor
    breaks = np.flatnonzero(np.diff(positions) != 1)
    period_starts = positions[np.concatenate(([0], breaks + 1))]
    period_ends = positions[np.concatenate((breaks, [len(positions) - 1]))]

    timestamps = one_year_full_2['timestamp']
    for first, last in zip(period_starts, period_ends):
        # the duration includes the last missing hour, hence the additional hour
        missing_periods.append(
            (timestamps[first],
            timestamps[last],
            timestamps[last] - timestamps[first] + pd.Timedelta(1, 'hour'))
        )

# print the periods of missing PM10 values and their duration
for p, (start, end, duration) in enumerate(missing_periods, start=1):
    print(f"Period of missing values #{p}:\n\tstart: {start}\n\tend: {end}\n\tduration: {duration}\n")

## Set dynamic thresholds for PM data

GOAL: Calculate a dynamic median per hour over all sensors in a city. If a value is, for example, more than three times that median, it is assumed to be an error.

In [None]:
def clean_pm(df: pd.DataFrame, cols: list=['PM10', 'PM2p5'], factor: int = 3) -> pd.DataFrame:
    """Deletes outliers that are larger than factor times the median of their city and timestamp.

    For every column in cols the median over all rows sharing 'city' and
    'timestamp' is calculated. Values above factor * median are replaced with NaN.
    Values without a threshold (median is NaN) are replaced with NaN as well.

    The input dataframe is not modified. The function can be applied repeatedly:
    helper columns of an earlier run are discarded before new ones are calculated.

    Args:
        df (pd.DataFrame): input dataframe with the columns 'city', 'timestamp' and cols
        cols (list): columns to clean
        factor (int, optional): factor that is used to calculate the threshold for keeping or deleting data. Defaults to 3.

    Returns:
        pd.DataFrame: cleaned dataframe with a fresh RangeIndex and the additional
            helper columns '<col>_median' and '<col>_threshold' for every cleaned column
    """
    # discard helper columns of an earlier run (drop returns a new dataframe,
    # so the caller's dataframe stays untouched)
    median_cols = [col + '_median' for col in cols]
    stale_cols = [name for name in df.columns if 'threshold' in name or name in median_cols]
    df = df.drop(columns=stale_cols).reset_index(drop=True)

    # median and threshold (factor * median) per city and timestamp, broadcast to every row
    group_keys = ['city', 'timestamp']
    for col in cols:
        df[col+'_median'] = df.groupby(group_keys)[col].transform('median')
        df[col+'_threshold'] = factor * df[col+'_median']

    # delete values if they are above the threshold and print number of deleted values
    for col in cols:
        nan_before = df[col].isna().sum()
        # where() keeps a value only if the condition holds, everything else becomes NaN
        df[col] = df[col].where(df[col] <= df[col+'_threshold'])
        print(f"{df[col].isna().sum() - nan_before} NaNs added in {col}")

    return df


df = clean_pm(df)


In [None]:
plot_all_PM(df)

In [None]:
def get_PM_data_per_location(df: pd.DataFrame) -> tuple:
    """Reshapes the PM data into one wide dataframe per city and PM sensor.

    Every returned dataframe is indexed by 'timestamp' (all timestamps occurring
    in df, in order of first appearance) and has one column per location_id of
    the city. A cell is NaN if the location has no row or no value for that
    timestamp. Timestamps are never dropped, which the former chain of inner
    merges did as soon as a single location lacked a row for them.

    Args:
        df (pd.Dataframe): Dataframe containing data of PM sensors with the
            columns 'timestamp', 'city', 'location_id', 'PM10' and 'PM2p5'

    Returns:
        tuple: Tuple containing one dataframe per city and PM sensor in the order
            Bremen PM10, Bremen PM2.5, Frankfurt PM10, Frankfurt PM2.5
    """
    all_timestamps = df['timestamp'].unique()

    def _values_per_location(city: str, col: str) -> pd.DataFrame:
        # one column per location of the city, one row per timestamp
        df_city = df[df['city'] == city]
        if df_city.empty:
            return pd.DataFrame(index=pd.Index(all_timestamps, name='timestamp'))
        locations = df_city['location_id'].dropna().unique()
        wide = (
            df_city
            .groupby(['timestamp', 'location_id'])[col]
            .first()  # first non-missing value, in case a pair occurs more than once
            .unstack('location_id')
            # all timestamps of df as rows; columns in order of first appearance
            .reindex(index=all_timestamps, columns=locations)
        )
        wide.index.name = 'timestamp'
        wide.columns.name = None
        return wide

    return (
        _values_per_location('Bremen', 'PM10'),
        _values_per_location('Bremen', 'PM2p5'),
        _values_per_location('Frankfurt', 'PM10'),
        _values_per_location('Frankfurt', 'PM2p5'),
    )


df_missing_values_bremen_pm10, df_missing_values_bremen_pm2p5, df_missing_values_frankfurt_pm10, df_missing_values_frankfurt_pm2p5 = get_PM_data_per_location(df)


In [None]:
# plot missing values per id for PM10 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm10.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Bremen', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Bremen
plt.figure(figsize=(30, 15))
g = sns.heatmap(df_missing_values_bremen_pm2p5.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Bremen', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Frankfurt
plt.figure(figsize=(30, 15))
g = sns.heatmap(df_missing_values_frankfurt_pm2p5.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Frankfurt', fontsize=20);

In [None]:
# plot missing values per id for PM10 in Frankfurt
plt.figure(figsize=(30, 15))
g = sns.heatmap(df_missing_values_frankfurt_pm10.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Frankfurt', fontsize=20);

# Drop sensors with only few data in the past year

In [None]:
# df.to_csv("../data/df_backup.csv")

In [None]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore")

# sns.set_theme()
# plt.rcParams.update({'figure.facecolor':'white'})

# df = pd.read_csv("../data/df_backup.csv", index_col=0)
# df['timestamp'] = pd.to_datetime(df['timestamp'])


In [None]:
def get_share_of_missing_values(df: pd.DataFrame, start_time: str):
    """Calculates the share of missing PM10 and PM2.5 values per location since start_time.

    The share is the number of NaN values of a location at or after start_time,
    divided by the number of rows the first location of df has in that period
    (used as the number of possible observations).

    Args:
        df (pd.DataFrame): Dataframe with the columns 'location_id', 'city',
            'timestamp', 'PM10' and 'PM2p5'.
        start_time (str): Start of the period of interest; anything accepted by
            pd.to_datetime.

    Returns:
        pd.DataFrame: One row per location with the columns 'location_id' (int),
            'city', 'PM10_missing' and 'PM2p5_missing' (shares between 0 and 1).
            Locations without any row in the period are reported as completely
            missing (share 1.0).
    """
    pm_cols = ['PM10', 'PM2p5']
    result_cols = ['location_id', 'city'] + [col + '_missing' for col in pm_cols]

    locations = df['location_id'].dropna().unique()
    if len(locations) == 0:
        return pd.DataFrame(columns=result_cols).astype({'location_id': int})

    # restrict to the period of interest once - start_time is used for the
    # denominator AND for the per-location counts
    recent = df[df['timestamp'] >= pd.to_datetime(start_time)]

    # Get the total number of observations possible in the period of interest
    observations_of_interest = (recent['location_id'] == locations[0]).sum()

    # collect one record per location and build the dataframe once at the end
    records = []
    for location in locations:
        df_cur = recent.loc[recent['location_id'] == location, ['city'] + pm_cols]
        if df_cur.empty:
            # no observation in the period at all: take the city from the complete data
            city = df.loc[df['location_id'] == location, 'city'].iloc[0]
            shares = {col + '_missing': 1.0 for col in pm_cols}
        else:
            city = df_cur['city'].iloc[0]
            shares = {
                col + '_missing': df_cur[col].isna().sum() / observations_of_interest
                for col in pm_cols
            }
        records.append({'location_id': int(location), 'city': city, **shares})

    missing_values = pd.DataFrame(records, columns=result_cols)
    # cast location_id to int
    missing_values['location_id'] = missing_values['location_id'].astype(int)
    return missing_values

In [None]:

missing_values = get_share_of_missing_values(df, '2021-01-01')
missing_values

In [None]:
fig, ax = plt.subplots(4,1,figsize=(20,15))
plt.suptitle("Missing values per city and sensor", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .4, wspace = .2, top = .93) # adjusts the space between the single subplots
i=0
# plot share of missing values for every city and PM sensor
for city in missing_values['city'].unique():
    for col in ['PM10_missing', 'PM2p5_missing']:
        sns.barplot(
            data=missing_values[missing_values['city']==city],
            x='location_id',
            y=col,
            order=missing_values[missing_values['city']==city].sort_values(col, ascending=False)['location_id'], # sort by missing values
            ax=ax[i]
        )
        ax[i].tick_params(labelrotation=90) # rotate x tick labels
        ax[i].set_title(city + ' - ' + col.split('_')[0].replace('p', '.')) # set a title (City - Sensor)
        i += 1


In [None]:
# get the IDs of good sensors having less than 25 % missing values in PM2.5
good_sensors = missing_values.query("PM2p5_missing < 0.25")['location_id']
good_sensors

In [None]:
# get the data of those good sensors
def use_good_sensors_only(df, good_sensors=good_sensors):
    """Return the rows of `df` that belong to good sensors, without helper columns.

    Parameters
    ----------
    df : DataFrame with a 'location_id' column.
    good_sensors : collection of location IDs to keep. The default is the
        `good_sensors` object that exists when this function is defined.

    Returns
    -------
    A new DataFrame restricted to the given locations, with every column whose
    name contains 'median' or 'threshold' removed.
    """
    helper_cols = [name for name in df.columns if 'median' in name or 'threshold' in name]
    is_good = df['location_id'].isin(good_sensors)
    return df[is_good].drop(helper_cols, axis=1)

df_good_sensors = use_good_sensors_only(df)

In [None]:
print(df.shape)
print(df_good_sensors.shape)

# Clean test data

In [None]:
# import test data
df_test = pd.read_csv("../data/processed_sensor_dwd_test.csv", index_col=0)


In [None]:
# assign location IDs according to coordinates
# A (lat, lon)-indexed lookup replaces the former row-wise apply: it is vectorised,
# and coordinates that are unknown in the train data become NaN instead of
# raising an IndexError, so the check below can actually report them.
location_lookup = location_id_assignment.set_index(['lat', 'lon'])['location_id']
test_coordinates = pd.MultiIndex.from_frame(df_test[['lat', 'lon']])
# to_numpy() assigns by position, so the existing index of df_test is left untouched
df_test['location_id'] = location_lookup.reindex(test_coordinates).to_numpy()

# number of test rows without a matching train location (expected: 0)
df_test['location_id'].isna().sum()

In [None]:
# convert timestamp to datetime
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

# convert pressure to hPa, mirroring the formatting applied to the train data,
# so the cleaning helpers and the saved train/test files use the same unit
# NOTE(review): assumes the raw test CSV stores pressure in the same unit as the
# raw train CSV — confirm before applying. Like the train cell, run this only once.
df_test['pressure_sensors'] = df_test['pressure_sensors'] / 100
df_test['pressure_std'] = df_test['pressure_std'] / 100

# sort columns: first non-data columns, then sorted sensor and DWD columns (same layout as train)
df_test = df_test.reindex(columns=no_data_cols + sc_cols + dwd_cols)

df_test.head()

In [None]:
# remove outliers of environmental parameters by different mechanisms
# Each step prints a label and then runs one cleaning helper on the test data.
# NOTE(review): the return values are discarded, so the helpers presumably
# modify df_test in place — confirm against their definitions earlier in the notebook.
print("hard thresholds")
del_hard_thresholds_env(df_test)

print("constant values")
del_std_nan_env(df_test)

print("dnyamic thersholds")
del_dynamic_threshold_env(df_test)

In [None]:
# plot all PM data
plot_all_PM(df_test)

In [None]:
# get missing values of PM data per sensor
df_missing_values_bremen_pm10_test, \
df_missing_values_bremen_pm2p5_test, \
df_missing_values_frankfurt_pm10_test, \
df_missing_values_frankfurt_pm2p5_test = get_PM_data_per_location(df_test)

In [None]:
# plot missing values per id for PM10 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm10_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Bremen', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm2p5_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Bremen', fontsize=20);

In [None]:
# plot missing values per id for PM10 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm10_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Frankfurt', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm2p5_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Frankfurt', fontsize=20);

In [None]:
# clean PM data
df_test = clean_pm(df_test)


In [None]:
# get data of sensors marked as good
df_good_sensors_test = use_good_sensors_only(df_test)

In [None]:
print(df_test.shape)
print(df_good_sensors_test.shape)

## Test data after cleaning

In [None]:
# number of 'good sensors' should be identical to locations in test dataframe
print(len(good_sensors))
df_good_sensors_test['location_id'].nunique()

In [None]:
# plot all PM data per location
plot_all_PM(df_good_sensors_test)

In [None]:
# get missing values per sensor
df_missing_values_bremen_pm10_test, \
df_missing_values_bremen_pm2p5_test, \
df_missing_values_frankfurt_pm10_test, \
df_missing_values_frankfurt_pm2p5_test = get_PM_data_per_location(df_good_sensors_test)

In [None]:
# plot missing values per id for PM10 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm10_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Bremen', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm2p5_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Bremen', fontsize=20);

In [None]:
# plot missing values per id for PM10 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm10_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Frankfurt', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm2p5_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Frankfurt', fontsize=20);

In [None]:
# get share of missing values in the cleaned test dataframe
missing_values_test = get_share_of_missing_values(df_good_sensors_test, "2021-01-01")

# make a series of good sensors in test data (less than 25 % missing in PM2.5)
good_sensors_test = missing_values_test.query("PM2p5_missing < 0.25")['location_id']


In [None]:
# get bad sensors in test dataframe: locations that are good in the train data
# but are not in good_sensors_test (i.e. not below 25 % missing PM2.5 in the test data)
good_test_ids = set(good_sensors_test) # built once for fast membership checks
bad_sensors = [location for location in good_sensors if location not in good_test_ids]

print(len(bad_sensors))
bad_sensors

# Update dataframes using only 'good sensors' and save cleaned train and test dataframe

In [None]:
# update train dataframe according to good sensors in test data
df_good_sensors = use_good_sensors_only(df, good_sensors_test)

# save train data for good sensors
df_good_sensors.to_csv("../data/cleaned_sensors_dwd_train.csv")

In [None]:
# make test dataframe containing only good sensors
df_good_sensors_test = use_good_sensors_only(df_test, good_sensors_test)

# save test data
df_good_sensors_test.to_csv("../data/cleaned_sensors_dwd_test.csv")

# Last check of missing data in the final dataframes

In [None]:
df_missing_values_bremen_pm10_test, \
df_missing_values_bremen_pm2p5_test, \
df_missing_values_frankfurt_pm10_test, \
df_missing_values_frankfurt_pm2p5_test = get_PM_data_per_location(df_good_sensors_test)

In [None]:
# plot missing values per id for PM10 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm10_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Bremen - Cleaned Test Data', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm2p5_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Bremen - Cleaned Test Data', fontsize=20);

In [None]:
# plot missing values per id for PM10 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm10_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Frankfurt - Cleaned Test Data', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm2p5_test.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Frankfurt - Cleaned Test Data', fontsize=20);

In [None]:
df_missing_values_bremen_pm10, \
df_missing_values_bremen_pm2p5, \
df_missing_values_frankfurt_pm10, \
df_missing_values_frankfurt_pm2p5 = get_PM_data_per_location(df_good_sensors)

In [None]:
# plot missing values per id for PM10 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm10.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Bremen - Cleaned Train Data', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Bremen
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_bremen_pm2p5.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Bremen - Cleaned Train Data', fontsize=20);

In [None]:
# plot missing values per id for PM10 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm10.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM10 - Frankfurt - Cleaned Train Data', fontsize=20);

In [None]:
# plot missing values per id for PM2.5 in Frankfurt
plt.figure(figsize=(30, 10))
g = sns.heatmap(df_missing_values_frankfurt_pm2p5.isna().T.sort_index(), cbar_kws={'label': 'Missing Data'})
g.set_title('PM2.5 - Frankfurt - Cleaned Train Data', fontsize=20);



# Plotting

In [None]:
df_good_sensors['location_id'].nunique()

## Plot PM2.5 concentration of the train time frame and add Corona lockdowns

In [None]:

# https://de.wikipedia.org/wiki/COVID-19-Pandemie_in_Deutschland#Reaktionen_und_Maßnahmen_der_Politik
# corona lockdowns (start date, end date, 'strength' (used for transparency))
corona_lockdowns = [
    ("2020-03-22", "2020-05-06", 3),
    ("2020-11-02", "2020-12-16", 1.5), # lockdown light
    ("2020-12-16", "2021-03-03", 3), # lockdown
    ("2021-04-23", "2021-06-03", 3), # Bundesnotbremse
]

# locations to plot and save, each with the upper limit of its y axis
y_limits = {
    125: 105,
    12: 80,
    11: 70,
    159: 75,
    84: 75,
    111: 80,
}

# the x range is the same for every figure: the train time frame padded by 10 days
x_limits = (
    pd.to_datetime('2020-01-01') - pd.Timedelta(10, 'D'),
    pd.to_datetime('2021-12-31') + pd.Timedelta(10, 'D'),
)

# set seaborn theme and set context to talk to increase the size of labels
sns.set_theme()
sns.set_context("talk")
with sns.axes_style("darkgrid"):

    for location, y_limit in y_limits.items():
        plt.figure(figsize=(25,5))
        location_rows = df_good_sensors[df_good_sensors['location_id']==location]
        g = sns.lineplot(data=location_rows, x='timestamp', y='PM2p5', label='PM2.5 conc.')

        g.set(
            xlabel='',
            ylabel=g.get_ylabel().replace('p', '.') + ' in µg/m$^3$',
            facecolor='#EEEEEE',
            xlim=x_limits,
            ylim=(-5, y_limit) # set y_lim according to the values specified above
        )

        # add corona lockdowns as red boxes; the 'strength' controls the transparency
        for start, end, strength in corona_lockdowns:
            g.axvspan(pd.to_datetime(start), pd.to_datetime(end), alpha=strength/10, color='red', label='Corona Lockdown')

        # every box carries the same label, so show only the first two legend entries (PM and lockdown)
        handles, labels = g.get_legend_handles_labels()
        g.legend(handles=handles[:2], labels=labels[:2], frameon=False)

        # save figure
        plt.savefig(f'../figures/EDA_PM2p5_lockdowns_{location}.png', transparent=True, bbox_inches='tight')



In [None]:
# Plot the PM2.5 series of one location week by week (one figure per week of 2021).

# boundaries of the weeks to plot
weeks = pd.date_range(pd.to_datetime('2021-01-01'), pd.to_datetime('2021-12-31'), freq='W')

# location to plot
location=125 # good: 80

# data and title do not change from week to week, so prepare them once
location_rows = df_good_sensors[df_good_sensors['location_id']==location]
# use city and location_id as title
plot_title = str(location_rows['city'].iloc[0]) + ' (ID: ' + str(location) + ')'

sns.set_theme()
sns.set_context("talk")
with sns.axes_style("darkgrid"):
    for week_start, week_end in zip(weeks[:-1], weeks[1:]):
        plt.figure(figsize=(25,5))

        # the whole series is drawn; the x limits below cut out the current week
        g = sns.lineplot(data=location_rows, x='timestamp', y='PM2p5', label='PM2.5 conc.', legend=False)

        g.set_title(plot_title)
        g.set(
            xlabel='',
            ylabel=g.get_ylabel().replace('p', '.') + ' in µg/m$^3$',
            xlim=(week_start, week_end), # set x_lim to show only one week
            ylim=(-5, 100),
        )
    




In [None]:
# add calendar features and calculate the mean PM values per location, week, weekday and hour
df_good_sensors['weekday'] = df_good_sensors['timestamp'].dt.weekday
df_good_sensors['day_name'] = df_good_sensors['timestamp'].dt.day_name()
# .dt.week is deprecated (removed in pandas 2.0); isocalendar().week yields the same ISO week number
df_good_sensors['week'] = df_good_sensors['timestamp'].dt.isocalendar().week
df_good_sensors['hour'] = df_good_sensors['timestamp'].dt.hour
# select the PM columns before aggregating so that no other column is averaged
df_good_sensors_weekdays = (
    df_good_sensors
    .groupby(['location_id', 'city', 'week', 'weekday', 'day_name', 'hour'])[['PM10', 'PM2p5']]
    .mean()
    .reset_index()
)


In [None]:
# change the list in the loop header to define which locations to show
for location in [125]:
    fig, ax = plt.subplots(1, 7, figsize=(30,10))
    fig.suptitle(f'ID: {location}') # add ID as super title

    # one panel per weekday: PM2.5 over the hour of day for this location
    is_location = df_good_sensors_weekdays['location_id']==location
    for day in df_good_sensors_weekdays['weekday'].unique():
        data = df_good_sensors_weekdays[(df_good_sensors_weekdays['weekday']==day) & is_location]
        g = sns.lineplot(data=data, x='hour', y='PM2p5', ax=ax[day])
        g.set(ylim=(0, 26), title=data['day_name'].iloc[0])

    # use the y label only for the first subplot
    pm_label = g.get_ylabel().replace('p', '.') + ' in µg/m$^3$'
    for position, a in enumerate(ax):
        a.set_ylabel(pm_label if position == 0 else '')
    plt.show()

In [None]:
fig, ax = plt.subplots(1, 7, figsize=(30,10))
fig.suptitle('Average over all locations and all weeks') # super title for the row of panels

# one panel per weekday: PM2.5 over the hour of day, pooled over all locations and weeks
for day in df_good_sensors_weekdays['weekday'].unique():
    data = df_good_sensors_weekdays[df_good_sensors_weekdays['weekday']==day]
    g = sns.lineplot(data=data, x='hour', y='PM2p5', ax=ax[day])
    g.set(ylim=(0, 10), xlim=(0, 23), title=data['day_name'].iloc[0])

# use the y label only for the first subplot
pm_label = g.get_ylabel().replace('p', '.') + ' in µg/m$^3$'
for position, a in enumerate(ax):
    a.set_ylabel(pm_label if position == 0 else '')
plt.show()

In [None]:
location = 125
# Plot average PM2.5 for one week
plt.figure(figsize=(20,8))
g=sns.lineplot(data=df_good_sensors_weekdays[df_good_sensors_weekdays['location_id']==location], x='weekday', y='PM2p5')
g.set_ylim(0,g.get_ylim()[1]);

## Correlation of PM2p5 with features (regressors for Prophet) for final train data
### First look at one location

In [None]:
# (re)load the cleaned train data saved above, so this section can also be run on its own
# NOTE: this always replaces df_good_sensors, including the weekday/hour columns added earlier.
# The file was written with its index as first column; read it as index and reset it,
# so no 'Unnamed: 0' column ends up in the data.
df_good_sensors = pd.read_csv("../data/cleaned_sensors_dwd_train.csv", index_col=0).reset_index(drop=True)
# convert timestamp to datetime
df_good_sensors['timestamp'] = pd.to_datetime(df_good_sensors['timestamp'])
df_good_sensors

In [None]:
# looking for most complete location: number of available PM2.5 values per location
pm2p5_counts = df_good_sensors.groupby(['location_id'], dropna=False).PM2p5.count().sort_values()
# sorted ascending, so the most complete locations are at the end;
# display them explicitly because the assignment below produces no cell output
display(pm2p5_counts.tail(10))
# location chosen by hand for the single-location correlation map below
location = 98


In [None]:
# correlation heatmap (lower triangle only) of PM and DWD weather columns for the selected location
columns_plot = ['PM10', 'PM2p5', 'humidity_dwd', 'precip', 'pressure_dwd', 'pressure_sealevel', 'temperature_dwd', 'wind_direction', 'wind_speed']
location_rows = df_good_sensors[df_good_sensors['location_id'] == location]
corr_mtrx = location_rows[columns_plot].corr()
# hide the upper triangle including the diagonal
mask = np.triu(np.ones_like(corr_mtrx, dtype=bool))
fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_mtrx, mask=mask, vmin=-1, vmax=1, annot=True, fmt='.2f', cmap="YlGnBu_r", ax=heatmap_ax)

> PM2p5 correlates for this location most with humidity, temperature and wind speed.

### Second look at all locations

In [None]:
# columns to be considered for correlation
columns_corr = ['PM10', 'PM2p5', 'humidity_dwd', 'pressure_dwd', 'precip', 'wind_direction', 'wind_speed', 'temperature_dwd']
# weather features = prophet regressors (everything except the PM columns)
regressor_cols = [col for col in columns_corr if col not in ('PM10', 'PM2p5')]
# created sorted list of location_ids
location_list = np.sort(df_good_sensors.location_id.unique())

# Pearson correlation of PM2p5 with every regressor, collected per location.
# The PM2p5 row is selected by label, so the result does not depend on column order.
pm2p5_corr_by_location = {}
for i in location_list:
    corr_mtrx = df_good_sensors.query(f'location_id == {i}')[columns_corr].corr(method='pearson')
    pm2p5_corr_by_location[i] = corr_mtrx.loc['PM2p5', regressor_cols]

# build the frame once: one row per location_id, one column per regressor
df_PM2p5_correlations = pd.DataFrame(pm2p5_corr_by_location).T[regressor_cols]

display(df_PM2p5_correlations)

In [None]:
# line plot of the PM2p5 correlations over the location ids, one line per column
corr_ax = df_PM2p5_correlations.plot(figsize=(20,12))
corr_ax.set_title('                        Frankfurt                                                                          Bremen', fontsize=20)
corr_ax.set_xlabel('location_id', fontsize=20)
corr_ax.set_ylabel('correlation with PM2p5', fontsize=20)
# the legend is created before the separator line is drawn, so the line gets no entry
corr_ax.legend(fontsize=20)
corr_ax.set_ylim(-0.5,0.5)
corr_ax.set_xlim(0)
# vertical black line at x = 124.5 separating the two cities named in the title
corr_ax.plot([124.5, 124.5], [-0.5, 0.5], linewidth=5, color='black')

Correlations with PM2p5
> * Humidity, temperature and wind speed are the most important features for most locations.
> * For Bremen temperature and wind speed seem a little less important.
> * For Frankfurt pressure seems to play a role, in contrast to Bremen.
> * Precipitation shows unexpectedly no correlation with PM2p5. This is maybe due to a time shift: When it's raining now the PM values will increase within some hours...
> * Some locations show no correlation at all.

Let's have a deeper look at those correlations.

In [None]:
# prepare dataframe with rounded values for histplot
df_round = df_good_sensors.round(0)

In [None]:
# box plots (boxes only: no whiskers, no fliers) of each weather feature per rounded PM2.5 value, split by city
plt.figure(figsize=(20,20))
plt.suptitle('25-75 % of feature values vs. PM2.5 for all locations')

boxplot_features = ['humidity_dwd', 'pressure_dwd', 'wind_speed', 'temperature_dwd']
for panel, feature in enumerate(boxplot_features, start=1):
    plt.subplot(4, 1, panel)
    sns.boxplot(data=df_round, y=feature, x='PM2p5', hue='city', showfliers=False, whis=0)
    plt.xlim(0,30)

> Same findings as above when looking at all locations at once.

In [None]:
# calc means per city
# the columns are selected with a list; indexing a groupby with a bare tuple
# of names is deprecated and no longer works in pandas 2
mean_cols = ['PM10', 'PM2p5', 'humidity_dwd', 'precip', 'pressure_dwd', 'temperature_dwd', 'wind_direction', 'wind_speed']
df_mean_per_city = df_good_sensors.groupby(['city'])[mean_cols].mean().reset_index()
df_mean_per_city


### Figure out which locations show hardly a correlation between PM2p5 and features

In [None]:
# create column for location_id
df_PM2p5_correlations['location_id'] = df_PM2p5_correlations.index

# add city column: look the city up in the data instead of relying on a fixed id threshold
city_by_location = df_good_sensors.groupby('location_id')['city'].first()
df_PM2p5_correlations['city'] = df_PM2p5_correlations['location_id'].map(city_by_location)

# calculate sum of correlation absolutes over all weather features
# (skipna=False keeps the result NaN if any single correlation is NaN, like a plain sum with +)
weather_features = ['humidity_dwd', 'precip', 'pressure_dwd', 'temperature_dwd', 'wind_direction', 'wind_speed']
df_PM2p5_correlations['corr_sum_abs'] = df_PM2p5_correlations[weather_features].abs().sum(axis=1, skipna=False)

display(df_PM2p5_correlations)

In [None]:
# bar chart of corr_sum_abs per location, ordered from smallest to largest and coloured by city
corr_sorted = df_PM2p5_correlations.sort_values('corr_sum_abs')
plt.figure(figsize=(20,8))
sns.barplot(data=corr_sorted, x='location_id', y='corr_sum_abs', hue='city', order=corr_sorted['location_id'])
plt.xticks(rotation=90);

> * corr_sum_abs could be a measure for the quality of our PM prediction. If corr_sum_abs is small, there are few correlations with regressors and no good prediction can be expected?
> * Why are the PM values in Bremen less correlated to weather data? Is it a question of mean PM concentration?

In [None]:
# add PM2p5 statistics per location for comparison
# only the PM2p5 column is aggregated; aggregating the whole frame is wasteful
# and fails on non-numeric columns in recent pandas versions
pm2p5_by_location = df_good_sensors.groupby(['location_id'])['PM2p5']
# the results are indexed by location_id and are aligned with the index of df_PM2p5_correlations
df_PM2p5_correlations['PM2p5_mean'] = pm2p5_by_location.mean()
df_PM2p5_correlations['PM2p5_median'] = pm2p5_by_location.median()
df_PM2p5_correlations['PM2p5_quantile_99'] = pm2p5_by_location.quantile(0.99)
df_PM2p5_correlations['PM2p5_max'] = pm2p5_by_location.max()
df_PM2p5_correlations

In [None]:
# corr_sum_abs against three PM2p5 statistics (median, 99 % quantile, maximum), one panel each
plt.figure(figsize=(30,8))
pm_stats = ['PM2p5_median', 'PM2p5_quantile_99', 'PM2p5_max']
for panel, pm_stat in enumerate(pm_stats, start=1):
    plt.subplot(1, 3, panel)
    sns.scatterplot(data=df_PM2p5_correlations, x=pm_stat, y='corr_sum_abs', style='city', s=200)
    # start both axes at zero
    plt.xlim(0)
    plt.ylim(0)


> * Median (left): Locations with generally small PM2p5 values show little correlation with weather data. This is reasonable as wind or humidity will not reduce PM if it's not there.
> * Overall, the correlation with weather data is stronger for Frankfurt than for Bremen, even though both cities cover a comparably wide range of PM values. This is mostly due to the missing correlation with air pressure in Bremen.

### Have a deeper look at precipitation
Is there really no correlation with PM?

In [None]:
# plot PM as a function of precipitation without and with a time shift of -1 hour
# The shift is taken per sensor on the complete series and the rain filter is
# applied afterwards. Shifting the already filtered rows (as done before) pairs a
# rainy hour with the next rainy row, possibly of another sensor, not with the next hour.
# NOTE(review): assumes the rows of each location are in chronological, hourly order — confirm.
is_raining = df_good_sensors['precip'] > 0
precip_rain = df_good_sensors.loc[is_raining, 'precip']

fig, ax = plt.subplots(2, 1, figsize=(20,15))
for pm_ax, pm_col in zip(ax, ['PM10', 'PM2p5']):
    # PM value of the following row of the same sensor
    pm_next_hour = df_good_sensors.groupby('location_id')[pm_col].shift(-1)
    sns.lineplot(x=precip_rain, y=pm_next_hour[is_raining], label='with shift -1', ax=pm_ax)
    sns.lineplot(x=precip_rain, y=df_good_sensors.loc[is_raining, pm_col], label='without shift', ax=pm_ax)
    pm_ax.set_xlim(0)
    pm_ax.set_ylim(0)
    # labels are attached to the lines, so the legend only lists what is actually drawn
    pm_ax.legend()

> There is actually no correlation between PM values and amount of precipitation.

In [None]:
# rain indicator: 1 where precipitation is greater than zero, otherwise (including missing values) 0
df_good_sensors['precip_bool'] = (df_good_sensors['precip'] > 0).astype('int64')

In [None]:
# caculate statstics for PM values depending on rain (0 = no precipitation, 1 = precipitation)
df_good_sensors.groupby('precip_bool').describe()[['PM2p5', 'PM10']].T


> When it's raining, the maximum PM values are clearly smaller than without rain. For all other statistics there is no clear dependency. As a consequence, precipitation seems to have no real impact on PM values in Bremen and Frankfurt.

This observation can be explained by the comparatively low PM concentrations in the given cities. For Beijing it was shown that "the washing process of rainfall strongly affects PM2.5, which decreased to 10–30 μg/m3 with 5 mm of rainfall."

In [None]:
# calculate per location: lagged PM2.5 values, changes of PM against earlier rows and a rolling mean
# The frames are collected in a list and concatenated once (instead of growing a frame inside the loop).
delta_frames = []
for location in df_good_sensors['location_id'].unique():
    # explicit copy, so the new columns are not written into a slice of df_good_sensors
    df_temp = df_good_sensors.loc[
        df_good_sensors['location_id']==location,
        ['location_id', 'timestamp', 'city', 'PM10', 'PM2p5', 'precip']
    ].copy()
    pm2p5 = df_temp['PM2p5']
    pm10 = df_temp['PM10']

    # PM2.5 of the 1st, 2nd and 3rd previous row
    df_temp['PM2p5_shifted_1'] = pm2p5.shift(periods=1)
    df_temp['PM2p5_shifted_2'] = pm2p5.shift(periods=2)
    df_temp['PM2p5_shifted_3'] = pm2p5.shift(periods=3)

    # delta = previous value - current value (positive when the concentration dropped);
    # the percentage is relative to the current value
    df_temp['PM2p5_delta'] = df_temp['PM2p5_shifted_1'] - pm2p5
    df_temp['PM2p5_delta_percent'] = df_temp['PM2p5_delta'] / pm2p5 * 100

    df_temp['PM2p5_delta_2'] = df_temp['PM2p5_shifted_2'] - pm2p5
    df_temp['PM2p5_delta_2_percent'] = df_temp['PM2p5_delta_2'] / pm2p5 * 100

    df_temp['PM10_delta'] = pm10.shift(periods=1) - pm10
    df_temp['PM10_delta_percent'] = df_temp['PM10_delta'] / pm10 * 100

    # mean over the current and the four previous rows
    df_temp['PM2p5_rolling'] = pm2p5.rolling(window=5).mean()

    delta_frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame
df_delta_pm = pd.concat(delta_frames) if delta_frames else pd.DataFrame()

df_delta_pm

In [None]:
# count how often each precipitation intensity occurs per city
precip_counts = df_delta_pm[['precip', 'city', 'location_id']].groupby(['city', 'precip']).count()
df_precip = precip_counts.reset_index().rename(columns={'location_id': 'count'})

# total number of observations per city
is_bremen = df_precip['city']=='Bremen'
is_frankfurt = df_precip['city']=='Frankfurt'
sum_bremen = df_precip.loc[is_bremen, 'count'].sum()
sum_frankfurt = df_precip.loc[is_frankfurt, 'count'].sum()

# share (in percent) of each precipitation intensity within its city
df_precip['percent'] = np.nan
df_precip.loc[is_bremen, 'percent'] = df_precip.loc[is_bremen, 'count'] / sum_bremen * 100
df_precip.loc[is_frankfurt, 'percent'] = df_precip.loc[is_frankfurt, 'count'] / sum_frankfurt * 100

# plot precipitation; the y axis is capped at 3 %
plt.figure(figsize=(25, 10))
g = sns.barplot(data=df_precip, x='precip', y='percent', hue='city')
g.axes.tick_params(labelrotation=90)
g.set_ylim(0, 3);

In roughly 90 % of all hours in this data no rain occurred.

In [None]:
plt.figure(figsize=(20, 10))
g = sns.scatterplot(data=df_delta_pm, x='precip', y='PM2p5_delta', hue='city')
g.set_ylim(-50, 50)

In [None]:
plt.figure(figsize=(20, 10))
g = sns.scatterplot(data=df_delta_pm, x='precip', y='PM2p5_delta_percent', hue='city')
g.set_ylim(-200, 200)

In [None]:
# correlation heatmap (lower triangle only) of all numeric columns of df_delta_pm
plt.figure(figsize=(15, 10))
# compute the matrix once and on numeric columns only
# (corr() raises on the timestamp/city columns in recent pandas versions)
delta_corr = df_delta_pm.select_dtypes(include='number').corr()
mask = np.triu(np.ones_like(delta_corr, dtype=bool))
sns.heatmap(delta_corr, annot=True, cmap="YlGnBu_r", mask=mask, vmax=1, vmin=-1,fmt='.2f')

Precipitation doesn't show a correlation with any of the investigated parameters. The PM2.5 concentration does not seem to decrease after rain, regardless of the amount of rain.

### Relationship between PM2.5 and PM10 and autocorrelation

In [None]:
# calculate per location: PM2.5 lagged by 1..24 rows and the PM2.5/PM10 ratio (plain and as rolling mean)
# The frames are collected in a list and concatenated once (instead of growing a frame inside the loop).
pm_frames = []
for location in df_good_sensors['location_id'].unique():
    df_temp = df_good_sensors.loc[
        df_good_sensors['location_id']==location,
        ['location_id', 'timestamp', 'city', 'PM10', 'PM2p5']
    ]

    # build all 24 lag columns at once and attach them in a single concat
    lagged = pd.DataFrame({
        f'PM2p5_shifted_{lag}': df_temp['PM2p5'].shift(periods=lag) for lag in range(1, 25)
    })
    df_temp = pd.concat([df_temp, lagged], axis=1)

    df_temp['PM2p5_PM10'] = df_temp['PM2p5'] / df_temp['PM10']

    # mean of the ratio over the current and the four previous rows
    df_temp['PM2p5_PM10_rolling'] = df_temp['PM2p5_PM10'].rolling(5).mean()

    pm_frames.append(df_temp)

# pd.concat raises on an empty list, so fall back to an empty frame
df_pm = pd.concat(pm_frames) if pm_frames else pd.DataFrame()


In [None]:
# correlation heatmap (lower triangle only) of PM2.5, its lagged values and the rolling PM2.5/PM10 ratio
plt.figure(figsize=(30, 30))
# compute the matrix once and on numeric columns only
# (corr() raises on the timestamp/city columns in recent pandas versions)
pm_corr = df_pm.drop(['location_id', 'PM10', 'PM2p5_PM10'], axis=1).select_dtypes(include='number').corr()
mask = np.triu(np.ones_like(pm_corr, dtype=bool))
sns.heatmap(pm_corr, annot=True, cmap="YlGnBu_r", mask=mask, vmax=1, vmin=0,fmt='.2f')

As expected, the correlation decreases with increasing time lag. In other words, PM2.5 concentrations that are closer together in time are more correlated.

In [None]:
sns.histplot(data=df_pm, x='PM2p5_PM10')

In [None]:
df_pm[df_pm['PM2p5_PM10']==0][['PM2p5_PM10', 'PM2p5', 'PM10']].describe().T

In [None]:
df_pm[df_pm['PM2p5_PM10']>=1][['PM2p5_PM10', 'PM2p5', 'PM10']].describe().T

In [None]:
# histograms of the PM2.5/PM10 ratio for the first 10 locations, 5 panels per row
locations = df_pm['location_id'].unique()[:10]
columns = 5
# enough rows for all locations (the former ceil((n - 1) / columns) dropped the last
# location whenever n was one more than a multiple of `columns`)
rows = int(np.ceil(len(locations) / columns))
# squeeze=False keeps a 2D axes array even if there is only one row
fig, ax = plt.subplots(rows, columns, figsize=(25, 80 / 13 * rows), squeeze=False)
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .25, wspace = .15, top = .93) # adjusts the space between the single subplots
# ravel() walks the grid row by row, so panel k shows locations[k]
for panel, hist_ax in enumerate(ax.ravel()):
    if panel >= len(locations):
        hist_ax.set_visible(False) # hide unused panels of an incomplete last row
        continue
    g=sns.histplot(
        data=df_pm[df_pm['location_id']==locations[panel]],
        x='PM2p5_PM10',
        ax=hist_ax,
    )
    hist_ax.set_xlim(-.05, 1.05)
    hist_ax.set_title(locations[panel], fontsize = 20)
    hist_ax.tick_params(labelrotation=90)


In [this publication](https://www.frontiersin.org/articles/10.3389/fenvs.2021.692440/full) the authors characterized PM2.5/PM10 histograms with a mode above 0.6 as anthropogenic. Using this classification, several locations in both cities can be classified as anthropogenic.

In [None]:
type(df_pm.loc[0,'timestamp'])

In [None]:
# add month and year, then average the PM values per location and month
df_pm['month'] = df_pm['timestamp'].dt.month
df_pm['year'] = df_pm['timestamp'].dt.year

monthly_means = (
    df_pm[['location_id', 'city', 'PM10', 'PM2p5', 'month', 'year']]
    .groupby(['location_id', 'month', 'year', 'city'])
    .mean()
    .reset_index()
)
# keep the year 2021 only
df_pm_grouped_2021 = monthly_means.query("year==2021")

fig, ax = plt.subplots(1, 2, figsize=(25, 10))
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
fig.subplots_adjust(hspace = .25, wspace = .15, top = .9) # adjusts the space between the single subplots
fig.suptitle("Mean PM2.5 concentration per month", fontsize=30)

# one panel per city; each bar summarises the monthly means of that city's locations
for i, city in enumerate(df_pm_grouped_2021['city'].unique()):
    city_rows = df_pm_grouped_2021[df_pm_grouped_2021['city']==city]
    g = sns.barplot(data=city_rows, x='month', y='PM2p5', ax=ax[i], color='b')
    ax[i].set_title(city, fontsize = 20)
    ax[i].set(ylim=(0, 17), ylabel="PM2.5 in µg/m$^3$", xlabel="Month")


The mean PM2.5 concentration per city shows a clear seasonality. The concentrations are rather high in winter and low in summer, which could be due to the increased energy need caused by heating in cold months. 

In [None]:
# rolling PM2.5/PM10 ratio over time for the first 10 locations, one panel per location
locations = df_pm['location_id'].unique()[:10]
columns = 1
# one row per location (the former ceil((n - 1) / columns) created one row too few,
# so the last location was never plotted)
rows = len(locations)
# squeeze=False + ravel() gives a flat axes array even for a single location
fig, ax = plt.subplots(rows, columns, figsize=(30, (150/65)*len(locations)), squeeze=False)
ax = ax.ravel()
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
fig.subplots_adjust(hspace = .35, wspace = .15, top = .97) # adjusts the space between the single subplots
fig.suptitle("PM2.5 / PM10 ratio over time (rolling average)", fontsize=30)
for row, location_id in enumerate(locations):
    g=sns.scatterplot(
        data=df_pm[df_pm['location_id']==location_id],
        x='timestamp',
        y='PM2p5_PM10_rolling',
        ax=ax[row],
    )

    g.set_xlim(pd.to_datetime('2020-01-01'), pd.to_datetime('2021-12-31'))
    g.set_ylim(-.1,1.1)
    # show the time axis labels on the bottom panel only
    if row < rows-1:
        g.set_xticklabels([])
        g.set_xlabel('')
    ax[row].set_title(f"ID: {location_id}", fontsize = 20)
    ax[row].set_ylabel("PM2.5 / PM10", fontsize = 20)


### Hyperparameter tuning
As we want to find general hyperparameters for both locations, we calculate the prior_scales for the use of weather data as regressors for both cities at once:

In [None]:
def calc_corr_mean (feature, df):
    """Return the mean of the column `feature` of `df`.

    Parameters
    ----------
    feature : label of the column to average.
    df : DataFrame containing that column.
    """
    feature_values = df[feature]
    return feature_values.mean()

In [None]:
features = ['humidity_dwd', 'pressure_dwd', 'precip', 'wind_direction', 'wind_speed', 'temperature_dwd']
# result frame with two rows: row 0 holds the mean correlation per feature,
# row 1 the prior_scale derived from it; column 0 carries the row labels
df_mean_features = pd.DataFrame(['mean', 'prior_scale'])

# rounded mean correlation per feature (the scalar is written into both rows for now)
for feature in features:
    df_mean_features[feature] = round(calc_corr_mean(feature, df_PM2p5_correlations), 2)

# prior_scale = |mean| relative to the sum of all absolute means; overwrites row 1
sum_ = df_mean_features[features].iloc[0].abs().sum()
for feature in features:
    df_mean_features.loc[1, feature] = round(np.abs(df_mean_features.loc[0, feature] / sum_), 2)
df_mean_features.T

# check if sum is 1
#df_mean_features.iloc[1:2, 1:].abs().sum(axis=1)


In [None]:
# Per-location correlations of the weather features with PM2.5, overlaid with
# the mean of each feature over the whole data set (horizontal lines).
ax_corr = df_PM2p5_correlations[features].plot(figsize=(15, 8))
ax_corr.set_title('                        Frankfurt                                                                          Bremen', fontsize=20)
ax_corr.set_xlabel('location_id', fontsize=20)
ax_corr.set_ylabel('correlation with PM2p5', fontsize=20)
ax_corr.legend(fontsize=20)
ax_corr.set_ylim(-0.5, 0.5)
ax_corr.set_xlim(0)

# vertical separator between the two groups of location ids named in the title
ax_corr.plot([124.5, 124.5], [-0.5, 0.5], linewidth=5, color='black')

# one horizontal line per feature at its mean value (row 0 of df_mean_features)
for feature in features:
    mean_value = df_mean_features.loc[0, feature]
    ax_corr.plot([0, 182], [mean_value, mean_value], linewidth=2)

## How severe is PM pollution in Frankfurt and Bremen?


In [None]:
# European Air Quality Index https://www.eea.europa.eu/themes/air/air-quality-index (source: Wikipedia)
# Each entry is (label, lower bound inclusive, upper bound exclusive).
# The numeric prefix of the label keeps the classes sortable by severity.
_PM2P5_BINS = (
    ('1-good', 0, 10),
    ('2-fair', 10, 20),
    ('3-moderate', 20, 25),
    ('4-poor', 25, 50),
    ('5-very poor', 50, 75),
    ('6-extremely poor', 75, 800),
    ('7-undefined', 800, 2000),
)

_PM10_BINS = (
    ('1-good', 0, 20),
    ('2-fair', 20, 40),
    ('3-moderate', 40, 50),
    ('4-poor', 50, 100),
    ('5-very poor', 100, 150),
    ('6-extremely poor', 150, 1200),
    ('7-undefined', 1200, 2000),
)


def _classify_pm(pm, bins):
    """Return the label of the first bin with ``lower <= pm < upper``, else None."""
    for label, lower, upper in bins:
        if lower <= pm < upper:
            return label
    # Negative values, values >= the last upper bound and NaN match no bin.
    return None


def pm2p5_bins(pm):
    """Map a PM2.5 value to its air quality class.

    Parameters
    ----------
    pm : number
        PM2.5 value (presumably in µg/m³ -- verify against the data source).

    Returns
    -------
    str or None
        One of '1-good' ... '7-undefined', or None if `pm` is negative,
        >= 2000 or NaN.
    """
    return _classify_pm(pm, _PM2P5_BINS)


def pm10_bins(pm):
    """Map a PM10 value to its air quality class.

    Parameters
    ----------
    pm : number
        PM10 value (presumably in µg/m³ -- verify against the data source).

    Returns
    -------
    str or None
        One of '1-good' ... '7-undefined', or None if `pm` is negative,
        >= 2000 or NaN.
    """
    return _classify_pm(pm, _PM10_BINS)

# Label every measurement with its air quality class: one "<pollutant>_quality"
# column per pollutant, computed by the matching binning function.
for pollutant, classify in (("PM2p5", pm2p5_bins), ("PM10", pm10_bins)):
    df_good_sensors[f"{pollutant}_quality"] = df_good_sensors[pollutant].apply(classify)

In [None]:
# count number of PM measurements depending on air quality per city
def count_quality_per_city(measurements, quality_col, prefix, cities=('Bremen', 'Frankfurt')):
    """Count measurements per air quality class and city.

    Parameters
    ----------
    measurements : pandas.DataFrame with a 'city' column and `quality_col`
    quality_col : name of the column holding the quality labels
    prefix : prefix of the resulting count columns (e.g. 'PM2p5')
    cities : cities to report, in column order

    Returns
    -------
    pandas.DataFrame with the columns 'quality', '<prefix>_<city>' for every
    city and '<prefix>_sum' (sum over the listed cities). A class that does not
    occur in one of the cities gets a count of 0 instead of being dropped or
    turning the sum into NaN. Rows without a quality label are not counted.
    """
    counts = (
        measurements.groupby([quality_col, 'city'])
        .size()
        .unstack('city', fill_value=0)
        .reindex(columns=list(cities), fill_value=0)
    )
    counts.columns = [f'{prefix}_{city}' for city in cities]
    counts[f'{prefix}_sum'] = counts.sum(axis=1)
    return counts.rename_axis(index='quality').reset_index()


pm2p5_quality_count = count_quality_per_city(df_good_sensors, 'PM2p5_quality', 'PM2p5')
pm10_quality_count = count_quality_per_city(df_good_sensors, 'PM10_quality', 'PM10')

# merge PM2p5 and PM10; the outer join keeps classes that occur for only one
# of the two pollutants (their missing counts are reported as 0)
quality_absolute = (
    pm2p5_quality_count
    .merge(pm10_quality_count, on='quality', how='outer')
    .fillna(0)
    .sort_values('quality')
    .reset_index(drop=True)
)
count_columns = quality_absolute.columns.drop('quality')
quality_absolute[count_columns] = quality_absolute[count_columns].astype(int)
display(quality_absolute)

In [None]:
# calculate percentages of PM measurements depending on air quality per city
def quality_percentages(measurements, quality_col, prefix):
    """Percentage of measurements per air quality class.

    Parameters
    ----------
    measurements : pandas.DataFrame with a 'city' column and `quality_col`
    quality_col : name of the column holding the quality labels
    prefix : prefix of the resulting columns (e.g. 'PM2p5')

    Returns
    -------
    pandas.DataFrame with the columns 'quality', '<prefix>_Bremen',
    '<prefix>_Frankfurt' and '<prefix>_sum'. Every column is the percentage
    distribution (rounded to two decimals) within Bremen, within Frankfurt and
    within all rows of `measurements`, respectively. A class that does not
    occur in one of the selections is reported as 0 instead of being dropped.
    Rows without a quality label are ignored.
    """
    def share(labels):
        # value_counts(normalize=True) skips missing labels
        return (labels.value_counts(normalize=True) * 100).round(2)

    in_bremen = measurements['city'] == 'Bremen'
    in_frankfurt = measurements['city'] == 'Frankfurt'
    table = pd.DataFrame({
        f'{prefix}_Bremen': share(measurements.loc[in_bremen, quality_col]),
        f'{prefix}_Frankfurt': share(measurements.loc[in_frankfurt, quality_col]),
        f'{prefix}_sum': share(measurements[quality_col]),
    })
    return table.fillna(0).sort_index().rename_axis(index='quality').reset_index()


percentage_PM2p5 = quality_percentages(df_good_sensors, 'PM2p5_quality', 'PM2p5')
percentage_PM10 = quality_percentages(df_good_sensors, 'PM10_quality', 'PM10')

# the outer join keeps classes that occur for only one of the two pollutants
quality_percentage = (
    percentage_PM2p5
    .merge(percentage_PM10, on='quality', how='outer')
    .fillna(0)
    .sort_values('quality')
    .reset_index(drop=True)
)
quality_percentage

In [None]:
# plot percentages: one grouped bar chart per pollutant
for column_prefix, chart_title in (('PM2p5', 'PM2.5'), ('PM10', 'PM10')):
    plot_columns = ['quality'] + [
        f'{column_prefix}_{suffix}' for suffix in ('Bremen', 'Frankfurt', 'sum')
    ]
    ax_quality = quality_percentage[plot_columns].plot(
        kind='bar',
        x='quality',
        stacked=False,
        colormap='tab10',  # alternatives: 'Set1', 'Dark2'
        figsize=(15, 6),
    )
    ax_quality.set_title(chart_title)
    ax_quality.set_ylim(0, 100)
    ax_quality.legend(['Bremen', 'Frankfurt', 'sum'])
    ax_quality.set_ylabel('measurements (%)', fontsize=15)
    ax_quality.set_xlabel('air quality', fontsize=15)

## Comparison of DWD weather data and PM2.5 values for Frankfurt and Bremen

In [None]:
# Histograms of the DWD weather data, coloured by city, for the two locations
# with id 2 and 182.
cmap = ['#4c72b0', '#dd8552'] # blue and orange

sns.set_theme()
sns.set_context("talk")
plt.rcParams.update({'font.size': 40})

# the same two locations feed every panel, so select them once
dwd_hist_data = df_good_sensors.query("location_id == 2 or location_id == 182")

# (subplot position, column, number of bins, x limits, y limits or None, x label)
dwd_hist_panels = [
    (1, 'humidity_dwd', 40, (0, 100), None, 'relative humidity (%)'),
    (4, 'temperature_dwd', 60, (-20, 40), None, 'temperature (°C)'),
    (3, 'wind_speed', 60, (0, 15), None, 'wind speed (m/s)'),
    (2, 'pressure_dwd', 60, (970, 1050), None, 'air pressure (mbar)'),
    (5, 'wind_direction', 30, (0, 360), None, 'wind direction (°)'),
    (6, 'precip', 40, (0, 10), (0, 500), 'precipitation (?)'),
]

with sns.axes_style("darkgrid"):
    plt.figure(figsize=(25,16))
    for position, column, n_bins, x_limits, y_limits, x_label in dwd_hist_panels:
        panel = plt.subplot(2, 3, position)
        sns.histplot(data=dwd_hist_data, x=column, alpha=0.5, bins=n_bins, hue='city', palette=cmap, ax=panel)
        panel.legend().remove() # no legend shown
        panel.set_xlim(*x_limits)
        if y_limits is not None:
            panel.set_ylim(*y_limits)
        panel.set_xlabel(x_label)
        panel.set_ylabel('count')


In [None]:
# Presentation version of the DWD histograms: only humidity and pressure,
# coloured by city, for the two locations with id 2 and 182. Saved as PNG.
cmap = ['#4c72b0', '#dd8552'] # blue and orange
sns.set_theme()
sns.set_context("talk")

presentation_data = df_good_sensors.query("location_id == 2 or location_id == 182")

# (subplot position, column, number of bins, x limits, x label)
presentation_panels = [
    (1, 'humidity_dwd', 40, (0, 100), 'relative humidity in %'),
    (2, 'pressure_dwd', 60, (970, 1050), 'air pressure in mbar'),
]

with sns.axes_style("darkgrid"):
    plt.figure(figsize=(16,6))

    for position, column, n_bins, x_limits, x_label in presentation_panels:
        panel = plt.subplot(1, 2, position)
        sns.histplot(data=presentation_data, x=column, alpha=0.5, bins=n_bins, hue='city', palette=cmap, ax=panel)
        panel.set_xlim(*x_limits)
        panel.legend().remove() # no legend shown
        panel.set_xlabel(x_label)
        panel.set_ylabel('counts')

    plt.tight_layout()

    plt.savefig('../figures/histplot_dwd_data_per_city.png', transparent=True, bbox_inches='tight')

In [None]:
# (rows, columns) of the two location_id ranges that are compared in the
# PM2.5 histogram. Both shapes are returned in one dict because a notebook cell
# only displays its last expression.
{
    selection: df_good_sensors.query(selection).shape
    for selection in ("location_id > 125", "location_id < 55")
}

In [None]:
# PM2.5 histogram per city for the locations with id > 125 or id < 55, saved as PNG.
cmap = ['#4c72b0', '#dd8552'] # blue and orange for Frankfurt and  Bremen
sns.set_theme()
sns.set_context("talk")

pm_hist_data = df_good_sensors.query("location_id > 125 or location_id < 55")

with sns.axes_style("darkgrid"):
    fig_pm_hist = plt.figure(figsize=(14,6))
    ax_pm_hist = sns.histplot(data=pm_hist_data, x='PM2p5', alpha=0.5, bins=1000, hue='city', palette=cmap)
    ax_pm_hist.legend().remove() # no legend shown
    ax_pm_hist.set_xlabel('PM$_{2.5}$ in µg/m$^3$')
    ax_pm_hist.set_ylabel('count')
    # the bins span the full value range; only the 0-50 part is shown
    ax_pm_hist.set_xlim(0, 50)
    fig_pm_hist.savefig('../figures/PM2p5_hist.png', transparent=True, bbox_inches='tight')


In [None]:
# calendar month (taken from the timestamp) as a grouping key for the monthly plots
measurement_month = df_good_sensors['timestamp'].dt.month
df_good_sensors['month'] = measurement_month
df_good_sensors.columns

In [None]:
# prepare data: median of the PM and weather columns per location, month and city
dwd_group_keys = ['location_id', 'month', 'city']
dwd_selected_columns = [
    'location_id', 'city', 'PM10', 'PM2p5', 'month',
    'humidity_dwd', 'precip', 'pressure_dwd', 'temperature_dwd',
    'wind_direction', 'wind_speed',
]
df_dwd_grouped = (
    df_good_sensors[dwd_selected_columns]
    .groupby(dwd_group_keys)
    .median()
    .reset_index()
)
df_dwd_grouped

In [None]:
# Monthly bar plots per city. Each bar is the mean (seaborn's default
# aggregation) of the per-location monthly medians stored in df_dwd_grouped.
# Units follow the axis labels used for the same columns in the histograms.
# (column, y label, y limits or None)
monthly_panels = [
    ('pressure_dwd', 'mean pressure (mbar)', (990, 1030)),
    ('temperature_dwd', 'mean temperature (°C)', None),
    ('wind_speed', 'mean wind speed (m/s)', None),
    ('humidity_dwd', 'mean humidity (%)', None),
    # NOTE(review): an arithmetic mean of a circular quantity (degrees) may be misleading
    ('wind_direction', 'mean wind direction (°)', None),
    ('PM2p5', 'mean PM2.5 (µg/m$^3$)', None),
]

fig_monthly, axes_monthly = plt.subplots(3, 2, figsize=(20, 16))
for panel, (column, y_label, y_limits) in zip(axes_monthly.flat, monthly_panels):
    sns.barplot(data=df_dwd_grouped, x='month', y=column, hue='city', ax=panel)
    panel.set_ylabel(y_label)
    if y_limits is not None:
        # pressure varies little relative to its absolute value, so zoom in
        panel.set_ylim(*y_limits)



In [None]:
# Mean PM values per location, month, year and city, shown as monthly bars per city.
pm_group_keys = ['location_id', 'month', 'year', 'city']
pm_selected_columns = ['location_id', 'city', 'PM10', 'PM2p5', 'month', 'year']

plt.figure(figsize=(10,8))
df_pm_grouped = (
    df_pm[pm_selected_columns]
    .groupby(pm_group_keys)
    .mean()
    .reset_index()
)
ax_pm_month = sns.barplot(data=df_pm_grouped, x='month', y='PM2p5', hue='city')
ax_pm_month.set_ylabel('mean PM2.5')
