In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()


# Load data and do basic formatting

In [None]:
# Read the pre-processed data of Deutscher Wetterdienst (DWD)
df_dwd = pd.read_csv("../data/processed_deutscher_wetterdienst.csv", index_col=0)

# parse the 'date' column as datetime
df_dwd['date'] = pd.to_datetime(df_dwd['date'])

# -999 is treated as an invalid value: replace every occurrence with NaN
df_dwd = df_dwd.replace(-999, np.nan)
df_dwd

In [None]:
# Read the pre-processed sensor community (sc) data
df_sc = pd.read_csv("../data/processed_sensor_community.csv", index_col=0)

# parse both time columns as datetime
df_sc['timestamp'] = pd.to_datetime(df_sc['timestamp'])
df_sc['date'] = pd.to_datetime(df_sc['date'])

# convert pressure and its standard deviation to hPa (divide by 100)
pressure_cols = ['pressure', 'pressure_std']
df_sc[pressure_cols] = df_sc[pressure_cols] / 100

# add sensor IDs: every distinct (lat, lon) pair gets a 1-based location_id
df_sc_location = (
    df_sc
    .groupby(['lat', 'lon'])
    .count()
    .reset_index()[['lat', 'lon']]
)
df_sc_location['location_id'] = df_sc_location.index + 1
df_sc = df_sc.merge(df_sc_location, on=['lat', 'lon'], how='left')

# column bookkeeping: identifier columns vs. (sorted) measurement columns
non_data_cols = ['location_id', 'timestamp', 'hour', 'date', 'lat', 'lon','city']
data_cols = sorted(name for name in df_sc.columns if name not in non_data_cols)
std_cols = [name for name in data_cols if 'std' in name]
data_cols_wo_std = [name for name in data_cols if 'std' not in name]

# reorder: identifier columns first, then the sorted measurement columns
df_sc = df_sc.reindex(columns=non_data_cols + data_cols)
df_sc

In [None]:
# Print column dtypes and non-null counts of the sensor community dataframe
df_sc.info()

# Investigation of missing values, zeros and outliers

In [None]:
# Summary statistics of all sc measurement columns (std. dev. columns excluded)
df_sc.loc[:, data_cols_wo_std].describe().transpose().round(1)

In [None]:
# Summary statistics of the sc measurement columns, Frankfurt rows only
df_sc.query("city == 'Frankfurt'")[data_cols_wo_std].describe().transpose().round(1)

In [None]:
# Summary statistics of the sc measurement columns, Bremen rows only
df_sc.query("city == 'Bremen'")[data_cols_wo_std].describe().transpose().round(1)

PM10: Mean is almost double the 75th percentile -> outliers raise the mean extremely <br>
PM2.5: similar to PM10, but less extreme <br>
humidity: all values (mean, 25th, 50th and 75th percentile) seem to be very large; the max value is above 100, which doesn't make any sense <br>
pressure: assuming the units are Pa (1 bar = 100,000 Pa): min value is below 100 -> unrealistic, max value is also unrealistic (more than 60 bar) <br>
temperature: std seems very high (54 °C), min and max values are unrealistic <br>
<br>
Bremen vs. Frankfurt <br>
PM10 and PM2.5: std for Bremen is double the std for Frankfurt <br>
humidity: 50th percentile of Bremen is already 99.9 %, which seems quite high <br>
pressure and temperature: no obviously unrealistic observations besides the min and max values

In [None]:
# Report the absolute number and the percentage of NaNs for every column
print("missing values in each column")
total_rows = df_sc.shape[0]
for col in df_sc.columns:
    missing = df_sc[col].isna().sum()
    share = round(missing / total_rows * 100, 1)
    print(f"{col}: {missing} ({share} %)")

In [None]:
# Report the absolute number and the percentage of zeros for every column
print("value '0' in each column")
total_rows = df_sc.shape[0]
for col in df_sc.columns:
    zeros = df_sc.loc[df_sc[col] == 0, col].count()
    share = round(zeros / total_rows * 100, 1)
    print(f"{col}: {zeros} ({share} %)")

In [None]:
def count_nan_and_0s(df: pd.DataFrame, cols: list = None) -> pd.DataFrame:
    """Count missing values and zeros per column.

    Args:
        df (pd.DataFrame): Dataframe to search for zeros and nans.
        cols (list, optional): Columns to inspect. Any iterable of column labels
            is accepted (e.g. a list or ``df.columns``). If None, all columns of
            ``df`` are used. Defaults to None.

    Returns:
        pd.DataFrame: One row per metric ('missing_values', 'missing_values_share',
            '0_values', '0_values_share'). The first column 'metric' holds the
            metric name, followed by one column per inspected column. Shares are
            percentages of the total number of rows of ``df``.
    """
    # `is None` instead of `== None`: `==` on a pandas Index / numpy array is
    # element-wise, so the `if` would raise "truth value ... is ambiguous"
    if cols is None:
        cols = df.columns
    cols = list(cols)
    n_rows = df.shape[0]

    # count each quantity once and derive the percentage from the count
    missing = [df[col].isna().sum() for col in cols]
    zeros = [(df[col] == 0).sum() for col in cols]

    # one row per inspected column, one column per metric
    df_nan_0 = pd.DataFrame({
        'data': cols,
        'missing_values': missing,
        'missing_values_share': [count / n_rows * 100 for count in missing],
        '0_values': zeros,
        '0_values_share': [count / n_rows * 100 for count in zeros],
    })

    # transpose: metrics become rows, the inspected columns become columns
    df_nan_0 = df_nan_0.set_index('data').T.reset_index()
    df_nan_0.columns = ['metric'] + cols
    return df_nan_0


# Count NaNs and zeros for every measurement column of the sc dataset
df_data_analysis = count_nan_and_0s(df=df_sc, cols=data_cols)
df_data_analysis.round(1)

In [None]:
# define metrics and columns to plot
metrics = ["missing_values_share", "0_values_share"]
ys = sorted(name for name in df_data_analysis.columns if name != 'metric')

# define size of subplot grid
columns = 4
rows = int(np.ceil(len(ys) / columns))

# only the percentage rows are plotted
df_shares = df_data_analysis[df_data_analysis['metric'].isin(metrics)]

# plot
# squeeze=False keeps `ax` two-dimensional even if the grid has a single row
fig, ax = plt.subplots(rows, columns, figsize=(20,10), squeeze=False) # create subplots
plt.suptitle("Data analysis of missing values and zeros", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .5, wspace = .2, top = .9) # adjusts the space between the single subplots

for row in range(rows):
    for col in range(columns):
        axis = ax[row][col]
        # row-major position in `ys`; the stride has to be the number of grid
        # columns (the former `rows + 1` only worked while rows + 1 == columns)
        idx = row * columns + col
        if idx >= len(ys):
            # delete not needed subplots
            fig.delaxes(axis)
            continue
        y_name = ys[idx]
        # create a bar for each metric defined above for a column of ys list
        sns.barplot(data=df_shares, x='metric', y=y_name, ax=axis)
        # set ylim to [0, 100] as we are plotting percentages
        axis.set_ylim([0, 100])
        # put the percentage above each plotted bar
        axis.bar_label(axis.containers[0], fmt='%.1f')
        # set the x, y and x-tick labels
        axis.set_xlabel("")
        axis.set_ylabel("Share of values in %")
        axis.set_xticklabels(labels=["Missing values", "Zeros"])
        # use the column name with slight changes as subplot name
        title = y_name.replace('_', ' ').replace('std', 'std. dev.').replace('2p5', '2.5').capitalize()
        axis.set_title(title, fontsize = 15)


In [None]:
# columns to plot
ys = data_cols_wo_std

# define size of subplot grid
columns = 3
rows = int(np.ceil(len(ys) / columns))

# plot
# squeeze=False keeps `ax` two-dimensional even if the grid has a single row
fig, ax = plt.subplots(rows, columns, figsize=(20,10), squeeze=False) # create subplots
plt.suptitle("Outlier analysis", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .5, wspace = .2, top = .9) # adjusts the space between the single subplots

for row in range(rows):
    for col in range(columns):
        axis = ax[row][col]
        # row-major position in `ys`; the stride has to be the number of grid
        # columns (the former `rows + 1` only worked while rows + 1 == columns)
        idx = row * columns + col
        if idx >= len(ys):
            # delete not needed subplots
            fig.delaxes(axis)
            continue
        y_name = ys[idx]
        # scatter the measured values of one column over time
        sns.scatterplot(data=df_sc, x='timestamp', y=y_name, ax=axis, alpha=.3)
        # capitalize the automatically generated x and y labels
        axis.set_xlabel(axis.get_xlabel().capitalize())
        axis.set_ylabel(axis.get_ylabel().capitalize())
        # use the column name with slight changes as subplot name
        title = y_name.replace('_', ' ').replace('std', 'std. dev.').replace('2p5', '2.5').capitalize()
        axis.set_title(title, fontsize = 15)
        axis.tick_params(labelrotation=90)


There are a few outliers in humidity, pressure and temperature which can be dropped by setting thresholds. <br>
For PM10 and PM2.5 it is less obvious, as the data is scattered all over the possible range.

# Delete unrealistic values and outliers for environmental variables

## hard thresholds based on physical estimations
We can first have a look at the extreme values measured by Deutscher Wetterdienst to get an impression of what range of values is realistic.

In [None]:
# DWD humidity: overall maximum, then the minimum per city
print(df_dwd['humidity'].max())
for dwd_city in ('Frankfurt', 'Bremen'):
    print(df_dwd.loc[df_dwd['City'] == dwd_city, 'humidity'].min())

In [None]:
# DWD pressure: overall maximum, then the minimum per city
print(df_dwd['pressure'].max())
for dwd_city in ('Frankfurt', 'Bremen'):
    print(df_dwd.loc[df_dwd['City'] == dwd_city, 'pressure'].min())

In [None]:
# DWD temperature: overall maximum, then the minimum per city
print(df_dwd['temperature'].max())
for dwd_city in ('Frankfurt', 'Bremen'):
    print(df_dwd.loc[df_dwd['City'] == dwd_city, 'temperature'].min())

In [None]:
# Thresholds for particulate matter are currently not applied (kept commented out):
# thresholds_part = {
#     'PM10': (0, 1000),
#     'PM2p5': (0, 500),
# }
# set lower and upper threshold per environmental variable
thresholds_env = {
    'humidity': (15, 100),
    'pressure': (960, 1050),
    'temperature': (-20, 60),
}

# delete values at or below the lower and at or above the upper threshold
# NOTE(review): the bounds themselves are removed as well (e.g. humidity == 100) - confirm this is intended
for col, (lower, upper) in thresholds_env.items():
    nan_before = df_sc[col].isna().sum()
    # label-based .loc with a boolean mask, consistent with the other cleaning cells
    # (.iloc is position-based and not meant to be used with a boolean Series)
    out_of_range = (df_sc[col] <= lower) | (df_sc[col] >= upper)
    df_sc.loc[out_of_range, col] = np.nan
    print(f"added {df_sc[col].isna().sum() - nan_before} nans in {col}")


## values with std. dev. 'nan' or zero
If the standard deviation is 'nan', there was no or only one observation. If the standard deviation is zero, there was no fluctuation in the measured value, which can be assumed to be a measurement error.

In [None]:
# delete values for the defined columns if the standard deviation is zero or 'nan'
for col in [
    'temperature',
    'humidity',
    'pressure',
]:
    std = df_sc[col + '_std']
    # NaN never compares equal to anything (not even to itself), so `== np.nan`
    # matches no row; missing standard deviations must be detected with isna()
    df_sc.loc[(std == 0) | std.isna(), col] = np.nan


## dynamic thresholds based on quantiles

In [None]:
# define quantiles (lower, upper) used as threshold per variable
thresh = {
    'temperature': (.01, .85),
    'humidity': (.05, .95),
    'pressure': (.05, .95),
}

# prefix of the generated threshold columns per variable
threshold_prefixes = {
    'temperature': 'temp',
    'humidity': 'hum',
    'pressure': 'pres',
}

# build the named aggregations: median, lower and upper quantile for each variable
named_aggs = {}
for col, prefix in threshold_prefixes.items():
    q_lower, q_upper = thresh[col]
    named_aggs[f'{prefix}_median'] = pd.NamedAgg(column=col, aggfunc='median')
    # bind the quantile as default argument so each lambda keeps its own value
    named_aggs[f'{prefix}_lower'] = pd.NamedAgg(column=col, aggfunc=lambda x, q=q_lower: x.quantile(q=q))
    named_aggs[f'{prefix}_upper'] = pd.NamedAgg(column=col, aggfunc=lambda x, q=q_upper: x.quantile(q=q))

# make a dataframe containing median, upper and lower threshold per city and timestamp
df_thresholds = df_sc.groupby(['city', 'timestamp']).agg(**named_aggs).reset_index()

# merge the thresholds with the sc dataframe
# threshold columns left over from an earlier run of this cell are dropped first;
# otherwise the merge would create '_x'/'_y' suffixed duplicates and the
# following cell could no longer find e.g. 'temp_lower'
df_sc = (
    df_sc
    .drop(columns=list(named_aggs), errors='ignore')
    .merge(df_thresholds, how='left', on=['city', 'timestamp'])
)

In [None]:
# Summary statistics of the per-city, per-timestamp medians and quantile thresholds
df_thresholds.describe()

In [None]:
# Set every value outside its [lower, upper] threshold columns to NaN
threshold_columns = {
    'temperature': ['temp_lower', 'temp_upper'],
    'humidity': ['hum_lower', 'hum_upper'],
    'pressure': ['pres_lower','pres_upper'],
}
for col, (lower_col, upper_col) in threshold_columns.items():
    nan_before = df_sc[col].isna().sum()
    too_low = df_sc[col].lt(df_sc[lower_col])
    too_high = df_sc[col].gt(df_sc[upper_col])
    df_sc.loc[too_low | too_high, col] = np.nan
    print(f"{df_sc[col].isna().sum() - nan_before} nans added in {col}")


# Visualization of cleaned data and comparison with dwd data

In [None]:
# Plot dwd and sc data for Frankfurt
city = 'Frankfurt'
sc_city = df_sc[df_sc['city'] == city]
dwd_city = df_dwd[df_dwd['City'] == city]

# one subplot per (sc column, dwd column) pair:
# sc data is drawn as scatter plot, dwd data as red line on the same axis
panels = [
    ('humidity', 'humidity'),        # humidity from both datasets
    ('humidity', 'precip'),          # sc humidity and dwd precipitation
    ('pressure', 'pressure'),        # pressure from both datasets
    ('temperature', 'temperature'),  # temperature from both datasets
]

# define size of subplot grid; the number of rows follows the panel list so that
# it can never get out of sync with the axes that are actually used
columns = 1
rows = len(panels)

# plot
fig, ax = plt.subplots(rows, columns, figsize=(20,25)) # create subplots
plt.suptitle(f"Comparison sensor data vs. dwd in {city}", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .2, wspace = .2, top = .95) # adjusts the space between the single subplots

for axis, (sc_col, dwd_col) in zip(ax, panels):
    sns.scatterplot(data=sc_city, x='timestamp', y=sc_col, ax=axis)
    sns.lineplot(data=dwd_city, x='date', y=dwd_col, color='red', alpha=.5, ax=axis)
    # capitalize the automatically generated axis labels
    axis.set_xlabel(axis.get_xlabel().capitalize())
    axis.set_ylabel(axis.get_ylabel().capitalize())

In [None]:
# Plot dwd and sc data for Bremen
city = 'Bremen'
sc_city = df_sc[df_sc['city'] == city]
dwd_city = df_dwd[df_dwd['City'] == city]

# one subplot per (sc column, dwd column) pair:
# sc data is drawn as scatter plot, dwd data as red line on the same axis
panels = [
    ('humidity', 'humidity'),        # humidity from both datasets
    ('humidity', 'precip'),          # sc humidity and dwd precipitation
    ('pressure', 'pressure'),        # pressure from both datasets
    ('temperature', 'temperature'),  # temperature from both datasets
]

# define size of subplot grid; the number of rows follows the panel list so that
# it can never get out of sync with the axes that are actually used
columns = 1
rows = len(panels)

# plot
fig, ax = plt.subplots(rows, columns, figsize=(20,25)) # create subplots
plt.suptitle(f"Comparison sensor data vs. dwd in {city}", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .2, wspace = .2, top = .95) # adjusts the space between the single subplots

for axis, (sc_col, dwd_col) in zip(ax, panels):
    sns.scatterplot(data=sc_city, x='timestamp', y=sc_col, ax=axis)
    sns.lineplot(data=dwd_city, x='date', y=dwd_col, color='red', alpha=.5, ax=axis)
    # capitalize the automatically generated axis labels
    axis.set_xlabel(axis.get_xlabel().capitalize())
    axis.set_ylabel(axis.get_ylabel().capitalize())

In [None]:
# Example of the distribution of measured temperatures (all locations) for timestamps
# strictly between 2020-07-01 and 2020-07-15
# NOTE(review): the previous comment said "in one day", but the filter spans two weeks - confirm the intended window
sns.histplot(data=df_sc[(df_sc['timestamp']>'2020-07-01') & (df_sc['timestamp']<'2020-07-15')], x='temperature', bins=20);

# Investigation of single locations

In [None]:
# Per location_id: number of hours with measurements plus the date of the
# first and of the last measurement, sorted by the number of hours (descending)
location_grouped = (
    df_sc[['location_id', 'hour', 'date']]
    .groupby(['location_id'])
    .agg(
        hours=('hour', 'count'),
        date_min=('date', 'min'),
        date_max=('date', 'max'),
    )
    .reset_index()
    .sort_values('hours', ascending=False)
)

# make sure both date columns are datetimes
location_grouped['date_min'] = pd.to_datetime(location_grouped['date_min'])
location_grouped['date_max'] = pd.to_datetime(location_grouped['date_max'])

# length of the measuring period, counting the first and the last day
first_to_last = location_grouped['date_max'] - location_grouped['date_min']
location_grouped['period_length'] = first_to_last + pd.Timedelta(days=1)

# average number of measured hours per day of the measuring period
period_days = location_grouped['period_length'].dt.days
location_grouped['hours_per_day'] = location_grouped['hours'] / period_days
location_grouped.sample(5)

In [None]:
# Bar plot: number of measured hours per location, highest first
plt.figure(figsize=(25, 10))
location_order = location_grouped.sort_values('hours', ascending=False)['location_id']
g = sns.barplot(data=location_grouped, x='location_id', y='hours', order=location_order)
g.set(
    xlabel=g.get_xlabel().capitalize().replace('_', ' '),
    ylabel=g.get_ylabel().capitalize(),
)
plt.xticks(rotation=90);

In [None]:
# Bar plot: measured hours per day for each location, highest first
plt.figure(figsize=(25, 10))
locations_by_rate = location_grouped.sort_values('hours_per_day', ascending=False)
g = sns.barplot(data=locations_by_rate, x='location_id', y='hours_per_day', order=locations_by_rate['location_id'])
g.set(
    xlabel=g.get_xlabel().capitalize().replace('_', ' '),
    ylabel=g.get_ylabel().capitalize().replace('_', ' '),
)
plt.xticks(rotation=90);

In [None]:
# location_grouped is sorted by 'hours' in descending order, so the tail holds
# the locations with the fewest measured hours
n_locations = len(location_grouped)
print(f"Total number of locations: {n_locations}")
print('Locations with the least hours of measurement:')
location_grouped.tail(20)

In [None]:
# Summary statistics of the measured hours and hours per day over all locations
location_grouped.loc[:, ['hours', 'hours_per_day']].describe().transpose().round(1)

There are some sensor locations which delivered data for only a few hours.

In [None]:
# One subplot per city and particulate matter column, one line per location_id
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(20, 20))
plt.suptitle("Sensors per city", fontsize=20) # title of plot
fig.tight_layout() # tight_layout automatically adjusts subplot params so that the subplot(s) fits in to the figure area
plt.subplots_adjust(hspace = .5, wspace = .2, top = .9) # adjusts the space between the single subplots

# (axis, city, column to plot, column name used in the title)
panels = [
    (ax1, 'Frankfurt', 'PM10', 'PM10'),
    (ax2, 'Frankfurt', 'PM2p5', 'PM2.5'),
    (ax3, 'Bremen', 'PM10', 'PM10'),
    (ax4, 'Bremen', 'PM2p5', 'PM2.5'),
]

for axis, city, pm_col, pm_label in panels:
    # only every 10th row of the city's data is plotted
    df_city = df_sc[df_sc['city'] == city][::10]
    # legend='full' lets seaborn create one legend entry per location_id that is
    # actually drawn. Passing labels from an unordered set to ax.legend() assigned
    # them to the lines purely by position, so a label could end up on the wrong line.
    sns.lineplot(data=df_city, x='timestamp', y=pm_col, hue='location_id', ax=axis, legend='full')
    axis.set_title(f'{city} - {pm_label}', fontsize = 15)
