# Estimate Change during the Lockdown Period & Train a Lockdown Model by Using Transfer Learning


### Read in and prepare data

As input are needed:
- the name of the chosen variable (i.e. air pollutant)
- a dataframe containing the observations of weather variables and air pollutants (as produced by the script "Generate Variables")
- a dataframe containing the selected explanatory variables for the models for each station (as produced by the script "Model Selection")
- the dates of beginning and end of the lockdown in the country/region of observation (here read in from the file 'dictionaries')

In [1]:
# get data 
import numpy as np
import pandas as pd
from ipywidgets import interact, interact_manual
from pygam import LinearGAM, LogisticGAM, s, f,l, GAM, te
from functions import GAMf, time_plot_conf, curves
from sklearn.metrics import mean_squared_error
from itertools import chain
import matplotlib.pyplot as plt
from datetime import datetime
from dictionaries import starts, ends, che_classes, beijing_classes, wuhan_classes, at_classes, loc_classes

In [3]:
# choose pollutant to examine
in_var = 'no2'
IN_VAR = in_var.upper()
loc = 'che'

In [4]:
# read in dataframe with chosen variables as produced by the variable selection algorithm
chosen_vars = pd.read_csv('./' + loc+ '/'+in_var+'_chosen_vars.csv', index_col =0)

In [6]:
# read in dataset of explanatory & independent variables observations as produced by the variable generation script
data_path =  './' + loc+ '/'+ 'df2.csv'
df = pd.read_csv(data_path, index_col = 0)

In [7]:
# convert index to datetime format
df.index = pd.to_datetime(df.index)

In [8]:
# as we are working with a log normal distribution, 0 values for the independent variable cannot be processed 
# they are replaced by a very low non zero number
df[in_var] = df[in_var].replace(0,0.0001)

In [9]:
# drop all columns with NaN values, infinite values or values negative values for the observed variable
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=[in_var])
df = df.drop(df[df[in_var]<0].index)

In [10]:
# get vector with names of all cities
cities = np.unique(df['city'].values)

In [11]:
# create a fake variable that is always 0
df['weekday_d'] = 0

In [12]:
df

Unnamed: 0,pm10,co,press,h,t,o3,p,nox,no2,no,...,wy_lag2,ws_lag2,dew_lag2,h_lag3,t_lag3,wx_lag3,wy_lag3,ws_lag3,dew_lag3,weekday_d
2016-01-01,33.401818,,973.206818,16.944655,5.157727,14.421364,1.825000,49.456818,32.972727,40.180000,...,,,,,,,,,,0
2016-01-01,22.060000,0.372727,943.960000,23.259171,4.385217,19.469000,1.164348,23.590000,29.195000,10.382000,...,,,,,,,,,,0
2016-01-01,8.542174,,918.646087,33.345150,2.931304,42.839565,2.068261,6.845652,9.458696,2.370000,...,-0.173140,0.796522,2.636909,,,,,,,0
2016-01-01,68.412174,,,14.343772,6.094783,19.359565,1.064348,44.335652,37.208696,31.030435,...,-0.553752,0.731739,4.784522,16.944655,5.157727,0.984897,-0.173140,0.796522,2.636909,0
2016-01-01,57.794783,0.420435,973.026522,15.886190,5.638696,17.854783,2.638261,37.371739,32.528696,25.395217,...,-0.878313,1.009130,2.480261,23.259171,4.385217,0.832682,-0.553752,0.731739,4.784522,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-07,21.660000,,950.560000,17.930000,5.000000,0.840000,0.000000,26.340000,18.330000,20.900000,...,-0.511930,0.355000,4.033500,7.156596,8.440000,0.510524,-0.859863,1.102500,0.520333,0
2021-02-07,23.860000,,922.060000,17.438095,5.250000,3.560000,0.000000,31.190000,35.960000,15.450000,...,,137.134583,4.416500,10.756228,7.626667,-0.859027,-0.511930,0.355000,4.033500,0
2021-02-07,73.080000,,896.840000,5.131470,9.660000,43.440000,0.000000,5.680000,11.690000,-0.540000,...,0.066970,1.300000,2.930000,12.339212,7.040833,,,137.134583,4.416500,0
2021-02-07,31.450000,,,10.824769,7.590000,2.380000,0.000000,44.140000,33.650000,33.110000,...,-0.724533,0.780000,3.560000,17.930000,5.000000,0.997755,0.066970,1.300000,2.930000,0


In [13]:
# define the dataset before, during and after the lockdown
df_pre_ld = df[df.index< datetime.strptime(starts[loc], '%m/%d/%Y')]
df_ld = df[df.index>=datetime.strptime(starts[loc], '%m/%d/%Y')]
df_ld = df_ld[df_ld.index<datetime.strptime(ends[loc], '%m/%d/%Y')]
df_post_ld = df[df.index> datetime.strptime(ends[loc], '%m/%d/%Y')]

In [14]:
chosen_vars

Unnamed: 0,0,1,2,3,4,5,6
Opfikon_Balsberg,lagpca_halfyear,wy,weekday,pca,ws_lag3,wx,h
StGallen_Blumenbergplatz,t_lag3,ws,wx_lag1,wy_lag2,wx,,
StGallen_Stuelegg,t,ws,pca,wx_lag3,wy_lag3,h,
Zuerich_Schimmelstrasse,ws_lag3,day_year,lagpca_1week,lagpca_8weeks,wx_lag2,,
Zuerich_Stampfenbachstrasse,lagws_1week,day_year,lagpca_1week,lagpca_12weeks,wx,,


# Estimation of the Change during the Lockdown & Training of the LD-Model

In a frist step, a model is trained on the pre-Lockdown data as in the "Model Evaluation" script. In the next step, the weekday variable is fitted newly to the data of the lockdown period. This is, as it can be assumed that relationships between pollutants and weather variables will stay constant also during the lockdown period. However, the weekday variable is simultaneously a proxy for traffic which is expected to change its pattern during the lockdown. This offers an opportunity to train a model for the lockdown conditions.

In [15]:
# function to calculate the transfer model
def GAM_transfer_covid(gam, df_test, chosen_vars, city):
    
    # filter for city
    df_test = df_test[df_test['city']==city]
    
    # generate list of explanatory variables with dummy name instead of true transfer variable name
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
    #print(ex_vars_dummy)
    
    # predict based on the values of the test dataset expect with weekday being always 0
    pred_0 = gam.predict(df_test[ex_vars_dummy])
    df_transfer = pd.DataFrame(pd.np.column_stack([df_test[['weekday',in_var]], np.exp(pred_0)]))
    df_transfer.columns = ['weekday', 'pred', in_var]
    df_transfer['diff'] = df_transfer['pred'] - df_transfer[in_var]
    
    # plot newly fitted curve
    gamT = LinearGAM(f(0), fit_intercept = False)
    gamT.fit(df_transfer['weekday'], df_transfer['diff'])
    return(gamT)


In [16]:
from dateutil.relativedelta import relativedelta
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# initalize dataframes
names = ['RMSE', 'R_squared', 'FAC2', 'testdays', 'traidays','ratio','avg_err']
gam_pre_lockdown = pd.DataFrame(data = None, index = cities, columns = ['model'])
out_pre_lockdown = pd.DataFrame(data=None, index = cities, columns = names)
gam_lockdown = pd.DataFrame(data = None, index = cities, columns = ['model'])
df_compare_all = pd.DataFrame(data=None)
df_compare_all2 = pd.DataFrame(data=None)

# define start and end date of lockdown
start = datetime.strptime(starts[loc], '%m/%d/%Y')
end = datetime.strptime(ends[loc], '%m/%d/%Y')

# transform index to datetime format
df.index = pd.to_datetime(df.index)


# drop observations with negative, infinite or NaN values for the pollutant
df = df.dropna(subset=[in_var])
df = df.drop(df[df[in_var]<0].index)
df = df.replace([np.inf, -np.inf], np.nan)

# perform the estimation for each city
for city in cities:
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    
    # subset dataset to city
    print(df)
    df_city = df[df['city']==city]
    print(df_city)
    df_city = df_city.iloc[1:]
    df = df.dropna(subset=ex_vars)
    
    # create a dataframe only containing the data from 2020
    df_2020 =  df_city[df_city.index>=datetime.strptime(starts[loc], '%m/%d/%Y')]
    df_2020 = df_2020[df_2020.index<datetime.strptime(ends[loc], '%m/%d/%Y')]
    df_2020.index = df_2020.index.month.astype(str) + '/' + df_2020.index.day.astype(str)
    
    # create a dataframe only containing the data from 2019
    df_2019 = df_city[df_city.index<(end- relativedelta(years=1))]
    df_2019 = df_2019[df_2019.index>(start - relativedelta(years=1))]
    df_2019.index = df_2019.index.month.astype(str) + '/' + df_2019.index.day.astype(str)
    
    
    # specify list of explanatory variables according to the variables chosen by the selection algorithm
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    # specify the same list but with the fake weekday variable instead of the true values
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
    
    df_city = df_city.replace([np.inf, -np.inf], np.nan)
    df_city = df_city.dropna(subset=ex_vars_dummy)
    print(df_city)
    # fit the model on 24 months of pre lockdown data
    gam_pre_lockdown.loc[city, 'model'], out_pre_lockdown.loc[city, names[0]:names[6]], preds=  GAMf(df, in_var, ex_vars, city, starts[loc], ends[loc],24)
    gam_lockdown.loc[city, 'model'] = GAM_transfer_covid(gam_pre_lockdown.loc[city, 'model'], df_2020, chosen_vars, city)
   
   
    # create a dataframe to compare prediction and true values of 2019 & 2020
    df_compare = df_2020[[in_var,'city']]
    df_compare = df_compare.join(df_2019[in_var], lsuffix='_2020', rsuffix = '_2019')
    
   
    df_compare2 =df_city[[in_var,'city']]
    df_before = df_city[df_city.index<=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var]
    max_value = df_before[df_before.index>=datetime.strptime(starts[loc],'%m/%d/%Y') -relativedelta(years=2)].max()
    min_value = df_before[df_before.index>=datetime.strptime(starts[loc],'%m/%d/%Y') -relativedelta(years=2)].min()
    #print(city, max_value)
    # use the max_value as the upper limit for the predictions
    pre_lockdown = np.exp(gam_pre_lockdown.loc[city, 'model'].predict(df_2020[ex_vars]))
    print(city)
    #print(pre_lockdown)
    pre_lockdown[pre_lockdown>max_value] = max_value
    pre_lockdown[pre_lockdown<0]= 0
    print(df_compare2)
    dummy = np.exp(gam_pre_lockdown.loc[city, 'model'].predict(df_2020[ex_vars_dummy]))
    dummy[dummy>max_value] = max_value
    dummy[dummy<0] = 0

    lockdown = dummy + gam_lockdown.loc[city, 'model'].predict(df_2020['weekday'])
    lockdown[lockdown<0]= 0
    # write the pre lockdown odel prediction, the dummy prediction (i.e. with weekday always equal to 0)
    # and the transfer model prediction into the dataframe
    print(df_compare)
    #print(pre_lockdown)
    
    df_compare['pre_lockdown_model'] = pre_lockdown
    df_compare['dummy'] = dummy
    df_compare['lockdown_model'] = lockdown
    df_compare['weekday'] = df_2020['weekday']
   
    
    pre_lockdown2 = np.exp(gam_pre_lockdown.loc[city, 'model'].predict(df_city[ex_vars]))
    pre_lockdown2[pre_lockdown2>max_value] = max_value
    pre_lockdown2[pre_lockdown2<0]= 0
    df_compare2['pre_lockdown_model'] = pre_lockdown2
    dummy2= np.exp(gam_pre_lockdown.loc[city, 'model'].predict(df_city[ex_vars_dummy]))
    dummy2[dummy2>max_value] = max_value
    dummy2[dummy2<0] = 0
    df_compare2['dummy']= dummy2
    
    
    lockdown2 = dummy2 + gam_lockdown.loc[city, 'model'].predict(df_city['weekday'])
    lockdown2[lockdown2<0]= 0
    df_compare2['lockdown_model'] = lockdown2
    df_compare2['weekday'] = df_city['weekday']
    
    df_compare_all = df_compare_all.append(df_compare)
    df_compare_all2 = df_compare_all2.append(df_compare2)
    #print(df_compare2)
df_compare_all['class'] = df_compare_all['city'].replace(loc_classes[loc])
df_compare_all2['class'] = df_compare_all2['city'].replace(loc_classes[loc])
#df_compare_all2 = df_compare_all2.rename({'no2': 'no2_2020'}, axis=1)
df_compare_all2['year'] = df_compare_all2.index.year
#df_compare_all = df_compare_all.rename(columns = {'pm10_2019': 'pm_2019', 'pm10_2020': 'pm_2020'})
    
 

                 pm10        co       press          h         t         o3  \
2016-01-01  33.401818       NaN  973.206818  16.944655  5.157727  14.421364   
2016-01-01  22.060000  0.372727  943.960000  23.259171  4.385217  19.469000   
2016-01-01   8.542174       NaN  918.646087  33.345150  2.931304  42.839565   
2016-01-01  68.412174       NaN         NaN  14.343772  6.094783  19.359565   
2016-01-01  57.794783  0.420435  973.026522  15.886190  5.638696  17.854783   
...               ...       ...         ...        ...       ...        ...   
2021-02-07  21.660000       NaN  950.560000  17.930000  5.000000   0.840000   
2021-02-07  23.860000       NaN  922.060000  17.438095  5.250000   3.560000   
2021-02-07  73.080000       NaN  896.840000   5.131470  9.660000  43.440000   
2021-02-07  31.450000       NaN         NaN  10.824769  7.590000   2.380000   
2021-02-07  27.450000  0.350000  950.060000  12.518038  6.930000   1.970000   

                    p        nox        no2        

Opfikon_Balsberg
                  no2              city
2016-01-02  46.531250  Opfikon_Balsberg
2016-01-03  44.027000  Opfikon_Balsberg
2016-01-04  61.576087  Opfikon_Balsberg
2016-01-05  53.310417  Opfikon_Balsberg
2016-01-06  49.056250  Opfikon_Balsberg
...               ...               ...
2021-02-03  31.463333  Opfikon_Balsberg
2021-02-04  33.065909  Opfikon_Balsberg
2021-02-05  34.513750  Opfikon_Balsberg
2021-02-06  29.027500  Opfikon_Balsberg
2021-02-07  18.330000  Opfikon_Balsberg

[1857 rows x 2 columns]
       no2_2020              city   no2_2019
3/16  38.374583  Opfikon_Balsberg        NaN
3/17  29.481667  Opfikon_Balsberg        NaN
3/18  24.460000  Opfikon_Balsberg  36.611250
3/19  31.156364  Opfikon_Balsberg  32.167500
3/20  27.580455  Opfikon_Balsberg  28.043529
3/21  15.734167  Opfikon_Balsberg  25.625417
3/22   8.226250  Opfikon_Balsberg  46.034167
3/23  12.507391  Opfikon_Balsberg  48.646250
3/24  17.540870  Opfikon_Balsberg  42.176522
3/25  21.237083  Opfikon_Bal

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

StGallen_Blumenbergplatz
                  no2                      city
2016-01-03  23.827083  StGallen_Blumenbergplatz
2016-01-04  32.170417  StGallen_Blumenbergplatz
2016-01-05  41.509167  StGallen_Blumenbergplatz
2016-01-06  43.691250  StGallen_Blumenbergplatz
2016-01-07  37.970833  StGallen_Blumenbergplatz
...               ...                       ...
2021-01-21  58.051739  StGallen_Blumenbergplatz
2021-01-22  53.648696  StGallen_Blumenbergplatz
2021-01-23  24.670435  StGallen_Blumenbergplatz
2021-01-24  18.209565  StGallen_Blumenbergplatz
2021-01-25  30.335000  StGallen_Blumenbergplatz

[1837 rows x 2 columns]
       no2_2020                      city   no2_2019
3/16  44.805652  StGallen_Blumenbergplatz        NaN
3/17  43.222083  StGallen_Blumenbergplatz        NaN
3/18  39.882083  StGallen_Blumenbergplatz  27.709130
3/19  43.180435  StGallen_Blumenbergplatz  42.195217
3/20  38.906250  StGallen_Blumenbergplatz  45.118182
3/21  15.262500  StGallen_Blumenbergplatz  40.141667
3/2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

StGallen_Stuelegg
                 no2               city
2016-01-03  3.980000  StGallen_Stuelegg
2016-01-04  3.469091  StGallen_Stuelegg
2016-01-05  1.483333  StGallen_Stuelegg
2016-01-06  4.149167  StGallen_Stuelegg
2016-01-07  4.716667  StGallen_Stuelegg
...              ...                ...
2021-01-21  1.291739  StGallen_Stuelegg
2021-01-22  4.319167  StGallen_Stuelegg
2021-01-23  2.740833  StGallen_Stuelegg
2021-01-24  0.673913  StGallen_Stuelegg
2021-01-25  1.412917  StGallen_Stuelegg

[1840 rows x 2 columns]
       no2_2020               city   no2_2019
3/16   7.878261  StGallen_Stuelegg        NaN
3/17   8.910833  StGallen_Stuelegg        NaN
3/18   8.415833  StGallen_Stuelegg   0.311304
3/19   8.474783  StGallen_Stuelegg   4.731250
3/20   8.397500  StGallen_Stuelegg   6.834583
3/21   8.195000  StGallen_Stuelegg   9.410952
3/22   4.955217  StGallen_Stuelegg  13.797917
3/23   6.530000  StGallen_Stuelegg   7.632083
3/24   8.411250  StGallen_Stuelegg   6.379130
3/25  11.113478  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Zuerich_Schimmelstrasse
                  no2                     city
2016-01-02  29.550833  Zuerich_Schimmelstrasse
2016-01-03  30.700417  Zuerich_Schimmelstrasse
2016-01-04  30.750417  Zuerich_Schimmelstrasse
2016-01-05  40.336250  Zuerich_Schimmelstrasse
2016-01-06  36.629545  Zuerich_Schimmelstrasse
...               ...                      ...
2021-02-03  23.068750  Zuerich_Schimmelstrasse
2021-02-04  49.223750  Zuerich_Schimmelstrasse
2021-02-05  48.741429  Zuerich_Schimmelstrasse
2021-02-06  39.991250  Zuerich_Schimmelstrasse
2021-02-07  33.650000  Zuerich_Schimmelstrasse

[1846 rows x 2 columns]
       no2_2020                     city   no2_2019
3/16  52.457083  Zuerich_Schimmelstrasse        NaN
3/17  47.073750  Zuerich_Schimmelstrasse        NaN
3/18  50.321250  Zuerich_Schimmelstrasse  26.035833
3/19  51.056667  Zuerich_Schimmelstrasse  55.887083
3/20  40.882083  Zuerich_Schimmelstrasse  49.157083
3/21  18.025833  Zuerich_Schimmelstrasse  50.063750
3/22   9.711250  Zueric

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Zuerich_Stampfenbachstrasse
                  no2                         city
2016-01-02  24.575000  Zuerich_Stampfenbachstrasse
2016-01-03  25.492917  Zuerich_Stampfenbachstrasse
2016-01-04  24.303636  Zuerich_Stampfenbachstrasse
2016-01-05  30.752500  Zuerich_Stampfenbachstrasse
2016-01-06  26.818333  Zuerich_Stampfenbachstrasse
...               ...                          ...
2021-01-20  55.915000  Zuerich_Stampfenbachstrasse
2021-01-21  58.692917  Zuerich_Stampfenbachstrasse
2021-01-22  45.184583  Zuerich_Stampfenbachstrasse
2021-01-23  17.202083  Zuerich_Stampfenbachstrasse
2021-01-24   1.917917  Zuerich_Stampfenbachstrasse

[1835 rows x 2 columns]
       no2_2020                         city   no2_2019
3/16  48.606250  Zuerich_Stampfenbachstrasse        NaN
3/17  32.615238  Zuerich_Stampfenbachstrasse        NaN
3/18  37.110000  Zuerich_Stampfenbachstrasse  18.760417
3/19  42.867083  Zuerich_Stampfenbachstrasse  44.808750
3/20  31.012083  Zuerich_Stampfenbachstrasse  39.920417

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [234]:
df_compare_all2

Unnamed: 0,no2,city,pre_lockdown_model,dummy,lockdown_model,weekday,class,year
2016-01-02,46.531250,Opfikon_Balsberg,6.863269e+01,7.105043e+01,59.380050,5,high traffic,2016
2016-01-03,44.027000,Opfikon_Balsberg,5.899992e+01,7.066387e+01,56.407108,6,high traffic,2016
2016-01-04,61.576087,Opfikon_Balsberg,6.519213e+01,6.519213e+01,54.215681,0,high traffic,2016
2016-01-05,53.310417,Opfikon_Balsberg,5.901511e+01,5.615171e+01,48.618127,1,high traffic,2016
2016-01-06,49.056250,Opfikon_Balsberg,5.243056e+01,4.899550e+01,41.537411,2,high traffic,2016
...,...,...,...,...,...,...,...,...
2021-01-20,55.915000,Zuerich_Stampfenbachstrasse,3.783847e+01,3.691216e+01,26.140977,2,low traffic,2021
2021-01-21,58.692917,Zuerich_Stampfenbachstrasse,4.789666e+01,4.520505e+01,35.999391,3,low traffic,2021
2021-01-22,45.184583,Zuerich_Stampfenbachstrasse,4.345613e+01,4.185121e+01,28.858264,4,low traffic,2021
2021-01-23,17.202083,Zuerich_Stampfenbachstrasse,2.200683e+01,2.798045e+01,9.038810,5,low traffic,2021


In [18]:
df_compare_all

Unnamed: 0,no2_2020,city,no2_2019,pre_lockdown_model,dummy,lockdown_model,weekday,class
3/16,38.374583,Opfikon_Balsberg,,36.360338,36.360338,25.383890,0,high traffic
3/17,29.481667,Opfikon_Balsberg,,37.946756,36.105586,28.572008,1,high traffic
3/18,24.460000,Opfikon_Balsberg,36.611250,34.867268,32.582890,25.124802,2,high traffic
3/19,31.156364,Opfikon_Balsberg,32.167500,42.667897,39.086061,33.501277,3,high traffic
3/20,27.580455,Opfikon_Balsberg,28.043529,40.020162,36.502912,28.327356,4,high traffic
...,...,...,...,...,...,...,...,...
4/21,14.397083,Zuerich_Stampfenbachstrasse,18.341667,26.809157,25.962207,14.254965,1,low traffic
4/22,16.070000,Zuerich_Stampfenbachstrasse,21.331250,30.763931,30.010812,19.239624,2,low traffic
4/23,23.260000,Zuerich_Stampfenbachstrasse,33.136667,41.743617,39.397788,30.192129,3,low traffic
4/24,25.080417,Zuerich_Stampfenbachstrasse,30.844286,27.719828,26.696079,13.703137,4,low traffic


In [201]:
df_compare_all.to_csv('./' + loc+ '/'+in_var+'_compare_lockdown.csv')
df_compare_all2.to_csv('./' + loc+ '/'+in_var+'_compare_all.csv')

## Evaluate the Performance of the Transfer Model

The lockdown model is evaluated performing cross validation with random test sets of 3 days each. The remaining days of the lockdown are used for training.

In [96]:
def GAM_transfer_covid_eval(gam, df_covid, chosen_vars, city):
    
    # filter for city
    df_covid = df_covid[df_covid['city']==city]
    
    # generate list of explanatory variables with dummy name instead of true transfer variable name
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
    
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    rmse_normal =[]
    rmse_covid = []

    for i in range(1, int(len(df_covid)/3)):
    
        test_index = df_covid.index[3*(i-1):3*(i-1)+3]
        train_bool = (df_covid.index != test_index[0])*(df_covid.index != test_index[1])*(df_covid.index != test_index[2])
        test_bool = ~train_bool

        # predict based on the values of the test dataset expect with weekday being always 0
        pred_0 = gam.predict(df_covid[train_bool][ex_vars_dummy])
        df_transfer = pd.DataFrame(pd.np.column_stack([df_covid[train_bool][['weekday',in_var]], np.exp(pred_0)]))
        df_transfer.columns = ['weekday', 'pred', in_var]
        df_transfer['diff'] = df_transfer['pred'] - df_transfer[in_var]

        # get newly fitted GAM Model
        gamT = LinearGAM(f(0), fit_intercept = False)
        gamT.fit(df_transfer['weekday'], df_transfer['diff'])

        # predict test data with covid model
        pred_1 = np.exp(gam.predict(df_covid[test_bool][ex_vars_dummy]))
        df_transfer1 = pd.DataFrame(pd.np.column_stack([df_covid[test_bool][['weekday',in_var]], np.exp(pred_1)]))
        df_transfer1.columns = ['weekday', 'pred', in_var]
        pred_diff = gamT.predict(df_transfer1['weekday'])
        pred_covid = pred_1 + pred_diff

        # predict test data with normal model
        pred_norm = np.exp(gam.predict(df_covid[test_bool][ex_vars]))

        # gather rmse values
        rmse_covid.append(np.sqrt(mean_squared_error(pred_covid, df_covid[test_bool][in_var])))
        rmse_normal.append(np.sqrt(mean_squared_error(pred_norm, df_covid[test_bool][in_var])))
    
    len(rmse_normal)
    len(rmse_covid)
    return(gamT, rmse_normal, rmse_covid)


In [97]:
RMSE = pd.DataFrame(data=None, columns = list(chain(*[cities +'_normal', cities + '_covid'])))
gam_transfer_eval = pd.DataFrame(data = None, index = cities, columns = ['model'])
stats =[]
for city in cities:
    gam_transfer_eval.loc[city, 'model'],  RMSE.loc[:, city +'_normal'],  RMSE.loc[:, city +'_covid']  = GAM_transfer_covid_eval(gam_pre_lockdown.loc[city, 'model'], df_ld, chosen_vars, city)

## Post Lockdown Period

The period after lockdown is predicted with both models, pre-LD model and LD model. These predictions can serve as indicators if the specific city has already gone back to normal or behaves more like during the lockdown. 

In [248]:
def post_predict(df_post, gam_lockdown, gam_pre_lockdown, city):
    
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    # specify the same list but with the fake weekday variable instead of the true values
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
        
        
    df_post = df_post.dropna(subset=ex_vars)
    
    df_city = df_post[df_post['city'] == city]
    
    max_value = df_city[df_city.index<=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var].max()
    min_value = df_city[df_city.index>=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var].min()
    
    pre_lockdown = np.exp(gam_pre_lockdown.predict(df_city[ex_vars]))
    pre_lockdown[pre_lockdown>max_value] = max_value
    
    dummy = np.exp(gam_pre_lockdown.predict(df_city[ex_vars_dummy]))
    dummy[dummy>max_value] = max_value
    
    post_pred = df_city
    post_pred['pre_lockdown_model'] = pre_lockdown
    post_pred['dummy_pred'] = dummy
    post_pred['diff_pred'] = gam_lockdown.predict(post_pred['weekday'])
    post_pred['lockdown_model'] = post_pred['dummy_pred'] + post_pred['diff_pred']
    post_pred['time'] = post_pred.index
    return(post_pred)

In [249]:
classes = loc_classes[loc]
performance = pd.DataFrame(data=None, columns =np.unique(np.array(list(classes.values()))))
post_preds = pd.DataFrame(data =None)
for city in cities:
    post_pred = post_predict(df_post_ld, gam_lockdown.loc[city, 'model'],gam_pre_lockdown.loc[city, 'model'], city) 
    post_preds = post_preds.append(post_pred)
post_preds['class'] =  post_preds['city'].replace(classes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

## "Predict" Pollutant Levels for the Year 2019 under Lockdown 
How much would have the pollutant levels reduced during 2019 if the country had been under lockdown the whole year?

In [288]:
def predict_2019(df, gam_lockdown, gam_pre_lockdown, city):
    
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    # specify the same list but with the fake weekday variable instead of the true values
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
        
        
    df_pred =df[df['city'] == city]
    df_pred =df_pred[df_pred['year']==2019]
    
    df_pred = df_pred.dropna(subset=ex_vars)
    
    pre_lockdown = np.exp(gam_pre_lockdown.predict(df_pred[ex_vars]))
    pre_lockdown[pre_lockdown>max_value] = max_value
    pre_lockdown[pre_lockdown<0]= 0
    dummy = np.exp(gam_pre_lockdown.predict(df_pred[ex_vars_dummy]))
    dummy[dummy>max_value] = max_value
    dummy[dummy<0] = 0

    lockdown = dummy +  gam_lockdown.predict(df_pred['weekday'])
    lockdown[lockdown<0]= 0
    # write the pre lockdown odel prediction, the dummy prediction (i.e. with weekday always equal to 0)
    # and the transfer model prediction into the dataframe
    print(df_compare)
    #print(pre_lockdown)
    
    
    df_pred['pre_lockdown_model'] = pre_lockdown
    df_pred['dummy_pred'] = dummy
   # df_pred['diff_pred'] =lockdown
    df_pred['lockdown_model'] = lockdown
    #df_pred['diff'] = df_pred['normal_pred'] - df_pred['covid_pred'] 
    df_pred['time'] = df_pred.index
    df_pred['city'] = city
    return(df_pred)
    

In [289]:
pred_2019 = pd.DataFrame(data =None)
for city in cities:
    pred_ = predict_2019(df, gam_lockdown.loc[city, 'model'],gam_pre_lockdown.loc[city, 'model'], city)
    pred_2019 = pred_2019.append(pred_)

       no2_2020   city   no2_2019  pre_lockdown_model      dummy  \
time                                                               
1/23  12.458333  1334A        NaN           23.755223  22.369213   
1/24   6.500000  1334A  47.875000           21.085763  19.392310   
1/25   6.750000  1334A  55.916667           17.239599  15.763418   
1/26   4.541667  1334A  38.333333           13.897965  12.973017   
1/27   4.833333  1334A  38.791667           10.194701  10.194701   
...         ...    ...        ...                 ...        ...   
4/3   14.333333  1334A  17.130435           20.771982  19.103730   
4/4   13.125000  1334A  15.208333           17.062747  15.601709   
4/5   29.850000  1334A   9.291667           20.354800  19.000133   
4/6   18.291667  1334A  31.583333           19.413333  19.413333   
4/7   18.083333  1334A  29.333333           21.770321  19.906601   

      lockdown_model  weekday  
time                           
1/23       12.712662        3  
1/24       11.94391

In [31]:
#df_compare_all.to_csv('./lockdown2/'+ 'compare_all.csv')
RMSE.to_csv('./' + loc+'/RMSE.csv')
post_preds.to_csv('./' + loc+  '/post_preds.csv')
pred_2019.to_csv('./' + loc+  '/'+'pred_2019.csv')