# Estimate Change during the Lockdown Period & Train a Lockdown Model by Using Transfer Learning


### Read in and prepare data

As input are needed:
- the name of the chosen variable (i.e. air pollutant)
- a dataframe containing the observations of weather variables and air pollutants (as produced by the script "Generate Variables")
- a dataframe containing the selected explanatory variables for the models for each station (as produced by the script "Model Selection")
- the dates of beginning and end of the lockdown in the country/region of observation (here read in from the file 'dictionaries')

In [1]:
# get data 
import numpy as np
import pandas as pd
from ipywidgets import interact, interact_manual
from pygam import LinearGAM, LogisticGAM, s, f,l, GAM, te
from functions import GAMf, time_plot_conf, curves
from sklearn.metrics import mean_squared_error
from itertools import chain
import matplotlib.pyplot as plt
from datetime import datetime
from dictionaries import starts, ends, che_classes, beijing_classes, wuhan_classes, at_classes, loc_classes

In [2]:
loc = 'at'

In [3]:
# choose pollutant to examine
in_var = 'no2'
IN_VAR = in_var.upper()
loc = 'che'

In [4]:
# read in dataframe with chosen variables as produced by the variable selection algorithm
chosen_vars = pd.read_csv('./' + loc+ '/'+in_var+'_chosen_vars.csv', index_col =0)

In [6]:
# read in dataset of explanatory & independent variables observations as produced by the variable generation script
data_path =  './' + loc+ '/'+ 'df_' + loc + '2.csv'
df = pd.read_csv(data_path, index_col = 0)

In [7]:
# convert index to datetime format
df.index = pd.to_datetime(df.index)

In [8]:
# as we are working with a log normal distribution, 0 values for the independent variable cannot be processed 
# they are replaced by a very low non zero number
df[in_var] = df[in_var].replace(0,0.0001)

In [9]:
# drop all columns with NaN values, infinite values or values negative values for the observed variable
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=[in_var])
df = df.drop(df[df[in_var]<0].index)

In [10]:
# get vector with names of all cities
cities = np.unique(df['city'].values)

In [11]:
# create a fake variable that is always 0
df['weekday_d'] = 0

In [12]:
# define the dataset before, during and after the lockdown
df_pre_ld = df[df.index< datetime.strptime(starts[loc], '%m/%d/%Y')]
df_ld = df[df.index>=datetime.strptime(starts[loc], '%m/%d/%Y')]
df_ld = df_ld[df_ld.index<datetime.strptime(ends[loc], '%m/%d/%Y')]
df_post_ld = df[df.index> datetime.strptime(ends[loc], '%m/%d/%Y')]

# Estimation of the Change during the Lockdown & Training of the LD-Model

In a frist step, a model is trained on the pre-Lockdown data as in the "Model Evaluation" script. In the next step, the weekday variable is fitted newly to the data of the lockdown period. This is, as it can be assumed that relationships between pollutants and weather variables will stay constant also during the lockdown period. However, the weekday variable is simultaneously a proxy for traffic which is expected to change its pattern during the lockdown. This offers an opportunity to train a model for the lockdown conditions.

In [13]:
# function to calculate the transfer model
def GAM_transfer_covid(gam, df_test, chosen_vars, city):
    
    # filter for city
    df_test = df_test[df_test['city']==city]
    
    # generate list of explanatory variables with dummy name instead of true transfer variable name
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
    #print(ex_vars_dummy)
    
    # predict based on the values of the test dataset expect with weekday being always 0
    pred_0 = gam.predict(df_test[ex_vars_dummy])
    df_transfer = pd.DataFrame(pd.np.column_stack([df_test[['weekday',in_var]], np.exp(pred_0)]))
    df_transfer.columns = ['weekday', 'pred', in_var]
    df_transfer['diff'] = df_transfer['pred'] - df_transfer[in_var]
    
    # plot newly fitted curve
    gamT = LinearGAM(f(0), fit_intercept = False)
    gamT.fit(df_transfer['weekday'], df_transfer['diff'])
    return(gamT)


In [14]:
from dateutil.relativedelta import relativedelta
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# initalize dataframes
names = ['RMSE', 'R_squared', 'FAC2', 'testdays', 'traidays','ratio','avg_err']
gam_pre_lockdown = pd.DataFrame(data = None, index = cities, columns = ['model'])
out_pre_lockdown = pd.DataFrame(data=None, index = cities, columns = names)
gam_lockdown = pd.DataFrame(data = None, index = cities, columns = ['model'])
df_compare_all = pd.DataFrame(data=None)

# define start and end date of lockdown
start = datetime.strptime(starts[loc], '%m/%d/%Y')
end = datetime.strptime(ends[loc], '%m/%d/%Y')

# transform index to datetime format
df.index = pd.to_datetime(df.index)


# drop observations with negative, infinite or NaN values for the pollutant
df = df.dropna(subset=[in_var])
df = df.drop(df[df[in_var]<0].index)
df = df.replace([np.inf, -np.inf], np.nan)

# perform the estimation for each city
for city in cities:

    # subset dataset to city
    df_city = df[df['city']==city]
    
    # create a dataframe only containing the data from 2020
    df_2020 =  df_city[df_city.index>=datetime.strptime(starts[loc], '%m/%d/%Y')]
    df_2020 = df_2020[df_2020.index<datetime.strptime(ends[loc], '%m/%d/%Y')]
    df_2020.index = df_2020.index.month.astype(str) + '/' + df_2020.index.day.astype(str)
    
    # create a dataframe only containing the data from 2019
    df_2019 = df_city[df_city.index<(end- relativedelta(years=1))]
    df_2019 = df_2019[df_2019.index>(start - relativedelta(years=1))]
    df_2019.index = df_2019.index.month.astype(str) + '/' + df_2019.index.day.astype(str)
    
    
    # specify list of explanatory variables according to the variables chosen by the selection algorithm
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    # specify the same list but with the fake weekday variable instead of the true values
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
    
    # fit the model on 24 months of pre lockdown data
    gam_pre_lockdown.loc[city, 'model'], out_pre_lockdown.loc[city, names[0]:names[6]], preds=  GAMf(df, in_var, ex_vars, city, starts[loc], ends[loc],24)
    gam_lockdown.loc[city, 'model'] = GAM_transfer_covid(gam_pre_lockdown.loc[city, 'model'], df_2020, chosen_vars, city)
   
   
    # create a dataframe to compare prediction and true values of 2019 & 2020
    df_compare = df_2020[[in_var,'city']]
    
    max_value = df_city[df_city.index<=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var].max()
    min_value = df_city[df_city.index>=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var].min()
    
    # use the max_value as the upper limit for the predictions
    pre_lockdown = np.exp(gam_pre_lockdown.loc[city, 'model'].predict(df_2020[ex_vars]))
    pre_lockdown[pre_lockdown>max_value] = max_value
    
    dummy = np.exp(gam_pre_lockdown.loc[city, 'model'].predict(df_2020[ex_vars_dummy]))
    dummy[dummy>max_value] = max_value

    # write the pre lockdown odel prediction, the dummy prediction (i.e. with weekday always equal to 0)
    # and the transfer model prediction into the dataframe
    df_compare['pre_lockdown_model'] = pre_lockdown
    df_compare['dummy'] = dummy
    df_compare['lockdown_model'] = df_compare['dummy'] + gam_lockdown.loc[city, 'model'].predict(df_2020['weekday'])
    df_compare = df_compare.join(df_2019[in_var], lsuffix='_2020', rsuffix = '_2019')
    
    df_compare_all = df_compare_all.append(df_compare)

df_compare_all['class'] = df_compare_all['city'].replace(loc_classes[loc])
df_compare_all = df_compare_all.rename(columns = {'pm10_2019': 'pm_2019', 'pm10_2020': 'pm_2020'})
    
 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

## Evaluate the Performance of the Transfer Model

The lockdown model is evaluated performing cross validation with random test sets of 3 days each. The remaining days of the lockdown are used for training.

In [15]:
def GAM_transfer_covid_eval(gam, df_covid, chosen_vars, city):
    
    # filter for city
    df_covid = df_covid[df_covid['city']==city]
    
    # generate list of explanatory variables with dummy name instead of true transfer variable name
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
    
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    rmse_normal =[]
    rmse_covid = []

    for i in range(1, int(len(df_covid)/3)):
    
        test_index = df_covid.index[3*(i-1):3*(i-1)+3]
        train_bool = (df_covid.index != test_index[0])*(df_covid.index != test_index[1])*(df_covid.index != test_index[2])
        test_bool = ~train_bool

        # predict based on the values of the test dataset expect with weekday being always 0
        pred_0 = gam.predict(df_covid[train_bool][ex_vars_dummy])
        df_transfer = pd.DataFrame(pd.np.column_stack([df_covid[train_bool][['weekday',in_var]], np.exp(pred_0)]))
        df_transfer.columns = ['weekday', 'pred', in_var]
        df_transfer['diff'] = df_transfer['pred'] - df_transfer[in_var]

        # get newly fitted GAM Model
        gamT = LinearGAM(f(0), fit_intercept = False)
        gamT.fit(df_transfer['weekday'], df_transfer['diff'])

        # predict test data with covid model
        pred_1 = np.exp(gam.predict(df_covid[test_bool][ex_vars_dummy]))
        df_transfer1 = pd.DataFrame(pd.np.column_stack([df_covid[test_bool][['weekday',in_var]], np.exp(pred_1)]))
        df_transfer1.columns = ['weekday', 'pred', in_var]
        pred_diff = gamT.predict(df_transfer1['weekday'])
        pred_covid = pred_1 + pred_diff

        # predict test data with normal model
        pred_norm = np.exp(gam.predict(df_covid[test_bool][ex_vars]))

        # gather rmse values
        rmse_covid.append(np.sqrt(mean_squared_error(pred_covid, df_covid[test_bool][in_var])))
        rmse_normal.append(np.sqrt(mean_squared_error(pred_norm, df_covid[test_bool][in_var])))
    
    len(rmse_normal)
    len(rmse_covid)
    return(gamT, rmse_normal, rmse_covid)


In [16]:
RMSE = pd.DataFrame(data=None, columns = list(chain(*[cities +'_normal', cities + '_covid'])))
gam_transfer_eval = pd.DataFrame(data = None, index = cities, columns = ['model'])
stats =[]
for city in cities:
    gam_transfer_eval.loc[city, 'model'],  RMSE.loc[:, city +'_normal'],  RMSE.loc[:, city +'_covid']  = GAM_transfer_covid_eval(gam_pre_lockdown.loc[city, 'model'], df_ld, chosen_vars, city)

## Post Lockdown Period

The period after lockdown is predicted with both models, pre-LD model and LD model. These predictions can serve as indicators if the specific city has already gone back to normal or behaves more like during the lockdown. 

In [17]:
def post_predict(df_post, gam_lockdown, gam_pre_lockdown, city):
    
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    # specify the same list but with the fake weekday variable instead of the true values
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
        
        
    df_post = df_post.dropna(subset=ex_vars)
    
    df_city = df_post[df_post['city'] == city]
    
    max_value = df_city[df_city.index<=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var].max()
    min_value = df_city[df_city.index>=datetime.strptime(starts[loc], '%m/%d/%Y')][in_var].min()
    
    pre_lockdown = np.exp(gam_pre_lockdown.predict(df_city[ex_vars]))
    pre_lockdown[pre_lockdown>max_value] = max_value
    
    dummy = np.exp(gam_pre_lockdown.predict(df_city[ex_vars_dummy]))
    dummy[dummy>max_value] = max_value
    
    post_pred = df_city
    post_pred['pre_lockdown_model'] = pre_lockdown
    post_pred['dummy_pred'] = dummy
    post_pred['diff_pred'] = gam_lockdown.predict(post_pred['weekday'])
    post_pred['lockdown_model'] = post_pred['dummy_pred'] + post_pred['diff_pred']
    post_pred['time'] = post_pred.index
    return(post_pred)

In [18]:
classes = loc_classes[loc]
performance = pd.DataFrame(data=None, columns =np.unique(np.array(list(classes.values()))))
post_preds = pd.DataFrame(data =None)
for city in cities:
    post_pred = post_predict(df_post_ld, gam_lockdown.loc[city, 'model'],gam_pre_lockdown.loc[city, 'model'], city) 
    post_preds = post_preds.append(post_pred)
post_preds['class'] =  post_preds['city'].replace(classes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

## "Predict" Pollutant Levels for the Year 2019 under Lockdown 
How much would have the pollutant levels reduced during 2019 if the country had been under lockdown the whole year?

In [19]:
def predict_2019(df, gam_lockdown, gam_pre_lockdown, city):
    
    ex_vars = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' not in ex_vars:
        ex_vars.append('weekday')
    
    # specify the same list but with the fake weekday variable instead of the true values
    ex_vars_dummy = chosen_vars.loc[city].dropna().values.tolist()
    if 'weekday' in ex_vars_dummy:
        w_index = ex_vars_dummy.index('weekday')
        ex_vars_dummy[w_index] = 'weekday_d'
    else:
        ex_vars_dummy.append('weekday_d')
        
        
    df_pred =df[df['city'] == city]
    df_pred =df_pred[df_pred['year']==2019]
    
    df_pred = df_pred.dropna(subset=ex_vars)
    
    df_pred['pre_lockdown_model'] = np.exp(gam_pre_lockdown.predict(df_pred[ex_vars]))
    df_pred['dummy_pred'] = np.exp(gam_pre_lockdown.predict(df_pred[ex_vars_dummy]))
    df_pred['diff_pred'] = gam_lockdown.predict(df_pred['weekday'])
    df_pred['lockdown_model'] = df_pred['dummy_pred'] + df_pred['diff_pred']
    #df_pred['diff'] = df_pred['normal_pred'] - df_pred['covid_pred'] 
    df_pred['time'] = df_pred.index
    df_pred['city'] = city
    return(df_pred)
    

In [20]:
pred_2019 = pd.DataFrame(data =None)
for city in cities:
    pred_ = predict_2019(df, gam_lockdown.loc[city, 'model'],gam_pre_lockdown.loc[city, 'model'], city)
    pred_2019 = pred_2019.append(pred_)

In [21]:
# add column for classes
pred_2019['class'] = pred_2019['city'].replace(classes)

In [22]:
df_compare_all.to_csv('./' + loc+ '/'+in_var+'_compare_all.csv')
RMSE.to_csv('./' + loc+  '/'+in_var+'_RMSE.csv')
post_preds.to_csv('./' + loc+  '/'+in_var+'_post_preds.csv')
pred_2019.to_csv('./' + loc+  '/'+in_var+'_pred_2019.csv')