# Pipeline Climate Factors

## 1. Import Modules

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer, MinMaxScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import shapiro
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import classification_report,  confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## 2. Prepare Data

### 2.1 Load Data

In [5]:
df = pd.read_csv('clean data/final_data.csv', index_col=0)
df.head()

Unnamed: 0,Country,WS_MDG,WUE_SDG,WS_SDG,Temp,Rain,IRWR,ERWR,TRWR,Dep_ratio,rural_pop,urban_pop,HDI,r_u,r_u_access,pop_growth,mort_rate,GDP_pcp,life_ex
0,AFG,31.045462,0.923778,54.757019,14.074742,349.736945,47.15,18.18,65.33,0.27828,26558.609,8971.472,0.493,2.96034,0.601023,3.06,53.2,2226.0,63.4
1,AGO,0.475539,142.467836,1.871883,22.182196,960.024065,148.0,0.4,148.4,0.002695,10472.554,19311.639,0.576,0.542292,0.374005,3.44,58.6,7859.4,59.2
2,ALB,3.933775,6.656907,7.139423,12.754647,1079.459167,26.9,3.3,30.2,0.109272,1190.155,1740.032,0.789,0.683985,1.003161,-0.2,8.6,12227.4,78.0
3,ARE,1708.0,92.773763,1708.0,28.010773,64.449765,0.15,0.0,0.15,0.0,1292.709,8107.436,0.864,0.159447,1.004016,0.74,7.0,64243.0,77.2
4,ARG,4.301333,13.616564,10.456664,14.767043,598.5103,292.0,584.24,876.24,0.666758,3652.804,40618.237,0.832,0.08993,1.010101,1.08,10.2,23732.2,76.0


### 2.1 Add additional climate variables

In [6]:
df['IRWR_capita'] = df['IRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['ERWR_capita'] = df['ERWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['TRWR_capita'] = df['TRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)

In [7]:
df.head()

Unnamed: 0,Country,WS_MDG,WUE_SDG,WS_SDG,Temp,Rain,IRWR,ERWR,TRWR,Dep_ratio,...,HDI,r_u,r_u_access,pop_growth,mort_rate,GDP_pcp,life_ex,IRWR_capita,ERWR_capita,TRWR_capita
0,AFG,31.045462,0.923778,54.757019,14.074742,349.736945,47.15,18.18,65.33,0.27828,...,0.493,2.96034,0.601023,3.06,53.2,2226.0,63.4,1.327045e-06,5.116791e-07,1.838724e-06
1,AGO,0.475539,142.467836,1.871883,22.182196,960.024065,148.0,0.4,148.4,0.002695,...,0.576,0.542292,0.374005,3.44,58.6,7859.4,59.2,4.969079e-06,1.342994e-08,4.982509e-06
2,ALB,3.933775,6.656907,7.139423,12.754647,1079.459167,26.9,3.3,30.2,0.109272,...,0.789,0.683985,1.003161,-0.2,8.6,12227.4,78.0,9.180301e-06,1.126208e-06,1.030651e-05
3,ARE,1708.0,92.773763,1708.0,28.010773,64.449765,0.15,0.0,0.15,0.0,...,0.864,0.159447,1.004016,0.74,7.0,64243.0,77.2,1.59572e-08,0.0,1.59572e-08
4,ARG,4.301333,13.616564,10.456664,14.767043,598.5103,292.0,584.24,876.24,0.666758,...,0.832,0.08993,1.010101,1.08,10.2,23732.2,76.0,6.595734e-06,1.319689e-05,1.979262e-05


### 2.2 Split dataframe into chosen predictor and target variables

In [8]:
df_pred_climate = df.iloc[:, np.r_[4:6,9:10, 19:22]]
df_pred_socioec = df.iloc[:, 10:18]
df_target = df.iloc[:, 1:4]

In [9]:
df_pred_climate.head()

Unnamed: 0,Temp,Rain,Dep_ratio,IRWR_capita,ERWR_capita,TRWR_capita
0,14.074742,349.736945,0.27828,1.327045e-06,5.116791e-07,1.838724e-06
1,22.182196,960.024065,0.002695,4.969079e-06,1.342994e-08,4.982509e-06
2,12.754647,1079.459167,0.109272,9.180301e-06,1.126208e-06,1.030651e-05
3,28.010773,64.449765,0.0,1.59572e-08,0.0,1.59572e-08
4,14.767043,598.5103,0.666758,6.595734e-06,1.319689e-05,1.979262e-05


In [10]:
df_pred_socioec.head()

Unnamed: 0,rural_pop,urban_pop,HDI,r_u,r_u_access,pop_growth,mort_rate,GDP_pcp
0,26558.609,8971.472,0.493,2.96034,0.601023,3.06,53.2,2226.0
1,10472.554,19311.639,0.576,0.542292,0.374005,3.44,58.6,7859.4
2,1190.155,1740.032,0.789,0.683985,1.003161,-0.2,8.6,12227.4
3,1292.709,8107.436,0.864,0.159447,1.004016,0.74,7.0,64243.0
4,3652.804,40618.237,0.832,0.08993,1.010101,1.08,10.2,23732.2


## 3. dimensionality reduction

### 3.1. PCA climate

In [11]:
df_climate_st = StandardScaler().fit_transform(df_pred_climate)


In [12]:
pca_cl = PCA(n_components=3)

In [13]:
pca_cl_fit = pca_cl.fit_transform(df_climate_st)

In [14]:
pca_cl.components_


array([[-0.23840768,  0.16543886, -0.14849401,  0.65429923,  0.15648811,
         0.66418768],
       [-0.05135407, -0.36973778,  0.72334101, -0.01634961,  0.56864627,
         0.1175101 ],
       [ 0.6406001 ,  0.63537163,  0.02921009, -0.06219761,  0.4238429 ,
         0.03962107]])

component 1 loads high on IRWR_capita, TRWR_capita --> water resources

\n component 2 loads high on dep_ratio, ERWR --> externality/dependence

\n component 3 loads high on temp,rain, erwr --> climate factor

In [15]:
print('Explained Variance = ', pca_cl.explained_variance_)

Explained Variance =  [2.18254258 1.39407877 1.33066562]


### 3.2. PCA socioec

In [16]:
df_socioec_st = StandardScaler().fit_transform(df_pred_socioec)
pca_sec = PCA(n_components=3)
pca_sec_fit = pca_sec.fit_transform(df_socioec_st)

In [17]:
df_pred_socioec.columns

Index(['rural_pop', 'urban_pop', 'HDI', 'r_u', 'r_u_access', 'pop_growth',
       'mort_rate', 'GDP_pcp'],
      dtype='object')

In [18]:
pca_sec.components_


array([[ 0.02177992, -0.05194561, -0.49463955,  0.34170165, -0.4104213 ,
         0.30293245,  0.4663791 , -0.39708413],
       [ 0.69910472,  0.69130434, -0.02950557,  0.06245959,  0.05920273,
        -0.11689803,  0.00446021, -0.1067198 ],
       [ 0.09721653,  0.15336383,  0.08364942, -0.40114641, -0.23298698,
         0.74314214,  0.07549006,  0.43228565]])

Component 1 loads moderately negative on HDI, r_u_acces, GDP_pcp; moderately positive on mort_rate, pop_growth, r_u --> proxy for low development

Component 2 loads strongly on rural_pop, urban_pop --> proxy for population

Component 3 loads negative on r_u, high on pop_growth, moderate on HDI --> population growth with gdp?

In [19]:
print('Explained Variance = ', pca_sec.explained_variance_)

Explained Variance =  [3.88269356 1.88645794 0.86243428]


low explained variance for PC3 let's drop

In [20]:
df_socioec_st = StandardScaler().fit_transform(df_pred_socioec)
pca_sec = PCA(n_components=2)
pca_sec_fit = pca_sec.fit_transform(df_socioec_st)

let's take the two principal components for socioec and three principal components for climate as new explanatory variables 

In [21]:
ev_pca = pd.DataFrame(data = np.column_stack((pca_sec_fit,pca_cl_fit)),
                     columns = ['sec_PC1','sec_PC2','cl_PCA1',
                                'cl_PCA2','cl_PCA3'])
ev_pca

Unnamed: 0,sec_PC1,sec_PC2,cl_PCA1,cl_PCA2,cl_PCA3
0,3.187162,-0.149055,-0.402480,0.245839,-1.215637
1,2.926301,-0.469594,-0.296487,-0.781350,-0.045688
2,-1.368661,-0.155260,0.116140,-0.466997,-0.676084
3,-2.555001,-0.507111,-0.816780,-0.375462,-0.351853
4,-1.570293,-0.059512,-0.070081,1.674398,-0.432794
...,...,...,...,...,...
117,-2.728887,1.899395,0.161417,-0.328137,-1.398394
118,0.272298,-0.124219,-0.610957,1.232120,-1.292956
119,-0.068673,0.503873,-0.335298,0.376901,1.251310
120,0.162996,0.022160,-0.484369,-0.183493,-0.818301


## 4. models


In [22]:
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
ev_pca_tr, ev_pca_te, WS_tr, WS_te = train_test_split(
                                ev_pca, df['WS_SDG'], test_size=0.2, random_state=42)

In [24]:
regr = LinearRegression()

In [25]:
reg_pca = regr.fit(ev_pca_tr,WS_tr)
reg_pca_tr_pred = regr.predict(ev_pca_tr)
reg_pca_te_pred = regr.predict(ev_pca_te)

In [26]:
print('Coefficients: \n', regr.coef_)
print('Mean squared error: %.2f'
      % mean_squared_error(WS_tr, reg_pca_tr_pred))
print('train Coefficient of determination: %.2f'
      % r2_score(WS_tr, reg_pca_tr_pred))

Coefficients: 
 [-23.4663472  -11.40768139 -21.79517408  -4.16215389  -5.71340873]
Mean squared error: 40331.08
train Coefficient of determination: 0.07


In [27]:
print('Mean squared error: %.2f'
      % mean_squared_error(WS_te, reg_pca_te_pred))
print('test Coefficient of determination: %.2f'
      % r2_score(WS_te, reg_pca_te_pred))

Mean squared error: 562795.01
test Coefficient of determination: 0.00


perfectly predictive, as all models should be

In [28]:
rfregr = RandomForestRegressor()

In [29]:
rfreg_pca = rfregr.fit(ev_pca_tr,WS_tr)
rfreg_pca_tr_pred = rfregr.predict(ev_pca_tr)
rfreg_pca_te_pred = rfregr.predict(ev_pca_te)

In [31]:
print('Mean squared error: %.2f'
      % mean_squared_error(WS_tr, rfreg_pca_tr_pred))
print('train Coefficient of determination: %.2f'
      % r2_score(WS_tr, rfreg_pca_tr_pred))

Mean squared error: 6057.07
train Coefficient of determination: 0.86


In [32]:
print('Mean squared error: %.2f'
      % mean_squared_error(WS_te, rfreg_pca_te_pred))
print('test Coefficient of determination: %.2f'
      % r2_score(WS_te, rfreg_pca_te_pred))

Mean squared error: 528603.04
test Coefficient of determination: 0.07


Once again, picture perfect prediction

# Using flexible models with this data essentially means we're just fitting random noise

In [62]:
for i in range(1,10):
    randomnoiseset = pd.DataFrame(data = np.random.randn(122,5))
    noise_tr, noise_te, WS_tr, WS_te = train_test_split(
                                    randomnoiseset, df['WS_SDG'], test_size=0.2, random_state=42)
    rfregr = RandomForestRegressor()
    rfreg_noise = rfregr.fit(noise_tr,WS_tr)
    rfreg_noise_tr_pred = rfregr.predict(noise_tr)
    rfreg_noise_te_pred = rfregr.predict(noise_te)
    print('Mean squared error: %.2f'
          % mean_squared_error(WS_tr, rfreg_noise_tr_pred))
    print('train Coefficient of determination: %.2f'
          % r2_score(WS_tr, rfreg_noise_tr_pred))
    print('Mean squared error: %.2f'
          % mean_squared_error(WS_te, rfreg_noise_te_pred))
    print('test Coefficient of determination: %.2f'
          % r2_score(WS_te, rfreg_noise_te_pred))

Mean squared error: 6661.93
train Coefficient of determination: 0.85
Mean squared error: 606750.58
test Coefficient of determination: -0.07
Mean squared error: 11076.69
train Coefficient of determination: 0.74
Mean squared error: 606293.58
test Coefficient of determination: -0.07
Mean squared error: 7593.29
train Coefficient of determination: 0.82
Mean squared error: 580705.23
test Coefficient of determination: -0.03
Mean squared error: 8859.45
train Coefficient of determination: 0.80
Mean squared error: 576867.75
test Coefficient of determination: -0.02
Mean squared error: 8628.61
train Coefficient of determination: 0.80
Mean squared error: 594251.23
test Coefficient of determination: -0.05
Mean squared error: 8186.43
train Coefficient of determination: 0.81
Mean squared error: 620477.38
test Coefficient of determination: -0.10
Mean squared error: 4661.51
train Coefficient of determination: 0.89
Mean squared error: 497889.38
test Coefficient of determination: 0.12
Mean squared error: 

By pure chance you get a test coefficient of determination of 0.12 once, with purely random data, so if we fit too many models with this data by pure chance we will get a model once that fits the test set somewhat okay-ish, even with pure noise

If we want to work in this way we'd need to introduce a validation set which further reduces our data