# Modelling Pipeline

## 1. Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer, MinMaxScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import shapiro
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import classification_report,  confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## 2. Prepare Data

### 2.1 Load Data

In [2]:
df = pd.read_csv('clean data/final_data.csv', index_col=0)
df.head()

Unnamed: 0,Country,WS_MDG,WUE_SDG,WS_SDG,Temp,Rain,IRWR,ERWR,TRWR,Dep_ratio,rural_pop,urban_pop,HDI,r_u,r_u_access,pop_growth,mort_rate,GDP_pcp,life_ex
0,AFG,31.045462,0.923778,54.757019,14.074742,349.736945,47.15,18.18,65.33,0.27828,26558.609,8971.472,0.493,2.96034,0.601023,3.06,53.2,2226.0,63.4
1,AGO,0.475539,142.467836,1.871883,22.182196,960.024065,148.0,0.4,148.4,0.002695,10472.554,19311.639,0.576,0.542292,0.374005,3.44,58.6,7859.4,59.2
2,ALB,3.933775,6.656907,7.139423,12.754647,1079.459167,26.9,3.3,30.2,0.109272,1190.155,1740.032,0.789,0.683985,1.003161,-0.2,8.6,12227.4,78.0
3,ARE,1708.0,92.773763,1708.0,28.010773,64.449765,0.15,0.0,0.15,0.0,1292.709,8107.436,0.864,0.159447,1.004016,0.74,7.0,64243.0,77.2
4,ARG,4.301333,13.616564,10.456664,14.767043,598.5103,292.0,584.24,876.24,0.666758,3652.804,40618.237,0.832,0.08993,1.010101,1.08,10.2,23732.2,76.0


### 2.1 Add additional climate variables

In [3]:
df['IRWR_capita'] = df['IRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['ERWR_capita'] = df['ERWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['TRWR_capita'] = df['TRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)

### 2.2 Split dataframe into chosen predictor and target variables

In [89]:
df_pred_climate = df.iloc[:, np.r_[4:10, 19:22]]
df_pred_socioec = df.iloc[:, 10:18]
df_pred = df.iloc[:, 4:22]
df_target = df.iloc[:, 1:4]

## 3. Pipeline: Predictor Variables

### 3.1 Scalers

In [5]:
def log_transform(x):
    return np.log(x + 1)

In [6]:
logscaler = FunctionTransformer(log_transform)

### 3.2 Dimensionality reduction

In [7]:
pca_target = PCA(n_components=2)

### 3.3 Setup pipeline

In [8]:
pipe_target = Pipeline([
    ('scaler', logscaler),
    ('reduce_dim', pca_target)
])

### 3.4 Fit pipeline

In [9]:
df_target_ = pipe_target.fit_transform(df_target)

In [10]:
print('Principale components: \n', pipe_target.steps[1][1].components_)

Principale components: 
 [[ 0.7208814  -0.06902087  0.68961303]
 [ 0.06914256  0.99722682  0.0275312 ]]


Principal component 1 = Water stress\
Principal component 2 = Water use efficiency

In [11]:
print('Explained variance: \n',
      pipe_target.steps[1][1].explained_variance_ratio_)

Explained variance: 
 [0.75740679 0.2363991 ]


## 4. Pipeline: Climate Variables

### 4.1 Scalers

All scalers combinations to test:

In [15]:
scalers_to_test = [
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita'])
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita']),
                          ("standardscaler", StandardScaler(),
                           df_pred_climate.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita']),
                          ("robustscaler", RobustScaler(),
                           df_pred_climate.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita']),
                          ("minmaxscaler", MinMaxScaler(),
                           df_pred_climate.columns.values)
                      ])
]

### 4.2 Dimensionality reduction

All number of components to test for the PCA analysis: 

In [16]:
pca_climate = PCA()

In [17]:
n_components_to_test = np.arange(3, 8)

### 4.3 Regression model

All models + model parameters to test: 

In [25]:
model_1 = RandomForestRegressor(random_state=0)
max_depth_to_test = np.arange(2, 8)

model_2 = LinearRegression()

model_3 = Ridge()
alpha_to_test = np.arange(1, 8)

### 4.4 List of parameters to test

Make list of parameter dictionaries (one for each model):

In [30]:
params = [
    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_1],
     'regressor__max_depth': max_depth_to_test},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_2]},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_3],
     'regressor__alpha': alpha_to_test}
]

### 4.5 Train, test, split + setup pipeline

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pred_climate, df_target_, random_state=0)

In [32]:
pipe_climate = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', pca_climate),
    ('regressor', model_1)
])

### 4.6 Gridsearch pipeline

#### 4.6.1 Target principal component 1: water stress

In [33]:
gridsearch_1 = GridSearchCV(
    pipe_climate, params, verbose=1).fit(X_train, y_train[:, 0])

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


In [34]:
gridsearch_1.best_params_

{'reduce_dim__n_components': 7,
 'regressor': RandomForestRegressor(max_depth=7, random_state=0),
 'regressor__max_depth': 7,
 'scaler': ColumnTransformer(remainder='passthrough',
                   transformers=[('logscaler',
                                  FunctionTransformer(func=<function log_transform at 0x000001AE4331AAF0>),
                                  ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                   'ERWR_capita', 'TRWR_capita']),
                                 ('robustscaler', RobustScaler(),
                                  array(['Temp', 'Rain', 'IRWR', 'ERWR', 'TRWR', 'Dep_ratio', 'IRWR_capita',
        'ERWR_capita', 'TRWR_capita'], dtype=object))])}

#### 4.6.2 Target principal component 2: water use efficiency

In [35]:
gridsearch_2 = GridSearchCV(
    pipe_climate, params, verbose=1).fit(X_train, y_train[:, 1])

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


In [36]:
gridsearch_2.best_params_

{'reduce_dim__n_components': 7,
 'regressor': RandomForestRegressor(max_depth=5, random_state=0),
 'regressor__max_depth': 5,
 'scaler': ColumnTransformer(remainder='passthrough',
                   transformers=[('logscaler',
                                  FunctionTransformer(func=<function log_transform at 0x000001AE4331AAF0>),
                                  ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                   'ERWR_capita', 'TRWR_capita']),
                                 ('robustscaler', RobustScaler(),
                                  array(['Temp', 'Rain', 'IRWR', 'ERWR', 'TRWR', 'Dep_ratio', 'IRWR_capita',
        'ERWR_capita', 'TRWR_capita'], dtype=object))])}

### 4.7 Best fit model

#### 4.7.1 Target principal component 1: water stress

In [47]:
pipe_climate_1 = Pipeline([
    ('scaler', scalers_to_test[2]),
    ('reduce_dim', PCA(n_components=7)),
    ('regressor', RandomForestRegressor(max_depth=7, random_state=0))
])

In [48]:
pipe_climate_1.fit(X_train, y_train[:, 0])
print('Model score: ', pipe_climate_1.score(X_test, y_test[:, 0]))
y_pred = pipe_climate_1.predict(df_pred_climate)
print('R²', r2_score(df_target_[:, 0], y_pred))

Model score:  0.5207132828451724
R² 0.8194772324657671


#### 4.7.2 Target principal component 2: water use efficiency

In [57]:
pipe_climate_2 = Pipeline([
    ('scaler', scalers_to_test[2]),
    ('reduce_dim', PCA(n_components=7)),
    ('regressor', RandomForestRegressor(max_depth=5, random_state=0))
])

In [58]:
pipe_climate_2.fit(X_train, y_train[:, 1])
print('Model score: ', pipe_climate_2.score(X_test, y_test[:, 1]))
y_pred = pipe_climate_2.predict(df_pred_climate)
print('R²', r2_score(df_target_[:, 1], y_pred))

Model score:  0.1531348686223618
R² 0.5404516824943363


## 5. Pipeline: Socio-Economic Variables

### 5.1 Scalers

All scalers combinations to test:

In [71]:
scalers_to_test = [
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp'])
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("standardscaler", StandardScaler(),
                           df_pred_socioec.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("robustscaler", RobustScaler(),
                           df_pred_socioec.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("minmaxscaler", MinMaxScaler(),
                           df_pred_socioec.columns.values)
                      ])
]

### 5.2 Dimensionality reduction

All number of components to test for the PCA analysis: 

In [72]:
pca_socioec = PCA()

In [73]:
n_components_to_test = np.arange(3, 8)

### 5.3 Regression model

All models + model parameters to test: 

In [74]:
model_1 = RandomForestRegressor(random_state=0)
max_depth_to_test = np.arange(2, 8)

model_2 = LinearRegression()

model_3 = Ridge()
alpha_to_test = np.arange(1, 8)

### 5.4 List of parameters to test

Make list of parameter dictionaries (one for each model):

In [75]:
params = [
    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_1],
     'regressor__max_depth': max_depth_to_test},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_2]},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_3],
     'regressor__alpha': alpha_to_test}
]

### 5.5 Train, test, split + setup pipeline

In [76]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pred_socioec, df_target_, random_state=0)

In [77]:
pipe_socioec = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', pca_socioec),
    ('regressor', model_1)
])

### 5.6 Gridsearch pipeline

#### 5.6.1 Target principal component 1: water stress

In [78]:
gridsearch_1 = GridSearchCV(
    pipe_socioec, params, verbose=1).fit(X_train, y_train[:, 0])

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


In [79]:
gridsearch_1.best_params_

{'reduce_dim__n_components': 6,
 'regressor': RandomForestRegressor(max_depth=6, random_state=0),
 'regressor__max_depth': 6,
 'scaler': ColumnTransformer(remainder='passthrough',
                   transformers=[('logscaler',
                                  FunctionTransformer(func=<function log_transform at 0x000001AE4331AAF0>),
                                  ['rural_pop', 'urban_pop', 'GDP_pcp']),
                                 ('minmaxscaler', MinMaxScaler(),
                                  array(['rural_pop', 'urban_pop', 'HDI', 'r_u', 'r_u_access', 'pop_growth',
        'mort_rate', 'GDP_pcp'], dtype=object))])}

#### 5.6.2 Target principal component 2: water use efficiency

In [80]:
gridsearch_2 = GridSearchCV(
    pipe_climate, params, verbose=1).fit(X_train, y_train[:, 1])

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


In [83]:
gridsearch_2.best_params_

{'reduce_dim__n_components': 6,
 'regressor': RandomForestRegressor(max_depth=7, random_state=0),
 'regressor__max_depth': 7,
 'scaler': ColumnTransformer(remainder='passthrough',
                   transformers=[('logscaler',
                                  FunctionTransformer(func=<function log_transform at 0x000001AE4331AAF0>),
                                  ['rural_pop', 'urban_pop', 'GDP_pcp']),
                                 ('standardscaler', StandardScaler(),
                                  array(['rural_pop', 'urban_pop', 'HDI', 'r_u', 'r_u_access', 'pop_growth',
        'mort_rate', 'GDP_pcp'], dtype=object))])}

### 5.7 Best fit model

#### 5.7.1 Target principal component 1: water stress

In [84]:
pipe_socioec_1 = Pipeline([
    ('scaler', scalers_to_test[3]),
    ('reduce_dim', PCA(n_components=6)),
    ('regressor', RandomForestRegressor(max_depth=6, random_state=0))
])

In [86]:
pipe_socioec_1.fit(X_train, y_train[:, 0])
print('Model score: ', pipe_socioec_1.score(X_test, y_test[:, 0]))
y_pred = pipe_socioec_1.predict(df_pred_socioec)
print('R²', r2_score(df_target_[:, 0], y_pred))

Model score:  0.3601873578596295
R² 0.689661127240961


#### 5.7.2 Target principal component 2: water use efficiency

In [87]:
pipe_socioec_2 = Pipeline([
    ('scaler', scalers_to_test[1]),
    ('reduce_dim', PCA(n_components=6)),
    ('regressor', RandomForestRegressor(max_depth=7, random_state=0))
])

In [88]:
pipe_socioec_2.fit(X_train, y_train[:, 1])
print('Model score: ', pipe_socioec_2.score(X_test, y_test[:, 1]))
y_pred = pipe_socioec_2.predict(df_pred_socioec)
print('R²', r2_score(df_target_[:, 1], y_pred))

Model score:  0.6547175502991724
R² 0.8207094111758227


## 6. Pipeline: Climate + Socio-Economic Variables

### 6.1 Scalers

All scalers combinations to test:

In [90]:
scalers_to_test = [
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp'])
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("standardscaler", StandardScaler(),
                           df_pred.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("robustscaler", RobustScaler(),
                           df_pred.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("minmaxscaler", MinMaxScaler(),
                           df_pred.columns.values)
                      ])
]

### 6.2 Dimensionality reduction

All number of components to test for the PCA analysis: 

In [91]:
pca_pred = PCA()

In [93]:
n_components_to_test = np.arange(3, 15)

### 6.3 Regression model

All models + model parameters to test: 

In [98]:
model_1 = RandomForestRegressor(random_state=0)
max_depth_to_test = np.arange(2, 15)

model_2 = LinearRegression()

model_3 = Ridge()
alpha_to_test = np.arange(1, 15)

### 6.4 List of parameters to test

Make list of parameter dictionaries (one for each model):

In [95]:
params = [
    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_1],
     'regressor__max_depth': max_depth_to_test},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_2]},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_3],
     'regressor__alpha': alpha_to_test}
]

### 6.5 Train, test, split + setup pipeline

In [96]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pred, df_target_, random_state=0)

In [100]:
pipe_pred = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', pca_pred),
    ('regressor', model_1)
])

### 6.6 Gridsearch pipeline

#### 6.6.1 Target principal component 1: water stress

In [101]:
gridsearch_1 = GridSearchCV(
    pipe_pred, params, verbose=1).fit(X_train, y_train[:, 0])

Fitting 5 folds for each of 1344 candidates, totalling 6720 fits


In [102]:
gridsearch_1.best_params_

{'reduce_dim__n_components': 12,
 'regressor': Ridge(alpha=4),
 'regressor__alpha': 4,
 'scaler': ColumnTransformer(remainder='passthrough',
                   transformers=[('logscaler',
                                  FunctionTransformer(func=<function log_transform at 0x000001AE4331AAF0>),
                                  ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                   'ERWR_capita', 'TRWR_capita', 'rural_pop',
                                   'urban_pop', 'GDP_pcp'])])}

#### 6.6.2 Target principal component 2: water use efficiency

In [103]:
gridsearch_2 = GridSearchCV(
    pipe_pred, params, verbose=1).fit(X_train, y_train[:, 1])

Fitting 5 folds for each of 1344 candidates, totalling 6720 fits


In [104]:
gridsearch_2.best_params_

{'reduce_dim__n_components': 14,
 'regressor': Ridge(alpha=2),
 'regressor__alpha': 2,
 'scaler': ColumnTransformer(remainder='passthrough',
                   transformers=[('logscaler',
                                  FunctionTransformer(func=<function log_transform at 0x000001AE4331AAF0>),
                                  ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                   'ERWR_capita', 'TRWR_capita', 'rural_pop',
                                   'urban_pop', 'GDP_pcp'])])}

### 6.7 Best fit model

#### 6.7.1 Target principal component 1: water stress

In [109]:
pipe_pred_1 = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', PCA(n_components=12)),
    ('regressor', Ridge(alpha=4))
])

In [110]:
pipe_pred_1.fit(X_train, y_train[:, 0])
print('Model score: ', pipe_pred_1.score(X_test, y_test[:, 0]))
y_pred = pipe_pred_1.predict(df_pred)
print('R²', r2_score(df_target_[:, 0], y_pred))

Model score:  0.6384512159002249
R² 0.758258564340669


#### 6.7.2 Target principal component 2: water use efficiency

In [111]:
pipe_pred_2 = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', PCA(n_components=14)),
    ('regressor', Ridge(alpha=2))
])

In [112]:
pipe_pred_2.fit(X_train, y_train[:, 1])
print('Model score: ', pipe_pred_2.score(X_test, y_test[:, 1]))
y_pred = pipe_pred_2.predict(df_pred)
print('R²', r2_score(df_target_[:, 1], y_pred))

Model score:  0.684873871046202
R² 0.6583904593828581
