# Modelling Pipeline

## Import Modules

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer, MinMaxScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import shapiro
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import classification_report,  confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

## Prepare Data

### Load Data

In [3]:
df = pd.read_csv('clean data/final_data.csv', index_col=0)
df.head()

Unnamed: 0,Country,WS_MDG,WUE_SDG,WS_SDG,Temp,Rain,IRWR,ERWR,TRWR,Dep_ratio,rural_pop,urban_pop,HDI,r_u,r_u_access,pop_growth,mort_rate,GDP_pcp,life_ex
0,AFG,31.045462,0.923778,54.757019,14.074742,349.736945,47.15,18.18,65.33,0.27828,26558.609,8971.472,0.493,2.96034,0.601023,3.06,53.2,2226.0,63.4
1,AGO,0.475539,142.467836,1.871883,22.182196,960.024065,148.0,0.4,148.4,0.002695,10472.554,19311.639,0.576,0.542292,0.374005,3.44,58.6,7859.4,59.2
2,ALB,3.933775,6.656907,7.139423,12.754647,1079.459167,26.9,3.3,30.2,0.109272,1190.155,1740.032,0.789,0.683985,1.003161,-0.2,8.6,12227.4,78.0
3,ARE,1708.0,92.773763,1708.0,28.010773,64.449765,0.15,0.0,0.15,0.0,1292.709,8107.436,0.864,0.159447,1.004016,0.74,7.0,64243.0,77.2
4,ARG,4.301333,13.616564,10.456664,14.767043,598.5103,292.0,584.24,876.24,0.666758,3652.804,40618.237,0.832,0.08993,1.010101,1.08,10.2,23732.2,76.0


### Add additional climate variables

In [4]:
df['IRWR_capita'] = df['IRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['ERWR_capita'] = df['ERWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['TRWR_capita'] = df['TRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)

### Split dataframe into chosen predictor and target variables

In [5]:
df_pred_climate = df.iloc[:, np.r_[4:10, 19:22]]
df_pred_socioec = df.iloc[:, 10:19]
df_pred = df.iloc[:, 4:22]
df_target = df.iloc[:, 1:4]

## Pipeline: Predictor Variables

### Scalers

In [17]:
def log_transform(x):
    return np.log(x + 1)

In [18]:
logscaler = FunctionTransformer(log_transform)

### Setup pipeline

In [19]:
pipe_target = Pipeline([
    ('scaler', logscaler)
])

### Fit pipeline

In [20]:
df_target_ = pipe_target.fit_transform(df_target)

## Pipeline: Climate Variables

### Scalers

All scalers combinations to test:

In [9]:
scalers_to_test = [
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita'])
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita']),
                          ("standardscaler", StandardScaler(),
                           df_pred_climate.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita']),
                          ("robustscaler", RobustScaler(),
                           df_pred_climate.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita']),
                          ("minmaxscaler", MinMaxScaler(),
                           df_pred_climate.columns.values)
                      ])
]

### Dimensionality reduction

All number of components to test for the PCA analysis: 

In [10]:
pca_climate = PCA()

In [11]:
n_components_to_test = np.arange(3, 8)

### Regression model

All models + model parameters to test: 

In [12]:
model_1 = RandomForestRegressor(random_state=0)
max_depth_to_test = np.arange(2, 8)

model_2 = LinearRegression()

model_3 = Ridge()
alpha_to_test = np.power(10,np.arange(-2,0,0.02))

### List of parameters to test

Make list of parameter dictionaries (one for each model):

In [13]:
params = [
    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_1],
     'regressor__max_depth': max_depth_to_test},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_2]},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_3],
     'regressor__alpha': alpha_to_test}
]

### Train, test, split + setup pipeline

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pred_climate, df_target_, random_state=0)

In [15]:
pipe_climate = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', pca_climate),
    ('regressor', model_1)
])

### Gridsearch pipeline

#### Target variable 1: WS_MDG

In [16]:
gridsearch_climate_1 = GridSearchCV(
    pipe_climate, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WS_MDG'])

Fitting 5 folds for each of 2140 candidates, totalling 10700 fits


In [17]:
print(gridsearch_climate_1.best_score_)
print(gridsearch_climate_1.best_params_)

0.5323961594174167
{'reduce_dim__n_components': 7, 'regressor': RandomForestRegressor(max_depth=6, random_state=0), 'regressor__max_depth': 6, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x000001A95C66CA60>),
                                 ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                  'ERWR_capita', 'TRWR_capita']),
                                ('robustscaler', RobustScaler(),
                                 array(['Temp', 'Rain', 'IRWR', 'ERWR', 'TRWR', 'Dep_ratio', 'IRWR_capita',
       'ERWR_capita', 'TRWR_capita'], dtype=object))])}


#### Target variable 2: WUE_SDG

In [18]:
gridsearch_climate_2 = GridSearchCV(
    pipe_climate, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WUE_SDG'])

Fitting 5 folds for each of 2140 candidates, totalling 10700 fits


In [19]:
print(gridsearch_climate_2.best_score_)
print(gridsearch_climate_2.best_params_)

0.0062427867783801496
{'reduce_dim__n_components': 7, 'regressor': RandomForestRegressor(max_depth=5, random_state=0), 'regressor__max_depth': 5, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x000001A95C66CA60>),
                                 ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                  'ERWR_capita', 'TRWR_capita']),
                                ('standardscaler', StandardScaler(),
                                 array(['Temp', 'Rain', 'IRWR', 'ERWR', 'TRWR', 'Dep_ratio', 'IRWR_capita',
       'ERWR_capita', 'TRWR_capita'], dtype=object))])}


#### Target variable 3: WS_SDG

In [20]:
gridsearch_climate_3 = GridSearchCV(
    pipe_climate, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WS_SDG'])

Fitting 5 folds for each of 2140 candidates, totalling 10700 fits


In [21]:
print(gridsearch_climate_3.best_score_)
print(gridsearch_climate_3.best_params_)

0.5243820829795485
{'reduce_dim__n_components': 7, 'regressor': RandomForestRegressor(max_depth=7, random_state=0), 'regressor__max_depth': 7, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x000001A95C66CA60>),
                                 ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                  'ERWR_capita', 'TRWR_capita']),
                                ('robustscaler', RobustScaler(),
                                 array(['Temp', 'Rain', 'IRWR', 'ERWR', 'TRWR', 'Dep_ratio', 'IRWR_capita',
       'ERWR_capita', 'TRWR_capita'], dtype=object))])}


### Best fit model

#### Target variable 1: WS_MDG

In [24]:
pipe_climate_1 = gridsearch_climate_1.best_estimator_

In [25]:
pipe_climate_1.fit(X_train, y_train['WS_MDG'])
print('Model score: ', pipe_climate_1.score(X_test, y_test['WS_MDG']))
y_pred = pipe_climate_1.predict(df_pred_climate)
print('R²', r2_score(df_target_['WS_MDG'], y_pred))

Model score:  0.5854410039752573
R² 0.8280132185491356


#### Target variable 2: WUE_SDG

In [26]:
pipe_climate_2 = gridsearch_climate_2.best_estimator_

In [27]:
pipe_climate_2.fit(X_train, y_train['WUE_SDG'])
print('Model score: ', pipe_climate_2.score(X_test, y_test['WUE_SDG']))
y_pred = pipe_climate_2.predict(df_pred_climate)
print('R²', r2_score(df_target_["WUE_SDG"], y_pred))

Model score:  0.14211126438607113
R² 0.5628417177775362


#### Target variable 3: WS_SDG

In [28]:
pipe_climate_3 = gridsearch_climate_3.best_estimator_

In [29]:
pipe_climate_3.fit(X_train, y_train['WS_SDG'])
print('Model score: ', pipe_climate_3.score(X_test, y_test['WS_SDG']))
y_pred = pipe_climate_3.predict(df_pred_climate)
print('R²', r2_score(df_target_['WS_SDG'], y_pred))

Model score:  0.5157602060099814
R² 0.8177014414614187


## Pipeline: Socio-Economic Variables

### Scalers

All scalers combinations to test:

In [30]:
scalers_to_test = [
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp'])
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("standardscaler", StandardScaler(),
                           df_pred_socioec.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("robustscaler", RobustScaler(),
                           df_pred_socioec.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("minmaxscaler", MinMaxScaler(),
                           df_pred_socioec.columns.values)
                      ])
]

### Dimensionality reduction

All number of components to test for the PCA analysis: 

In [31]:
pca_socioec = PCA()

In [32]:
n_components_to_test = np.arange(3, 8)

### Regression model

All models + model parameters to test: 

In [33]:
model_1 = RandomForestRegressor(random_state=0)
max_depth_to_test = np.arange(2, 8)

model_2 = LinearRegression()

model_3 = Ridge()
alpha_to_test = np.power(10,np.arange(-2,0,0.02))

### List of parameters to test

Make list of parameter dictionaries (one for each model):

In [34]:
params = [
    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_1],
     'regressor__max_depth': max_depth_to_test},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_2]},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_3],
     'regressor__alpha': alpha_to_test}
]

### Train, test, split + setup pipeline

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pred_socioec, df_target_, random_state=0)

In [36]:
pipe_socioec = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', pca_socioec),
    ('regressor', model_1)
])

### Gridsearch pipeline

#### Target variable 1: WS_MDG

In [37]:
gridsearch_socioec_1 = GridSearchCV(
    pipe_socioec, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WS_MDG'])

Fitting 5 folds for each of 2140 candidates, totalling 10700 fits


In [38]:
print(gridsearch_socioec_1.best_score_)
print(gridsearch_socioec_1.best_params_)

0.16486168656847294
{'reduce_dim__n_components': 7, 'regressor': RandomForestRegressor(max_depth=6, random_state=0), 'regressor__max_depth': 6, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x000001A95C66CA60>),
                                 ['rural_pop', 'urban_pop', 'GDP_pcp']),
                                ('minmaxscaler', MinMaxScaler(),
                                 array(['rural_pop', 'urban_pop', 'HDI', 'r_u', 'r_u_access', 'pop_growth',
       'mort_rate', 'GDP_pcp', 'life_ex'], dtype=object))])}


#### Target variable 2: WUE_SDG

In [39]:
gridsearch_socioec_2 = GridSearchCV(
    pipe_socioec, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WUE_SDG'])

Fitting 5 folds for each of 2140 candidates, totalling 10700 fits


In [40]:
print(gridsearch_socioec_2.best_score_)
print(gridsearch_socioec_2.best_params_)

0.4247616291079849
{'reduce_dim__n_components': 6, 'regressor': RandomForestRegressor(max_depth=4, random_state=0), 'regressor__max_depth': 4, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x000001A95C66CA60>),
                                 ['rural_pop', 'urban_pop', 'GDP_pcp']),
                                ('standardscaler', StandardScaler(),
                                 array(['rural_pop', 'urban_pop', 'HDI', 'r_u', 'r_u_access', 'pop_growth',
       'mort_rate', 'GDP_pcp', 'life_ex'], dtype=object))])}


#### Target variable 3: WS_SDG

In [41]:
gridsearch_socioec_3 = GridSearchCV(
    pipe_socioec, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WS_SDG'])

Fitting 5 folds for each of 2140 candidates, totalling 10700 fits


In [42]:
print(gridsearch_socioec_3.best_score_)
print(gridsearch_socioec_3.best_params_)

0.23148922402470892
{'reduce_dim__n_components': 5, 'regressor': RandomForestRegressor(max_depth=3, random_state=0), 'regressor__max_depth': 3, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x000001A95C66CA60>),
                                 ['rural_pop', 'urban_pop', 'GDP_pcp']),
                                ('standardscaler', StandardScaler(),
                                 array(['rural_pop', 'urban_pop', 'HDI', 'r_u', 'r_u_access', 'pop_growth',
       'mort_rate', 'GDP_pcp', 'life_ex'], dtype=object))])}


### Best fit model

#### Target variable 1: WS_MDG

In [43]:
pipe_socioec_1 = gridsearch_socioec_1.best_estimator_

In [44]:
pipe_socioec_1.fit(X_train, y_train['WS_MDG'])
print('Model score: ', pipe_socioec_1.score(X_train, y_train['WS_MDG']))
y_pred = pipe_socioec_1.predict(df_pred_socioec)
print('R²', r2_score(df_target_['WS_MDG'], y_pred))

Model score:  0.8285903340685249
R² 0.6426520925761263


#### Target variable 2: WUE_SDG

In [45]:
pipe_socioec_2 = gridsearch_socioec_2.best_estimator_

In [46]:
pipe_socioec_2.fit(X_train, y_train['WUE_SDG'])
print('Model score: ', pipe_socioec_2.score(X_test, y_test['WUE_SDG']))
y_pred = pipe_socioec_2.predict(df_pred_socioec)
print('R²', r2_score(df_target_['WUE_SDG'], y_pred))

Model score:  0.5594247535878233
R² 0.7284534757150141


#### Target variable 3: WS_SDG

In [47]:
pipe_socioec_3 = gridsearch_socioec_3.best_estimator_

In [48]:
pipe_socioec_3.fit(X_train, y_train['WS_SDG'])
print('Model score: ', pipe_socioec_3.score(X_test, y_test['WS_SDG']))
y_pred = pipe_socioec_3.predict(df_pred_socioec)
print('R²', r2_score(df_target_['WS_SDG'], y_pred))

Model score:  0.03881003653261894
R² 0.4273816702154125


## Pipeline: Climate + Socio-Economic Variables

### Scalers

All scalers combinations to test:

In [8]:
scalers_to_test = [
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp'])
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("standardscaler", StandardScaler(),
                           df_pred.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("robustscaler", RobustScaler(),
                           df_pred.columns.values)
                      ]),
    ColumnTransformer(remainder='passthrough',
                      transformers=[
                          ("logscaler", logscaler, [
                           'Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita', 'ERWR_capita', 'TRWR_capita','rural_pop', 'urban_pop', 'GDP_pcp']),
                          ("minmaxscaler", MinMaxScaler(),
                           df_pred.columns.values)
                      ])
]

### Dimensionality reduction

All number of components to test for the PCA analysis: 

In [9]:
pca_pred = PCA()

In [10]:
n_components_to_test = np.arange(3, 15)

### Regression models

All models + model parameters to test: 

In [42]:
model_1 = RandomForestRegressor(random_state=0)
max_depth_to_test = np.arange(5, 15)

model_2 = LinearRegression()

model_3 = Ridge()
alpha_to_test = np.arange(0.1,20,0.1)

### List of parameters to test

Make list of parameter dictionaries (one for each model):

In [47]:
params = [
    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_1],
     'regressor__max_depth': max_depth_to_test},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_2]},

    {'scaler': scalers_to_test,
     'reduce_dim__n_components': n_components_to_test,
     'regressor': [model_3],
     'regressor__alpha': alpha_to_test}
]

### Train, test, split + setup pipeline

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pred, df_target_, random_state=0)

In [49]:
pipe_pred = Pipeline([
    ('scaler', scalers_to_test[0]),
    ('reduce_dim', pca_pred),
    ('regressor', model_1)
])

### Gridsearch pipeline

#### Target variable 1: WS_MDG

In [50]:
gridsearch_1 = GridSearchCV(
    pipe_pred, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WS_MDG'])

Fitting 5 folds for each of 2520 candidates, totalling 12600 fits


In [51]:
print(gridsearch_1.best_score_)
print(gridsearch_1.best_params_)

0.7107286375254427
{'reduce_dim__n_components': 12, 'regressor': Ridge(alpha=3.8000000000000003), 'regressor__alpha': 3.8000000000000003, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x0000026F292D5F70>),
                                 ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                  'ERWR_capita', 'TRWR_capita', 'rural_pop',
                                  'urban_pop', 'GDP_pcp'])])}


#### Target variable 2: WUE_SDG

In [52]:
gridsearch_2 = GridSearchCV(
    pipe_pred, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WUE_SDG'])

Fitting 5 folds for each of 2520 candidates, totalling 12600 fits


In [53]:
print(gridsearch_2.best_score_)
print(gridsearch_2.best_params_)

0.37343250310543874
{'reduce_dim__n_components': 14, 'regressor': Ridge(alpha=2.7), 'regressor__alpha': 2.7, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x0000026F292D5F70>),
                                 ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                  'ERWR_capita', 'TRWR_capita', 'rural_pop',
                                  'urban_pop', 'GDP_pcp'])])}


#### Target variable 3: WS_SDG

In [54]:
gridsearch_3 = GridSearchCV(
    pipe_pred, params, verbose=1, n_jobs=-1).fit(X_train, y_train['WS_SDG'])

Fitting 5 folds for each of 2520 candidates, totalling 12600 fits


In [55]:
print(gridsearch_3.best_score_)
print(gridsearch_3.best_params_)

0.7064338743330832
{'reduce_dim__n_components': 12, 'regressor': Ridge(alpha=2.8000000000000003), 'regressor__alpha': 2.8000000000000003, 'scaler': ColumnTransformer(remainder='passthrough',
                  transformers=[('logscaler',
                                 FunctionTransformer(func=<function log_transform at 0x0000026F292D5F70>),
                                 ['Rain', 'IRWR', 'ERWR', 'TRWR', 'IRWR_capita',
                                  'ERWR_capita', 'TRWR_capita', 'rural_pop',
                                  'urban_pop', 'GDP_pcp'])])}


### Best fit model

#### Target variable 1: WS_MDG

In [56]:
pipe_pred_1 = gridsearch_1.best_estimator_

In [57]:
pipe_pred_1.fit(X_train, y_train['WS_MDG'])
print('Model score: ', pipe_pred_1.score(X_test, y_test['WS_MDG']))
y_pred = pipe_pred_1.predict(df_pred)
print('R²', r2_score(df_target_['WS_MDG'], y_pred))

Model score:  0.6681031342149129
R² 0.7767260877841584


#### Target variable 2: WUE_SDG

In [58]:
pipe_pred_2 = gridsearch_2.best_estimator_

In [59]:
pipe_pred_2.fit(X_train, y_train['WUE_SDG'])
print('Model score: ', pipe_pred_2.score(X_test, y_test['WUE_SDG']))
y_pred = pipe_pred_2.predict(df_pred)
print('R²', r2_score(df_target_['WUE_SDG'], y_pred))

Model score:  0.6125779024810283
R² 0.6087918096530236


#### Target variable 3: WS_SDG

In [60]:
pipe_pred_3 = gridsearch_3.best_estimator_

In [61]:
pipe_pred_3.fit(X_train, y_train['WS_SDG'])
print('Model score: ', pipe_pred_3.score(X_test, y_test['WS_SDG']))
y_pred = pipe_pred_3.predict(df_pred)
print('R²', r2_score(df_target_['WS_SDG'], y_pred))

Model score:  0.6627483392955431
R² 0.7687793898866029
