# Pipeline climate factors

## 1. Import Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer, MinMaxScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import shapiro
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import classification_report,  confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## 2. Load Data

In [28]:
df = pd.read_csv('clean data/final_data.csv',index_col=0)
df

Unnamed: 0,Country,WS_MDG,WUE_SDG,WS_SDG,Temp,Rain,IRWR,ERWR,TRWR,Dep_ratio,rural_pop,urban_pop,HDI,r_u,r_u_access,pop_growth,mort_rate,GDP_pcp,life_ex
0,AFG,31.045462,0.923778,54.757019,14.074742,349.736945,47.15,18.18,65.33,0.278280,26558.609,8971.472,0.493,2.960340,0.601023,3.06,53.2,2226.0,63.4
1,AGO,0.475539,142.467836,1.871883,22.182196,960.024065,148.00,0.40,148.40,0.002695,10472.554,19311.639,0.576,0.542292,0.374005,3.44,58.6,7859.4,59.2
2,ALB,3.933775,6.656907,7.139423,12.754647,1079.459167,26.90,3.30,30.20,0.109272,1190.155,1740.032,0.789,0.683985,1.003161,-0.20,8.6,12227.4,78.0
3,ARE,1708.000000,92.773763,1708.000000,28.010773,64.449765,0.15,0.00,0.15,0.000000,1292.709,8107.436,0.864,0.159447,1.004016,0.74,7.0,64243.0,77.2
4,ARG,4.301333,13.616564,10.456664,14.767043,598.510300,292.00,584.24,876.24,0.666758,3652.804,40618.237,0.832,0.089930,1.010101,1.08,10.2,23732.2,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,USA,14.480160,42.378501,28.161984,8.218411,708.156050,2818.00,251.00,3069.00,0.081786,58215.947,266243.516,0.919,0.218657,0.987928,0.68,6.0,58237.6,79.0
121,UZB,120.523839,1.337755,168.913106,13.707806,234.213108,16.34,32.53,48.87,0.665644,15779.684,16130.957,0.707,0.978224,0.821320,1.68,20.4,6037.2,70.8
122,VNM,9.259150,2.349448,18.130315,24.860117,1879.263025,359.42,524.70,884.12,0.593471,61898.302,33642.498,0.690,1.839884,0.977800,1.04,17.2,6455.0,75.0
123,ZAF,37.740993,14.659097,62.055716,18.620716,403.002932,44.80,6.55,51.35,0.127556,19369.002,37348.154,0.704,0.518607,0.817269,1.52,28.6,12796.6,62.6


### 2.1 Add additional climate variables

In [34]:
df['IRWR_capita'] = df['IRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['ERWR_capita'] = df['ERWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)
df['TRWR_capita'] = df['TRWR'] / ((df['urban_pop'] + df['rural_pop']) * 1000)

### 2.2 Split dataframe into chosen predictor and target variables

In [43]:
df_pred_climate = df.iloc[:,np.r_[4:10, 19:22]]
df_pred_socioec = df.iloc[:,10:18]
df_target = df.iloc[:,1:4]

## 3. Pipeline

### 3.1 Scalers

Make function to log transform

In [39]:
def log_transform(x):
    return np.log(x + 1)

In [40]:
logscaler = FunctionTransformer(log_transform)
standardscaler = StandardScaler()
robustscaler = RobustScaler()
minmaxscaler = MinMaxScaler()

In [41]:
preprocess = ColumnTransformer(remainder = 'passthrough', 
                               transformers = 
                               [("logscaler", logscaler, [9,44,46]),
                                ("standardscaler", logscaler, [9,44,46]),
                               ])

### 3.2 Dimensionality reduction

In [None]:
pca = PCA(n_components=0.95)

### 3.3 Regression Models

In [37]:
model_1 = RandomForestRegressor(max_depth=2,random_state=0)
model_2 = LinearRegression()
model_3 = Ridge(alpha=19)
model_4 = SVR()

models=[model_1, model_2, model_3,model_4]

### 3.3 Assemble pipeline

In [None]:
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
        ])

In [None]:
pipe = pipe.fit(X_train, y_train)

### 3.4 Run pipeline

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_pred_climate_pca,df_target_pca,random_state=0)

## 3. Dimensionality reduction

### 3.1 Target Variable

In [15]:
print('Explained variance:',pca.explained_variance_ratio_)
pd.DataFrame(pca.components_,columns=df_target.columns)

Explained variance: [0.6808065  0.31904668]


Unnamed: 0,Water stress (MDG),Water use efficiency (SDG),Water stress (SDG)
0,0.693191,0.198744,0.692811
1,-0.138533,0.980047,-0.142533


Principale Components:

0. Large internal and external water resources
1. Only large internal water resources per capita 
2. Only large external water resources per capita + rely on external sources 
3. Don't know how to make sense on the last component, maybe we can leave it out


In [19]:
df_pred_climate_pca = pd.DataFrame(np.concatenate((df_pred_climate.iloc[:,:2].to_numpy(),pca.transform(df_pred_climate.iloc[:,2:])),axis=1))

## 4. Modelling

### 4.1 Climate Factors

#### 4.1.1 Train test split 

In [32]:
for i in range(len(df_target_pca.columns)):
    print('Target Principale Component: ',df_target_pca.columns[i])
    for model in models:
        model.fit(X_train,y_train.iloc[:,i])
        print(' ',model)
        print('   Model score: ',model.score(X_train,y_train.iloc[:,i]))
        y_pred = model.predict(df_pred_climate_pca)
        print('   R²',r2_score(df_target_pca.iloc[:,i],y_pred))
        print('')
    print('')

Target Principale Component:  0
  RandomForestRegressor(max_depth=2, random_state=0)
   Model score:  0.7284031440837294
   R² 0.6649369469979624

  LinearRegression()
   Model score:  0.10220943595518006
   R² 0.0015633669778680748

  Ridge(alpha=19)
   Model score:  0.09136603163959334
   R² 0.0730585481399697

  SVR()
   Model score:  0.007619660884640167
   R² 0.016178367957373818


Target Principale Component:  1
  RandomForestRegressor(max_depth=2, random_state=0)
   Model score:  0.6192542844782101
   R² 0.5789688076041271

  LinearRegression()
   Model score:  0.10704725403034032
   R² 0.02175874034084857

  Ridge(alpha=19)
   Model score:  0.10049674432986155
   R² 0.0512653807582768

  SVR()
   Model score:  -0.005656292612457481
   R² 0.00641367517529301




#### 4.1.3 Fine Tune models

##### Random Forest

In [23]:
depth = np.arange(2,20)

In [24]:
grid = GridSearchCV(RandomForestRegressor(random_state=0),{'max_depth': depth})

In [25]:
grid.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])})

In [26]:
grid.best_params_

{'max_depth': 2}

--> best depth of the randomforest is 2

##### Ridge

In [27]:
alpha = np.arange(1,20)

In [28]:
grid = GridSearchCV(Ridge(),{'alpha': alpha})

In [29]:
grid.fit(X_train,y_train)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])})

In [30]:
grid.best_params_

{'alpha': 19}

--> best alpha for ridge is 19