# Pipeline climate factors

## 1. Import Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, FunctionTransformer, MinMaxScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, accuracy_score
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from scipy.stats import shapiro
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import classification_report,  confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## 2. Data Preprocessing

In [2]:
df_waterstress = pd.read_csv('clean data/water_stress.csv',index_col='Unnamed: 0')
df_climate = pd.read_csv('clean data/climate_factors.csv',index_col='Unnamed: 0')
df_socioec = pd.read_csv('clean data/socioec_factors.csv',index_col='Unnamed: 0')

### 2.1 Merge

Merging all three data sets together

In [3]:
df_waterstress.set_index('Country',inplace=True)
df_climate.set_index('Country',inplace=True)
df_socioec.set_index('Country',inplace=True)
df = pd.merge(df_waterstress,df_climate,left_index=True,right_index=True,how='outer')
df = pd.merge(df, df_socioec,left_index=True,right_index=True,how='outer')

### 2.2 Add additional climate variables

In [4]:
df['Total internal renewable water resources (IRWR) per capita'] = df['Total internal renewable water resources (IRWR)'] / (df['Total population (thousands)'] * 1000)
df['Total external renewable water resources (ERWR) per capita'] = df['Total external renewable water resources (ERWR)'] / (df['Total population (thousands)'] * 1000)
df['Total renewable water resources per capita'] = df['Total renewable water resources'] / (df['Total population (thousands)'] * 1000)

### 2.3 Drop Columns with low data availibility

All the NaNs in df, the index number is used later to refer to these variables instead of the full variable names

In [5]:
df.isna().sum().reset_index().rename(columns={'index':'Variables',0:'# NaN'})

Unnamed: 0,Variables,# NaN
0,Water stress (MDG),24
1,Water use efficiency (SDG),34
2,Water stress (SDG),24
3,Temperature (°C),7
4,Total Rainfall (mm),7
5,Total internal renewable water resources (IRWR),19
6,Total external renewable water resources (ERWR),7
7,Total renewable water resources,5
8,Dependency ratio,9
9,Total exploitable water resources,138


In [6]:
df = df.drop(df.columns[[9,44,46]],axis=1)

### 2.4 Remove NaNs

Only use one of both options

#### Option 1: Impute all missing NaNs using KNNImputer

So the NaNs are filled with values from countries that they resemble most based on the non missing values.

In [None]:
imputer = KNNImputer(n_neighbors=2,weights='distance')
df[:] = imputer.fit_transform(df)

--> Still need to test further if imputation is ok or not

#### Option 2: Remove countries with missing NaNs

In [7]:
df.dropna(inplace=True)

### 2.5 Scale Columns 

In [8]:
pd.DataFrame(df.columns)

Unnamed: 0,0
0,Water stress (MDG)
1,Water use efficiency (SDG)
2,Water stress (SDG)
3,Temperature (°C)
4,Total Rainfall (mm)
5,Total internal renewable water resources (IRWR)
6,Total external renewable water resources (ERWR)
7,Total renewable water resources
8,Dependency ratio
9,GDP per capita (current US$/inhab)


Make function to log transform

In [9]:
def log_transform(x):
    return np.log(x + 1)

Some commonly used scalers:

In [10]:
logscaler = FunctionTransformer(log_transform)
standardscaler = StandardScaler()
robustscaler = RobustScaler()
minmaxscaler = MinMaxScaler()

Log scaler on the target variables

In [11]:
df[0:3] = logscaler.fit_transform(df[0:3])

Standard scaler on the whole dataset

In [12]:
df[:] = standardscaler.fit_transform(df)

--> Still many options to play around with regarding the scaling. Only tested logscaling on target variables now (as Herbert suggested in modelling_climate_factors.ipynb).

### 2.6 Split dataframe into chosen predictor and target variables

In [13]:
df_pred_climate = df.iloc[:,np.r_[3:9, 45:48]]
df_pred_socioec = df.iloc[:,9:45]
df_target = df.iloc[:,0:3]

## 3. Dimensionality reduction

### 3.1 Target Variable

it takes the number of components until the total explained variance is 95%

In [14]:
pca = PCA(n_components=0.95)
pca.fit(df_target)

PCA(n_components=0.95)

In [15]:
print('Explained variance:',pca.explained_variance_ratio_)
pd.DataFrame(pca.components_,columns=df_target.columns)

Explained variance: [0.6808065  0.31904668]


Unnamed: 0,Water stress (MDG),Water use efficiency (SDG),Water stress (SDG)
0,0.693191,0.198744,0.692811
1,-0.138533,0.980047,-0.142533


--> Logical, so 1 principal component are both water stress scores (from MDG and SDG) and second component is the water use efficiency

In [16]:
df_target_pca = pd.DataFrame(pca.transform(df_target))

### 3.2 Climate Variables: Water resources

There are many variables for the water resources than are correlated, so tried to reduce the dimensions. Rainfall and temperature are not included.

In [17]:
pca = PCA(n_components=0.95)
pca.fit(df_pred_climate.iloc[:,2:])

PCA(n_components=0.95)

In [18]:
print('Explained variance:',pca.explained_variance_ratio_)
pd.DataFrame(pca.components_,columns=df_pred_climate.iloc[:,2:].columns)

Explained variance: [0.39547784 0.27906268 0.20021092 0.08867165]


Unnamed: 0,Total internal renewable water resources (IRWR),Total external renewable water resources (ERWR),Total renewable water resources,Dependency ratio,Total internal renewable water resources (IRWR) per capita,Total external renewable water resources (ERWR) per capita,Total renewable water resources per capita
0,0.532581,0.511428,0.568173,0.017996,0.194406,0.197442,0.23426
1,-0.16111,-0.206302,-0.188279,-0.149235,0.660238,0.097809,0.654432
2,-0.256921,0.153722,-0.146731,0.692417,-0.075626,0.629901,0.083047
3,-0.007198,-0.089753,-0.03405,-0.654425,-0.239213,0.70877,-0.053691


Principale Components:

0. Large internal and external water resources
1. Only large internal water resources per capita 
2. Only large external water resources per capita + rely on external sources 
3. Don't know how to make sense on the last component, maybe we can leave it out


In [19]:
df_pred_climate_pca = pd.DataFrame(np.concatenate((df_pred_climate.iloc[:,:2].to_numpy(),pca.transform(df_pred_climate.iloc[:,2:])),axis=1))

## 4. Modelling

### 4.1 Climate Factors

#### 4.1.1 Train test split 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df_pred_climate_pca,df_target_pca,random_state=0)

#### 4.1.2 Test multiple models

In [31]:
model_1 = RandomForestRegressor(max_depth=2,random_state=0)
model_2 = LinearRegression()
model_3 = Ridge(alpha=19)
model_4 = SVR()

models=[model_1, model_2, model_3,model_4]

--> feel free to try other models

In [32]:
for i in range(len(df_target_pca.columns)):
    print('Target Principale Component: ',df_target_pca.columns[i])
    for model in models:
        model.fit(X_train,y_train.iloc[:,i])
        print(' ',model)
        print('   Model score: ',model.score(X_train,y_train.iloc[:,i]))
        y_pred = model.predict(df_pred_climate_pca)
        print('   R²',r2_score(df_target_pca.iloc[:,i],y_pred))
        print('')
    print('')

Target Principale Component:  0
  RandomForestRegressor(max_depth=2, random_state=0)
   Model score:  0.7284031440837294
   R² 0.6649369469979624

  LinearRegression()
   Model score:  0.10220943595518006
   R² 0.0015633669778680748

  Ridge(alpha=19)
   Model score:  0.09136603163959334
   R² 0.0730585481399697

  SVR()
   Model score:  0.007619660884640167
   R² 0.016178367957373818


Target Principale Component:  1
  RandomForestRegressor(max_depth=2, random_state=0)
   Model score:  0.6192542844782101
   R² 0.5789688076041271

  LinearRegression()
   Model score:  0.10704725403034032
   R² 0.02175874034084857

  Ridge(alpha=19)
   Model score:  0.10049674432986155
   R² 0.0512653807582768

  SVR()
   Model score:  -0.005656292612457481
   R² 0.00641367517529301




**Option 1 (section 2.4) = imputing NaNs:** With the RandomForest regressor we have an R² of 0.78 on the water stress and 0.18 on the water use efficiency, so most of the water stress can be explained by the climate, but the water use efficiency not, as expected.

**Option 2 (section 2.4) = dropping NaNs:** With the RandomForest regressor we have an R² of about 0.60 for both water stress and water use efficiency.

--> So something about the imputation makes the water use efficiency predicitons worse

#### 4.1.3 Fine Tune models

##### Random Forest

In [23]:
depth = np.arange(2,20)

In [24]:
grid = GridSearchCV(RandomForestRegressor(random_state=0),{'max_depth': depth})

In [25]:
grid.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=0),
             param_grid={'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19])})

In [26]:
grid.best_params_

{'max_depth': 2}

--> best depth of the randomforest is 2

##### Ridge

In [27]:
alpha = np.arange(1,20)

In [28]:
grid = GridSearchCV(Ridge(),{'alpha': alpha})

In [29]:
grid.fit(X_train,y_train)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])})

In [30]:
grid.best_params_

{'alpha': 19}

--> best alpha for ridge is 19

## Leftover from first attempt to make pipeline

In [None]:
preprocess_1 = ColumnTransformer(remainder = 'passthrough', 
                               transformers = 
                               [("drop_cols", 'drop', [9,44,46])])

In [None]:
preprocess_2 = ColumnTransformer(remainder = 'passthrough', 
                               transformers = 
                               [("impute_nans", KNNImputer(n_neighbors=3), df.columns.tolist())])

In [None]:
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
        ])

In [None]:
pipe = pipe.fit(X_train, y_train)

### Plot Model

In [None]:
y_model = pipe.predict(X_test)

In [None]:
plt.scatter(y_test['Water stress (SDG)'], y_model[:,2], c='crimson')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

In [None]:
plt.scatter(y_test['Water stress (MDG)'], y_model[:,0], c='crimson')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()

In [None]:
plt.scatter(y_test['Water use efficiency (SDG)'], y_model[:,1], c='crimson')
plt.xlabel('Observed')
plt.ylabel('Predicted')
plt.show()