# Part 2 Preprocessing and Modeling
----

### Imports

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV
from sklearn import metrics

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn import metrics

### Load Data

In [2]:
# Seed NumPy's global random number generator with a fixed value for reproducibility
SEED = 42
np.random.seed(SEED)

In [3]:
# Read the cleaned dataset from disk and preview the first two rows
clean_data_path = './data/clean_data.csv'
df = pd.read_csv(clean_data_path)
df.head(2)

Unnamed: 0,Latitude,Longitude,country,commodity,year,loss_percentage,activity,food_supply_stage
0,33.0,65.0,Afghanistan,Wheat,2017,15.0,wsc,Whole supply chain
1,33.0,65.0,Afghanistan,Maize (corn),2017,14.95,wsc,Whole supply chain


In [4]:
# Create a function that evaluates each model
def eval_model(model, X=None, y=None):
    """Print R2, MSE and RMSE for a fitted model.

    Parameters
    ----------
    model : object with a ``predict`` method
        An already-fitted estimator (or fitted search object).
    X : array-like, optional
        Features to predict on. When omitted, the notebook-level ``X_test``
        is used (looked up at call time).
    y : array-like, optional
        True target values matching ``X``. When omitted, the notebook-level
        ``y_test`` is used (looked up at call time).

    Returns
    -------
    None
        The metrics are only printed, so calling this as the last line of a
        cell produces no extra cell output.

    Raises
    ------
    ValueError
        If only one of ``X`` / ``y`` is supplied; mixing caller data with the
        global test split would silently score mismatched rows.
    """
    if (X is None) != (y is None):
        raise ValueError("Pass both X and y, or neither to use X_test / y_test.")
    if X is None:
        # Backward-compatible default: evaluate on the held-out test split
        X, y = X_test, y_test

    # Make preds
    preds = model.predict(X)
    # R2 score and MSE
    r2 = metrics.r2_score(y, preds)
    mse = metrics.mean_squared_error(y, preds)
    # RMSE is in the same units as the target, unlike MSE
    rmse = np.sqrt(mse)

    print(model)
    print('---------------------------')
    print("R2:", r2)
    print("MSE:", mse)
    print("RMSE:", rmse)

#### Preprocessing (Regression)

In [5]:
# Set target and features
# 'Unnamed: 0' appears to be the row index that was saved along with the CSV
# (see the df.head() preview); it is not a real predictor, so it must not be
# fed to the models as a feature. errors='ignore' keeps this cell working if
# the column is ever removed at load time.
X = (
    df
    .drop(columns='loss_percentage')
    .drop(columns='Unnamed: 0', errors='ignore')
)
y = df['loss_percentage']

In [6]:
# One-hot encode the categorical columns; drop_first removes the first level of each
categorical_cols = ['country', 'commodity', 'activity', 'food_supply_stage']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [7]:
# Print the shape of the encoded feature matrix, then of the target
for frame in (X, y):
    print(frame.shape)

(23810, 404)
(23810,)


In [8]:
# Split features and target into train and test parts (fixed random_state for repeatability)
splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

In [9]:
# Standardise the features.
# The scaler is fitted on the training rows only, so no statistics from the
# test split leak into preprocessing; both splits are then transformed with it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Model 1 Random Forest (no tuning)

In [10]:
# Baseline random forest with default hyperparameters, fitted on the training split
rf = RandomForestRegressor().fit(X_train, y_train)

# Mean score across 3 cross-validation folds of the training split
rf_cv_scores = cross_val_score(rf, X_train, y_train, cv=3)
rf_cv_scores.mean()

0.6711981108090667

In [11]:
# Print R2, MSE and RMSE for the untuned random forest on the held-out test split
eval_model(rf)

RandomForestRegressor()
---------------------------
R2: 0.7176073916075209
MSE: 7.567397687698302
RMSE: 2.7508903445427086


### Model 2 Random Forest w/ GridSearch

In [12]:
# Hyperparameter grid for the first random forest search
params_rf = dict(
    n_estimators=[100, 120, 150],
    max_depth=[None, 10, 50, 80],
    min_samples_leaf=[1, 3, 4, 5],
)

In [13]:
# Set up a 3-fold grid search over params_rf using the random forest as the estimator
gs_rf = GridSearchCV(estimator=rf, param_grid=params_rf, cv=3)

In [14]:
# Run the grid search on the training split
gs_rf.fit(X_train, y_train)

# Show the best cross-validated score, then the parameter combination that achieved it
best_score_rf, best_params_rf = gs_rf.best_score_, gs_rf.best_params_
print(best_score_rf)
best_params_rf

0.6722218579114441


{'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 100}

In [15]:
# Print R2, MSE and RMSE for the first grid-searched random forest on the held-out test split
eval_model(gs_rf)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [None, 10, 50, 80],
                         'min_samples_leaf': [1, 3, 4, 5],
                         'n_estimators': [100, 120, 150]})
---------------------------
R2: 0.7172789359512087
MSE: 7.576199456938073
RMSE: 2.7524896833481636


### Model 3 Random Forest w/ GridSearch (expanded grid incl. `min_samples_split`)

In [16]:
# Second random forest grid: different max_depth candidates plus min_samples_split
params_rf2 = dict(
    n_estimators=[100, 120, 150],
    max_depth=[None, 5, 10, 25, 50],
    min_samples_leaf=[1, 3, 4, 5],
    min_samples_split=[2, 5, 10],
)

In [17]:
# Set up a 3-fold grid search over the expanded params_rf2 grid
gs_rf2 = GridSearchCV(estimator=rf, param_grid=params_rf2, cv=3)

In [18]:
# Run the expanded grid search on the training split
gs_rf2.fit(X_train, y_train)

# Show the best cross-validated score, then the parameter combination that achieved it
best_score_rf2, best_params_rf2 = gs_rf2.best_score_, gs_rf2.best_params_
print(best_score_rf2)
best_params_rf2

0.6736165762009092


{'max_depth': 50,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

In [19]:
# Print R2, MSE and RMSE for the second grid-searched random forest on the held-out test split
eval_model(gs_rf2)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [None, 5, 10, 25, 50],
                         'min_samples_leaf': [1, 3, 4, 5],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 120, 150]})
---------------------------
R2: 0.7182964995353436
MSE: 7.548931362502095
RMSE: 2.7475318674224862


### Model 4 Support Vector Regressor

In [20]:
# References consulted for Support Vector Regression:
# https://www.analyticsvidhya.com/blog/2020/03/support-vector-regression-tutorial-for-machine-learning/
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [21]:
# Support vector regressor with the RBF kernel (SVR's default), fitted on the training split
svr = SVR(kernel='rbf').fit(X_train, y_train)

# Mean score across 3 cross-validation folds of the training split
svr_cv_scores = cross_val_score(svr, X_train, y_train, cv=3)
svr_cv_scores.mean()

0.2864476441975558

In [22]:
# Print R2, MSE and RMSE for the support vector regressor on the held-out test split
eval_model(svr)

SVR()
---------------------------
R2: 0.3474930733226702
MSE: 17.48551223154698
RMSE: 4.181568154597864


### Model 5 Decision Tree

In [23]:
# Single decision tree with default hyperparameters, fitted on the training split
dt = DecisionTreeRegressor().fit(X_train, y_train)

# Mean score across 3 cross-validation folds of the training split
dt_cv_scores = cross_val_score(dt, X_train, y_train, cv=3)
dt_cv_scores.mean()

0.5501243684108242

In [24]:
# Print R2, MSE and RMSE for the untuned decision tree on the held-out test split
eval_model(dt)

DecisionTreeRegressor()
---------------------------
R2: 0.6059045175441484
MSE: 10.560748242120717
RMSE: 3.249730487612891


### Model 6 Decision Tree w/ GridSearch

In [25]:
# Hyperparameter grid for the decision tree search
params_dt = dict(
    max_depth=[9, 10, 13, 15],
    min_samples_split=[15, 18, 20],
    min_samples_leaf=[1, 3, 5],
)

In [26]:
# Set up a 3-fold grid search over params_dt using the decision tree as the estimator
gs_dt = GridSearchCV(estimator=dt, param_grid=params_dt, cv=3)

In [27]:
# Run the decision tree grid search on the training split
gs_dt.fit(X_train, y_train)

# Show the best cross-validated score, then the parameter combination that achieved it
best_score_dt, best_params_dt = gs_dt.best_score_, gs_dt.best_params_
print(best_score_dt)
best_params_dt

0.5180265016637876


{'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 15}

In [28]:
# Print R2, MSE and RMSE for the grid-searched decision tree on the held-out test split
eval_model(gs_dt)

GridSearchCV(cv=3, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [9, 10, 13, 15],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [15, 18, 20]})
---------------------------
R2: 0.5678647807427905
MSE: 11.580115632612637
RMSE: 3.402956895497302


### Model 7 Gradient Boosting

In [29]:
# Gradient boosting regressor with default hyperparameters, fitted on the training split
gb = GradientBoostingRegressor().fit(X_train, y_train)

# Mean score across 3 cross-validation folds of the training split
gb_cv_scores = cross_val_score(gb, X_train, y_train, cv=3)
gb_cv_scores.mean()

0.45664705076030304

In [30]:
# Print R2, MSE and RMSE for the gradient boosting regressor on the held-out test split
eval_model(gb)

GradientBoostingRegressor()
---------------------------
R2: 0.46876094719974315
MSE: 14.235844212283444
RMSE: 3.7730417718709988
