## Imports

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, StackingRegressor, BaggingRegressor, GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

from sklearn.svm import SVR

---

## Title: Overview? Description?

For this project, we are exploring two types of models, classification and regression. We going to see if we create a model that will predict the loss_percentage given a country while also identifying accurately which food stage is highly associated with food loss. 

The models we explored include:


---

## [EDIT]: File Path

In [50]:
np.random.seed(42)

In [51]:
df = pd.read_csv('merged_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Latitude,Longitude,country,commodity,year,loss_percentage,activity,food_supply_stage
0,0,33.0,65.0,Afghanistan,Wheat,2017,15.0,Missing,Whole supply chain
1,1,33.0,65.0,Afghanistan,Maize (corn),2017,14.95,Missing,Whole supply chain
2,2,33.0,65.0,Afghanistan,Rice,2017,7.09,Missing,Whole supply chain
3,3,33.0,65.0,Afghanistan,Barley,2017,14.74,Missing,Whole supply chain
4,4,33.0,65.0,Afghanistan,Wheat,2016,15.02,Missing,Whole supply chain


In [52]:
df.drop(columns = 'Unnamed: 0', inplace = True)

In [53]:
df.head()

Unnamed: 0,Latitude,Longitude,country,commodity,year,loss_percentage,activity,food_supply_stage
0,33.0,65.0,Afghanistan,Wheat,2017,15.0,Missing,Whole supply chain
1,33.0,65.0,Afghanistan,Maize (corn),2017,14.95,Missing,Whole supply chain
2,33.0,65.0,Afghanistan,Rice,2017,7.09,Missing,Whole supply chain
3,33.0,65.0,Afghanistan,Barley,2017,14.74,Missing,Whole supply chain
4,33.0,65.0,Afghanistan,Wheat,2016,15.02,Missing,Whole supply chain


---

## Functions

### GridSearch Functions
* best_params: returns the best score and paramters for pipeline/model
* return_gs: returns the optimal paramters of a pipeline/model
* tts_scores: returns the Train and Test scores of a pipeline/model

In [91]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1, 
                      cv = 3)

    gs.fit(X_train, y_train)
    
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [92]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1, 
                      cv = 3)
    return gs

In [56]:
def tts_scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

---

### Evaluation Functions

* predictions: accepts a pipeline and returns the predictions for y
* regression_scores: returns DataFrame of data for a model and its regression metric scores (R2, MSE, and RMSE)
* classification_scores: returns DataFrame of data fro a model and its classification metric scores (recall, precision, f1, accuracy)

In [60]:
def predictions(pipeline, X_train, X_test, y_train):
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    
    return prediction

In [61]:
def regression_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['R2', 'MSE', 'RMSE'])
    
    r2 = metrics.r2_score(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    dataframe.loc[model] = [r2, mse, rmse]
    
    pd.options.display.float_format = '{:.4f}'.format 
    
    return dataframe

In [62]:
def classification_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['Recall', 'Precision', 'F1', 'Accuracy'])
    
    recall = recall_score(y_test, y_pred, average = 'micro')
    precision = precision_score(y_test, y_pred, average = 'micro')
    f1 = f1_score(y_test, y_pred, average = 'micro')
    accuracy = accuracy_score(y_test, y_pred)
    
    dataframe.loc[model] = [recall, precision, f1, accuracy]
    
    return dataframe

---

### Train-Test-Split (for Regression Modeling)

In [217]:
df_dummy = pd.get_dummies(df, columns = ['country',
                             'commodity',
                             'year',
                             'activity',
                             'food_supply_stage'])

In [218]:
X = df_dummy.drop(columns = 'loss_percentage')
y = df_dummy['loss_percentage']

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

---

## Regression Models

Insert description of problem statement

* Best:
* Other:

### Random Forest Regressor

In [197]:
rfr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor())
])

In [198]:
rfr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'rfr__n_estimators': [100, 150, 200, 250],
    'rfr__max_depth': [None, 10,50,80],
    'rfr__min_samples_leaf': [1,3, 4, 5]
}

In [220]:
best_params(rfr_pipe, rfr_params, X_train, y_train)

"Best Score: 0.6653859451377918, Params: {'rfr__max_depth': 80, 'rfr__min_samples_leaf': 1, 'rfr__n_estimators': 250, 'ss__with_mean': True, 'ss__with_std': False}"

In [221]:
rfr_gs = return_gs(rfr_pipe, rfr_params, X_train, y_train)

In [222]:
tts_scores(rfr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.960325912301814, Test Score: 0.6817730095143225'

In [223]:
cross_val_score(rfr_pipe, X_train, y_train, cv=3).mean()

0.6579587506001923

In [224]:
rfr_pred = predictions(rfr_pipe, X_train, X_test, y_train)

In [225]:
regression_scores('RandomForest', y_test, rfr_pred)

Unnamed: 0,R2,MSE,RMSE
RandomForest,0.6893,8.8347,2.9723


### Decision Tree Regressor

[finished, add description]

In [71]:
dtr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dtr', DecisionTreeRegressor())
])

In [77]:
dtr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dtr__max_depth': [9, 10,13,15],
    'dtr__min_samples_split': [15, 18,20],
    'dtr__min_samples_leaf': [1, 3, 5]
}

In [78]:
best_params(dtr_pipe, dtr_params, X_train, y_train)

"Best Score: 0.5780335094098881, Params: {'dtr__max_depth': 15, 'dtr__min_samples_leaf': 1, 'dtr__min_samples_split': 15, 'ss__with_mean': False, 'ss__with_std': True}"

In [85]:
dtr_gs = return_gs(dtr_pipe, dtr_params, X_train, y_train)

In [86]:
tts_scores(dtr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.7055254156441129, Test Score: 0.5521184992017156'

In [93]:
cross_val_score(dtr_pipe, X_train, y_train, cv=3).mean()

0.5333742126823715

In [89]:
dtr_pred = predictions(dtr_pipe, X_train, X_test, y_train)

In [90]:
regression_scores('DecisionTree', y_test, dtr_pred)

Unnamed: 0,R2,MSE,RMSE
DecisionTree,0.5844,11.8152,3.4373


### Stacking (Regression)

[finished, add description]

In [104]:
level1_estimators =[
    ('rfr_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('rfr', RandomForestRegressor())
    ])), 
    ('dt_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('dt', DecisionTreeRegressor())
    ])),  
    ('bag_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('bag', BaggingRegressor())
    ])),       
]

In [105]:
stacked_model = StackingRegressor(estimators = level1_estimators,
                                 final_estimator = LinearRegression())

In [106]:
cross_val_score(stacked_model, X_train, y_train).mean()

0.7007155748197995

In [107]:
stacked_model.fit(X_train, y_train)

In [108]:
stacked_model.score(X_train, y_train), stacked_model.score(X_test, y_test)

(0.9301829317367171, 0.6852239075332078)

In [109]:
stacked_pred = predictions(stacked_model, X_train, X_test, y_train)

In [203]:
regression_scores('Stacking', y_test, stacked_pred)

Unnamed: 0,R2,MSE,RMSE
Stacking,0.6903,8.8057,2.9674


### Gradient Boosting

[finished, add description, decent model]

In [113]:
gb_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('gb', GradientBoostingRegressor())
])

In [114]:
gb_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
}

In [116]:
best_params(dtr_pipe, dtr_params, X_train, y_train)

"Best Score: 0.5325108305294306, Params: {'dtr__max_depth': 15, 'dtr__min_samples_leaf': 3, 'dtr__min_samples_split': 18, 'ss__with_mean': True, 'ss__with_std': False}"

In [117]:
dtr_gs = return_gs(dtr_pipe, dtr_params, X_train, y_train)

In [118]:
tts_scores(dtr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.7055254156441129, Test Score: 0.5517650233874261'

In [115]:
cross_val_score(gb_pipe, X_train, y_train, cv=3).mean()

0.4844617544504683

In [119]:
gb_pred = predictions(gb_pipe, X_train, X_test, y_train)

In [204]:
regression_scores('GradientBoost', y_test, gb_pred)

Unnamed: 0,R2,MSE,RMSE
GradientBoost,0.4648,15.2176,3.901


### Support Vector Regressor
[finished, add description, bad model so we will not look into it further]

In [101]:
svr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('svr', SVR(kernel = 'rbf'))
])

In [102]:
cross_val_score(svr_pipe, X_train, y_train, cv=3).mean()

0.28358629460901513

## Regression Model DataFrame

In [173]:
# Stacking
reg_table = regression_scores('Stacking', y_test, stacked_pred)

In [174]:
# Decision Tree
reg_table = pd.concat([reg_table, regression_scores('DecisionTree', y_test, dtr_pred)])

In [175]:
# Gradient Boosting
reg_table = pd.concat([reg_table, regression_scores('GradientBoost', y_test, gb_pred)])

In [226]:
# Random Forest
reg_table = pd.concat([reg_table, regression_scores('RandomForest', y_test, rfr_pred)])

In [227]:
reg_table

Unnamed: 0,R2,MSE,RMSE
Stacking,0.6903,8.8057,2.9674
DecisionTree,0.5844,11.8152,3.4373
GradientBoost,0.4648,15.2176,3.901
RandomForest,0.6893,8.8347,2.9723


---

## Classification Models

[Insert description of problem statement]

### Random Forest Classifier

[almost, need to clarify tomorrow which data to use (it's a good score either way)]

In [144]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [205]:
df = pd.read_csv('./merged_df.csv')

In [206]:
for column in df.select_dtypes(include=['object']).columns:
    imputer = SimpleImputer(strategy='most_frequent')
    df[column] = imputer.fit_transform(df[[column]])

In [207]:
# For numerical columns use mean strategy
for column in df.select_dtypes(include=[np.number]).columns:
    imputer = SimpleImputer(strategy='mean')
    df[column] = imputer.fit_transform(df[[column]])

In [208]:
# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [209]:
# Split into features and target variable
X = df.drop('food_supply_stage', axis=1)
y = df['food_supply_stage']

In [210]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [211]:
# Create a Random Forest Classifier and fit it to the training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [152]:
tts_scores(clf, X_train, y_train, X_test, y_test)

'Train Score: 1.0, Test Score: 0.9620624607000628'

In [154]:
rfc_pred = predictions(clf, X_train, X_test, y_train)

In [212]:
classification_scores('RandomForestClassifier', y_test, rfc_pred)

Unnamed: 0,Recall,Precision,F1,Accuracy
RandomForestClassifier,0.9621,0.9621,0.9621,0.9621


### Decision Tree Classifier
[almost, need clarification]

In [213]:
features = pd.get_dummies(df, columns = ['Longitude', 'Latitude','country',
                              'commodity',
                              'year',
                              'activity'])

In [214]:
X = features.drop(columns = 'food_supply_stage')
y = pd.get_dummies(df['food_supply_stage'], columns = ['food_supply_stage'])

In [215]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [188]:
dt_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [189]:
dt_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dt__max_depth': [1, 5, 10, 25, 100],
    'dt__min_samples_leaf': [1, 3, 6, 10],
    'dt__min_samples_split': [5, 10, 15, 20]
}

In [190]:
best_params(dt_pipe, dt_params, X_train, y_train)

"Best Score: 0.950807758958019, Params: {'dt__max_depth': 100, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 5, 'ss__with_mean': True, 'ss__with_std': False}"

In [163]:
dt_gs = return_gs(dt_pipe, dt_params, X_train, y_train)

In [164]:
tts_scores(dt_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9845156241265582, Test Score: 0.9587525150905433'

In [165]:
dt_pred = predictions(dt_pipe, X_train, X_test, y_train)

In [216]:
classification_scores('DecisionTree', y_test, dt_pred)

Unnamed: 0,Recall,Precision,F1,Accuracy
DecisionTree,0.9634,0.9634,0.9634,0.9634


### Classification Model DataFrame

In [184]:
class_model = classification_scores('RandomForestClassifier', y_test, rfc_pred)

In [192]:
class_model = pd.concat([class_model, classification_scores('DecisionTree', y_test, dt_pred)])

In [193]:
class_model

Unnamed: 0,Recall,Precision,F1,Accuracy
RandomForestClassifier,0.9621,0.9621,0.9621,0.9621
DecisionTree,0.9634,0.9634,0.9634,0.9634
