## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# encoding imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# evaluation imports
from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# modeling imports
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, StackingRegressor, BaggingRegressor, GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR

# imbalanced modeling
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

---

## Title: Overview? Description?

For this project, we are exploring two types of models, classification and regression. We going to see if we create a model that will predict the loss_percentage given a country while also identifying accurately which food stage is highly associated with food loss. 

The models we explored include:


---

## [EDIT]: File Path

In [5]:
np.random.seed(42)

In [6]:
df = pd.read_csv('./data/clean_data.csv')
df.head()

Unnamed: 0,Latitude,Longitude,country,commodity,year,loss_percentage,activity,food_supply_stage
0,33.0,65.0,Afghanistan,Wheat,2017,15.0,wsc,Whole supply chain
1,33.0,65.0,Afghanistan,Maize (corn),2017,14.95,wsc,Whole supply chain
2,33.0,65.0,Afghanistan,Rice,2017,7.09,wsc,Whole supply chain
3,33.0,65.0,Afghanistan,Barley,2017,14.74,wsc,Whole supply chain
4,33.0,65.0,Afghanistan,Wheat,2016,15.02,wsc,Whole supply chain


---

## Functions

### GridSearch Functions
* best_params: returns the best score and paramters for pipeline/model
* return_gs: returns the optimal paramters of a pipeline/model
* tts_scores: returns the Train and Test scores of a pipeline/model

In [7]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1, 
                      cv = 3)

    gs.fit(X_train, y_train)
    
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [8]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1, 
                      cv = 3)
    return gs

In [9]:
def tts_scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

---

### Evaluation Functions

* predictions: accepts a pipeline and returns the predictions for y
* regression_scores: returns DataFrame of data for a model and its regression metric scores (R2, MSE, and RMSE)
* classification_scores: returns DataFrame of data fro a model and its classification metric scores (recall, precision, f1, accuracy)

In [10]:
def predictions(pipeline, X_train, X_test, y_train):
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    
    return prediction

In [11]:
def regression_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['R2', 'MSE', 'RMSE'])
    
    r2 = metrics.r2_score(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    dataframe.loc[model] = [r2, mse, rmse]
    
    pd.options.display.float_format = '{:.4f}'.format 
    
    return dataframe

In [12]:
def classification_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['Recall', 'Precision', 'F1', 'Accuracy'])
    
    recall = recall_score(y_test, y_pred, average = 'micro')
    precision = precision_score(y_test, y_pred, average = 'micro')
    f1 = f1_score(y_test, y_pred, average = 'micro')
    accuracy = accuracy_score(y_test, y_pred)
    
    dataframe.loc[model] = [recall, precision, f1, accuracy]
    
    return dataframe

---

### Train-Test-Split (for Regression Modeling)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23810 entries, 0 to 23809
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Latitude           23810 non-null  float64
 1   Longitude          23810 non-null  float64
 2   country            23810 non-null  object 
 3   commodity          23810 non-null  object 
 4   year               23810 non-null  int64  
 5   loss_percentage    23810 non-null  float64
 6   activity           23810 non-null  object 
 7   food_supply_stage  23810 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 1.5+ MB


* Note: We encoded year as a dummy variable as it being an integer may have the model believe it has numerical value to the data

In [174]:
df_dummy = pd.get_dummies(df, columns = ['country',
                                         'commodity', 
                                         'year',
                                         'activity',
                                         'food_supply_stage'])

In [175]:
X = df_dummy.drop(columns = 'loss_percentage')
y = df_dummy['loss_percentage']

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

---

## Regression Models

Insert description of problem statement

* Best:
* Other:

### Baseline Accuracy

In [111]:
def bl_compare(model, pred):
    dataframe = pd.DataFrame(columns = ['Mean'])
    
    pipe_median = np.mean(pred)
    
    dataframe.loc[model] = [pipe_median]

    return dataframe

In [112]:
bl_mean = np.mean(y_train)

In [113]:
bl_pred = np.full_like(y_test, bl_mean)

In [114]:
regression_scores('Baseline', y_test, bl_pred)

Unnamed: 0,R2,MSE,RMSE
Baseline,-0.0001,26.799,5.1768


In [115]:
bl_compare('Baseline', bl_pred)

Unnamed: 0,Mean
Baseline,3.9391


### Random Forest Regressor

In [177]:
rfr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor())
])

In [178]:
rfr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'rfr__n_estimators': [100, 150, 200, 250],
    'rfr__max_depth': [None, 10,50,80],
    'rfr__min_samples_leaf': [1,3, 4, 5]
}

In [179]:
best_params(rfr_pipe, rfr_params, X_train, y_train)

"Best Score: 0.6675328082564227, Params: {'rfr__max_depth': None, 'rfr__min_samples_leaf': 1, 'rfr__n_estimators': 150, 'ss__with_mean': True, 'ss__with_std': True}"

In [180]:
rfr_gs = return_gs(rfr_pipe, rfr_params, X_train, y_train)

In [None]:
tts_scores(rfr_gs, X_train, y_train, X_test, y_test)

In [None]:
cross_val_score(rfr_pipe, X_train, y_train, cv=3).mean()

In [None]:
rfr_pred = predictions(rfr_pipe, X_train, X_test, y_train)

In [None]:
regression_scores('RandomForest', y_test, rfr_pred)

### Decision Tree Regressor

[finished, add description]
readability

In [50]:
dtr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dtr', DecisionTreeRegressor())
])

In [51]:
dtr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dtr__max_depth': [9, 10,13,15],
    'dtr__min_samples_split': [15, 18,20],
    'dtr__min_samples_leaf': [1, 3, 5]
}

In [52]:
best_params(dtr_pipe, dtr_params, X_train, y_train)

"Best Score: 0.5147373797899001, Params: {'dtr__max_depth': 15, 'dtr__min_samples_leaf': 1, 'dtr__min_samples_split': 15, 'ss__with_mean': True, 'ss__with_std': False}"

In [53]:
dtr_gs = return_gs(dtr_pipe, dtr_params, X_train, y_train)

In [54]:
tts_scores(dtr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.6928624378266306, Test Score: 0.5704785376327788'

In [55]:
cross_val_score(dtr_pipe, X_train, y_train, cv=3).mean()

0.5511820606894577

In [56]:
dtr_pred = predictions(dtr_pipe, X_train, X_test, y_train)

In [57]:
regression_scores('DecisionTree', y_test, dtr_pred)

Unnamed: 0,R2,MSE,RMSE
DecisionTree,0.6063,10.5502,3.2481


In [116]:
bl_compare('DecisionTree', dtr_pred)

Unnamed: 0,Mean
DecisionTree,3.8507


### Stacking (Regression)

[finished, add description]

In [61]:
level1_estimators =[
    ('rfr_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('rfr', RandomForestRegressor())
    ])), 
    ('dt_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('dt', DecisionTreeRegressor())
    ])),  
    ('bag_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('bag', BaggingRegressor())
    ])),       
]

In [62]:
stacked_model = StackingRegressor(estimators = level1_estimators,
                                 final_estimator = LinearRegression())

In [63]:
cross_val_score(stacked_model, X_train, y_train).mean()

0.6619324829246788

In [64]:
stacked_model.fit(X_train, y_train)

In [65]:
stacked_model.score(X_train, y_train), stacked_model.score(X_test, y_test)

(0.9215573741286196, 0.7153878067192359)

In [66]:
stacked_pred = predictions(stacked_model, X_train, X_test, y_train)

In [67]:
regression_scores('Stacking', y_test, stacked_pred)

Unnamed: 0,R2,MSE,RMSE
Stacking,0.7224,7.4384,2.7273


In [117]:
bl_compare('Stacking', stacked_pred)

Unnamed: 0,Mean
Stacking,3.9683


### Gradient Boosting

[finished, add description, decent model]

In [118]:
gb_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('gb', GradientBoostingRegressor())
])

In [119]:
gb_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
}

In [120]:
best_params(dtr_pipe, dtr_params, X_train, y_train)

"Best Score: 0.5135995306416755, Params: {'dtr__max_depth': 15, 'dtr__min_samples_leaf': 1, 'dtr__min_samples_split': 15, 'ss__with_mean': True, 'ss__with_std': False}"

In [121]:
dtr_gs = return_gs(dtr_pipe, dtr_params, X_train, y_train)

In [122]:
tts_scores(dtr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.6928624378266306, Test Score: 0.5948264402413468'

In [123]:
cross_val_score(gb_pipe, X_train, y_train, cv=3).mean()

0.4617622234122014

In [124]:
gb_pred = predictions(gb_pipe, X_train, X_test, y_train)

In [125]:
regression_scores('GradientBoost', y_test, gb_pred)

Unnamed: 0,R2,MSE,RMSE
GradientBoost,0.4874,13.7363,3.7063


In [126]:
bl_compare('GradientBoost', gb_pred)

Unnamed: 0,Mean
GradientBoost,3.8992


### Support Vector Regressor
[finished, add description, bad model so we will not look into it further]

In [127]:
svr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('svr', SVR(kernel = 'rbf'))
])

In [128]:
cross_val_score(svr_pipe, X_train, y_train, cv=3).mean()

0.2819942304995868

## Regression Model DataFrame

In [129]:
# Stacking
reg_table = regression_scores('Stacking', y_test, stacked_pred)

In [130]:
# Decision Tree
reg_table = pd.concat([reg_table, regression_scores('DecisionTree', y_test, dtr_pred)])

In [131]:
# Gradient Boosting
reg_table = pd.concat([reg_table, regression_scores('GradientBoost', y_test, gb_pred)])

In [226]:
# Random Forest
reg_table = pd.concat([reg_table, regression_scores('RandomForest', y_test, rfr_pred)])

In [132]:
reg_table

Unnamed: 0,R2,MSE,RMSE
Stacking,0.7224,7.4384,2.7273
DecisionTree,0.6063,10.5502,3.2481
GradientBoost,0.4874,13.7363,3.7063


---

## Classification Models

Within this dataset, producers documented their highest loss percentage of food and its corresponding food supply stage the loss occurred. There are approximately 17 stages identified and utilized to build a model that will accurately classify the stage for a datapoint's highest loss percentage. However, the classes are incredibly imbalanced; according to the baseline accuracy, 'Farm' appears in ~47% of the the data while the remaining 16 stages exist in less than 20% each. This aspect along with the large number of classes to classify may skew the classification model.

We evaluated multiple classification models: Random Forest Classifer, Decision Tree Classifier, and x, and assessed their accuracy and F1 scores. To encode the categorical variable (food_supply_stage), we explored using a Label Encoder and dummy variables. The Label Encoder assigned a numerical value to each category (e.g. Farm may be instantiated to 0, etc.) even though they do not have an apparent order or rank. This enables the model to process data that requires numerical inputs. Dummy variables creates binary columns for the categories which ensure indepedence from one another. Both methods resulted in similar scores and we made the conclusion that the encoding choices did not have a huge influence. 

### Baseline Accuracy

Below you will find the baseline accuracy of all 17 stages. Since the dataset is imbalanced, we will consider the F1 scores (and if needed, precision and recall) as the accuracy may not be fully representative. 

In [158]:
df['food_supply_stage'].value_counts(normalize = True)

Farm                 0.4701
Whole supply chain   0.1799
Harvest              0.1405
Storage              0.0989
Transport            0.0719
Wholesale            0.0096
Processing           0.0092
Retail               0.0089
Trader               0.0031
Households           0.0021
Export               0.0021
Post-harvest         0.0018
Food Services        0.0006
Pre-harvest          0.0004
Distribution         0.0004
Market               0.0004
Stacking             0.0000
Name: food_supply_stage, dtype: float64

### Random Forest Classifier

The Random Forest Classifier will generate decision trees based on randomly-selected columns and average the predictions. These actions will mitigate the impact of imbalanced classes and outliers through its likelihood in choosing data from minority classes. On the other hand, this model could also select low instances. One challenge that we could encounter is the time requirement to run this model as its making multiple predictions over a large dataset (> 20k).

In [310]:
# if there are missing values, use this for numerical columns by using the mean strategy
#for column in df.select_dtypes(include=[np.number]).columns:
    #imputer = SimpleImputer(strategy='mean')
    #df[column] = imputer.fit_transform(df[[column]])

In [166]:
# Using label encoder
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

* Train-Test-Split with Label Encoder

In [167]:
X = df.drop('food_supply_stage', axis=1)
y = df['food_supply_stage']

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [173]:
# create a Random Forest Classifier and fit it to the training data
rfc = RandomForestClassifier (n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

In [170]:
tts_scores(clf, X_train, y_train, X_test, y_test)

'Train Score: 0.998687526249475, Test Score: 0.9628307433851323'

In [171]:
rfc_pred = predictions(clf, X_train, X_test, y_train)

In [172]:
classification_scores('RandomForestClassifier', y_test, rfc_pred)

Unnamed: 0,Recall,Precision,F1,Accuracy
RandomForestClassifier,0.9628,0.9628,0.9628,0.9628


### Decision Tree Classifier

However, it's important to note that the decision tree classifier can get overfit quickly as it improves the model through each split. 

One difficulty is that it'll continue

In [None]:
features = pd.get_dummies(df, columns = ['Longitude', 'Latitude','country',
                              'commodity',
                              'year',
                              'activity'])

X = features.drop(columns = 'food_supply_stage')
y = pd.get_dummies(df['food_supply_stage'], columns = ['food_supply_stage'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

dt_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [146]:
features = pd.get_dummies(df, columns = ['Longitude', 'Latitude','country',
                              'commodity',
                              'year',
                              'activity'])

In [147]:
X = features.drop(columns = 'food_supply_stage')
y = pd.get_dummies(df['food_supply_stage'], columns = ['food_supply_stage'])

In [148]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [149]:
dt_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [164]:
dt_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dt__max_depth': [1, 5, 10, 25, 100],
    'dt__min_samples_leaf': [1, 3, 6, 10],
    'dt__min_samples_split': [5, 10, 15, 20]
}

In [151]:
best_params(dt_pipe, dt_params, X_train, y_train)

"Best Score: 0.9521194567270381, Params: {'dt__max_depth': 100, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 5, 'ss__with_mean': True, 'ss__with_std': False}"

In [152]:
dt_gs = return_gs(dt_pipe, dt_params, X_train, y_train)

In [153]:
tts_scores(dt_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9828078624628998, Test Score: 0.9606920880228457'

In [154]:
dt_pred = predictions(dt_pipe, X_train, X_test, y_train)

In [155]:
classification_scores('DecisionTree', y_test, dt_pred)

Unnamed: 0,Recall,Precision,F1,Accuracy
DecisionTree,0.9654,0.9662,0.9658,0.9654


In [None]:
ConfusionMatrixDisplay.from_estimator(dt_gs, X_test, y_test)

In [161]:
bag_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('bag', BaggingClassifier(random_state = 42))
])

In [163]:
bag_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True]
}

In [165]:
best_params(bag_pipe, bag_params, X_train, y_train)

ValueError: 
All the 12 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 337, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 393, in _fit
    y = self._validate_y(y)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 802, in _validate_y
    y = column_or_1d(y, warn=True)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1202, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (11904, 17) instead.

--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 337, in fit
    return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 393, in _fit
    y = self._validate_y(y)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_bagging.py", line 802, in _validate_y
    y = column_or_1d(y, warn=True)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1202, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (11905, 17) instead.


In [None]:
bag_gs = return_gs(dt_pipe, dt_params, X_train, y_train)

In [None]:
tts_scores(bag_gs, X_train, y_train, X_test, y_test)

### Classification Model DataFrame

In [184]:
class_model = classification_scores('RandomForestClassifier', y_test, rfc_pred)

In [192]:
class_model = pd.concat([class_model, classification_scores('DecisionTree', y_test, dt_pred)])

In [193]:
class_model

Unnamed: 0,Recall,Precision,F1,Accuracy
RandomForestClassifier,0.9621,0.9621,0.9621,0.9621
DecisionTree,0.9634,0.9634,0.9634,0.9634
