## Imports

In [369]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# encoding imports
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# evaluation imports
from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# modeling imports
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, StackingRegressor, BaggingRegressor, GradientBoostingRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR

# imbalanced modeling
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

---

## Title: Overview? Description?

For this project, we are exploring two types of models, classification and regression. We going to see if we create a model that will predict the loss_percentage given a country while also identifying accurately which food stage is highly associated with food loss. 

The models we explored include:


---

## [EDIT]: File Path

In [343]:
np.random.seed(42)

In [345]:
df = pd.read_csv('./data/clean_data.csv')
df.head()

Unnamed: 0,Latitude,Longitude,country,commodity,year,loss_percentage,activity,food_supply_stage
0,33.0,65.0,Afghanistan,Wheat,2017,15.0,wsc,Whole supply chain
1,33.0,65.0,Afghanistan,Maize (corn),2017,14.95,wsc,Whole supply chain
2,33.0,65.0,Afghanistan,Rice,2017,7.09,wsc,Whole supply chain
3,33.0,65.0,Afghanistan,Barley,2017,14.74,wsc,Whole supply chain
4,33.0,65.0,Afghanistan,Wheat,2016,15.02,wsc,Whole supply chain


In [51]:
df = pd.read_csv('merged_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Latitude,Longitude,country,commodity,year,loss_percentage,activity,food_supply_stage
0,0,33.0,65.0,Afghanistan,Wheat,2017,15.0,Missing,Whole supply chain
1,1,33.0,65.0,Afghanistan,Maize (corn),2017,14.95,Missing,Whole supply chain
2,2,33.0,65.0,Afghanistan,Rice,2017,7.09,Missing,Whole supply chain
3,3,33.0,65.0,Afghanistan,Barley,2017,14.74,Missing,Whole supply chain
4,4,33.0,65.0,Afghanistan,Wheat,2016,15.02,Missing,Whole supply chain


In [52]:
df.drop(columns = 'Unnamed: 0', inplace = True)

---

## Functions

### GridSearch Functions
* best_params: returns the best score and paramters for pipeline/model
* return_gs: returns the optimal paramters of a pipeline/model
* tts_scores: returns the Train and Test scores of a pipeline/model

In [350]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1, 
                      cv = 3)

    gs.fit(X_train, y_train)
    
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [351]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1, 
                      cv = 3)
    return gs

In [352]:
def tts_scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

---

### Evaluation Functions

* predictions: accepts a pipeline and returns the predictions for y
* regression_scores: returns DataFrame of data for a model and its regression metric scores (R2, MSE, and RMSE)
* classification_scores: returns DataFrame of data fro a model and its classification metric scores (recall, precision, f1, accuracy)

In [353]:
def predictions(pipeline, X_train, X_test, y_train):
    pipeline.fit(X_train, y_train)
    prediction = pipeline.predict(X_test)
    
    return prediction

In [354]:
def regression_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['R2', 'MSE', 'RMSE'])
    
    r2 = metrics.r2_score(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    dataframe.loc[model] = [r2, mse, rmse]
    
    pd.options.display.float_format = '{:.4f}'.format 
    
    return dataframe

In [355]:
def classification_scores(model, y_test, y_pred):
    dataframe = pd.DataFrame(columns = ['Recall', 'Precision', 'F1', 'Accuracy'])
    
    recall = recall_score(y_test, y_pred, average = 'micro')
    precision = precision_score(y_test, y_pred, average = 'micro')
    f1 = f1_score(y_test, y_pred, average = 'micro')
    accuracy = accuracy_score(y_test, y_pred)
    
    dataframe.loc[model] = [recall, precision, f1, accuracy]
    
    return dataframe

---

### Train-Test-Split (for Regression Modeling)

In [367]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23810 entries, 0 to 23809
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Latitude           23810 non-null  float64
 1   Longitude          23810 non-null  float64
 2   country            23810 non-null  object 
 3   commodity          23810 non-null  object 
 4   year               23810 non-null  int64  
 5   loss_percentage    23810 non-null  float64
 6   activity           23810 non-null  object 
 7   food_supply_stage  23810 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 1.5+ MB


In [356]:
df_dummy = pd.get_dummies(df, columns = ['country',
                             'commodity',
                             'activity',
                             'food_supply_stage'])

In [357]:
X = df_dummy.drop(columns = 'loss_percentage')
y = df_dummy['loss_percentage']

In [358]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

---

## Regression Models

Insert description of problem statement

* Best:
* Other:

### Random Forest Regressor

In [197]:
rfr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor())
])

In [198]:
rfr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'rfr__n_estimators': [100, 150, 200, 250],
    'rfr__max_depth': [None, 10,50,80],
    'rfr__min_samples_leaf': [1,3, 4, 5]
}

In [220]:
best_params(rfr_pipe, rfr_params, X_train, y_train)

"Best Score: 0.6653859451377918, Params: {'rfr__max_depth': 80, 'rfr__min_samples_leaf': 1, 'rfr__n_estimators': 250, 'ss__with_mean': True, 'ss__with_std': False}"

In [221]:
rfr_gs = return_gs(rfr_pipe, rfr_params, X_train, y_train)

In [222]:
tts_scores(rfr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.960325912301814, Test Score: 0.6817730095143225'

In [223]:
cross_val_score(rfr_pipe, X_train, y_train, cv=3).mean()

0.6579587506001923

In [224]:
rfr_pred = predictions(rfr_pipe, X_train, X_test, y_train)

In [225]:
regression_scores('RandomForest', y_test, rfr_pred)

Unnamed: 0,R2,MSE,RMSE
RandomForest,0.6893,8.8347,2.9723


### Decision Tree Regressor

[finished, add description]

In [359]:
dtr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dtr', DecisionTreeRegressor())
])

In [360]:
dtr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dtr__max_depth': [9, 10,13,15],
    'dtr__min_samples_split': [15, 18,20],
    'dtr__min_samples_leaf': [1, 3, 5]
}

In [361]:
best_params(dtr_pipe, dtr_params, X_train, y_train)

"Best Score: 0.5210192606247354, Params: {'dtr__max_depth': 15, 'dtr__min_samples_leaf': 1, 'dtr__min_samples_split': 15, 'ss__with_mean': False, 'ss__with_std': False}"

In [362]:
dtr_gs = return_gs(dtr_pipe, dtr_params, X_train, y_train)

In [363]:
tts_scores(dtr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.6832856029246446, Test Score: 0.5919737260183989'

In [364]:
cross_val_score(dtr_pipe, X_train, y_train, cv=3).mean()

0.5558115070181191

In [365]:
dtr_pred = predictions(dtr_pipe, X_train, X_test, y_train)

In [366]:
regression_scores('DecisionTree', y_test, dtr_pred)

Unnamed: 0,R2,MSE,RMSE
DecisionTree,0.6305,9.9015,3.1467


### Stacking (Regression)

[finished, add description]

In [326]:
level1_estimators =[
    ('rfr_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('rfr', RandomForestRegressor())
    ])), 
    ('dt_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('dt', DecisionTreeRegressor())
    ])),  
    ('bag_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('bag', BaggingRegressor())
    ])),       
]

In [327]:
stacked_model = StackingRegressor(estimators = level1_estimators,
                                 final_estimator = LinearRegression())

In [328]:
cross_val_score(stacked_model, X_train, y_train).mean()

0.943188658920737

In [329]:
stacked_model.fit(X_train, y_train)

In [330]:
stacked_model.score(X_train, y_train), stacked_model.score(X_test, y_test)

(0.9914917126686474, 0.947005909124241)

In [331]:
stacked_pred = predictions(stacked_model, X_train, X_test, y_train)

In [332]:
regression_scores('Stacking', y_test, stacked_pred)

Unnamed: 0,R2,MSE,RMSE
Stacking,0.9475,1.6398,1.2806


### Gradient Boosting

[finished, add description, decent model]

In [318]:
gb_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('gb', GradientBoostingRegressor())
])

In [319]:
gb_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
}

In [320]:
best_params(dtr_pipe, dtr_params, X_train, y_train)

"Best Score: 0.9183757025982451, Params: {'dtr__max_depth': 15, 'dtr__min_samples_leaf': 5, 'dtr__min_samples_split': 18, 'ss__with_mean': False, 'ss__with_std': True}"

In [321]:
dtr_gs = return_gs(dtr_pipe, dtr_params, X_train, y_train)

In [322]:
tts_scores(dtr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9567698342788437, Test Score: 0.9313249119379815'

In [323]:
cross_val_score(gb_pipe, X_train, y_train, cv=3).mean()

0.8569877345236246

In [324]:
gb_pred = predictions(gb_pipe, X_train, X_test, y_train)

In [325]:
regression_scores('GradientBoost', y_test, gb_pred)

Unnamed: 0,R2,MSE,RMSE
GradientBoost,0.8667,4.1634,2.0404


### Support Vector Regressor
[finished, add description, bad model so we will not look into it further]

In [304]:
svr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('svr', SVR(kernel = 'rbf'))
])

In [305]:
cross_val_score(svr_pipe, X_train, y_train, cv=3).mean()

0.2819942304995868

## Regression Model DataFrame

In [173]:
# Stacking
reg_table = regression_scores('Stacking', y_test, stacked_pred)

In [174]:
# Decision Tree
reg_table = pd.concat([reg_table, regression_scores('DecisionTree', y_test, dtr_pred)])

In [175]:
# Gradient Boosting
reg_table = pd.concat([reg_table, regression_scores('GradientBoost', y_test, gb_pred)])

In [226]:
# Random Forest
reg_table = pd.concat([reg_table, regression_scores('RandomForest', y_test, rfr_pred)])

In [227]:
reg_table

Unnamed: 0,R2,MSE,RMSE
Stacking,0.6903,8.8057,2.9674
DecisionTree,0.5844,11.8152,3.4373
GradientBoost,0.4648,15.2176,3.901
RandomForest,0.6893,8.8347,2.9723


---

## Classification Models

[Insert description of problem statement]

### Random Forest Classifier

[almost, need to clarify tomorrow which data to use (it's a good score either way)]

In [308]:
df = pd.read_csv('./data/clean_data.csv')

In [309]:
for column in df.select_dtypes(include=['object']).columns:
    imputer = SimpleImputer(strategy='most_frequent')
    df[column] = imputer.fit_transform(df[[column]])

In [310]:
# For numerical columns use mean strategy
for column in df.select_dtypes(include=[np.number]).columns:
    imputer = SimpleImputer(strategy='mean')
    df[column] = imputer.fit_transform(df[[column]])

In [311]:
# Encode categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [312]:
# Split into features and target variable
X = df.drop('food_supply_stage', axis=1)
y = df['food_supply_stage']

In [313]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [314]:
# Create a Random Forest Classifier and fit it to the training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [315]:
tts_scores(clf, X_train, y_train, X_test, y_test)

'Train Score: 0.998687526249475, Test Score: 0.9628307433851323'

In [316]:
rfc_pred = predictions(clf, X_train, X_test, y_train)

In [317]:
classification_scores('RandomForestClassifier', y_test, rfc_pred)

Unnamed: 0,Recall,Precision,F1,Accuracy
RandomForestClassifier,0.9628,0.9628,0.9628,0.9628


### Decision Tree Classifier
[almost, need clarification]

In [None]:
features = pd.get_dummies(df, columns = ['Longitude', 'Latitude','country',
                              'commodity',
                              'year',
                              'activity'])

X = features.drop(columns = 'food_supply_stage')
y = pd.get_dummies(df['food_supply_stage'], columns = ['food_supply_stage'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

dt_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [234]:
features = pd.get_dummies(df, columns = ['Longitude', 'Latitude','country',
                              'commodity',
                              'year',
                              'activity'])

In [235]:
X = features.drop(columns = 'food_supply_stage')
y = pd.get_dummies(df['food_supply_stage'], columns = ['food_supply_stage'])

In [236]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [237]:
dt_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [238]:
dt_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dt__max_depth': [1, 5, 10, 25, 100],
    'dt__min_samples_leaf': [1, 3, 6, 10],
    'dt__min_samples_split': [5, 10, 15, 20]
}

In [239]:
best_params(dt_pipe, dt_params, X_train, y_train)

"Best Score: 0.9510872603275756, Params: {'dt__max_depth': 100, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 5, 'ss__with_mean': True, 'ss__with_std': True}"

In [240]:
dt_gs = return_gs(dt_pipe, dt_params, X_train, y_train)

In [241]:
tts_scores(dt_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9845715244004696, Test Score: 0.9592555331991952'

In [242]:
dt_pred = predictions(dt_pipe, X_train, X_test, y_train)

In [243]:
classification_scores('DecisionTree', y_test, dt_pred)

Unnamed: 0,Recall,Precision,F1,Accuracy
DecisionTree,0.9629,0.9629,0.9629,0.9629


### Classification Model DataFrame

In [184]:
class_model = classification_scores('RandomForestClassifier', y_test, rfc_pred)

In [192]:
class_model = pd.concat([class_model, classification_scores('DecisionTree', y_test, dt_pred)])

In [193]:
class_model

Unnamed: 0,Recall,Precision,F1,Accuracy
RandomForestClassifier,0.9621,0.9621,0.9621,0.9621
DecisionTree,0.9634,0.9634,0.9634,0.9634


In [231]:
import pickle

In [264]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [280]:
print(df['food_supply_stage'].unique())

[16  2 17 10  5 13  4 15 11  6 14  8  3  1  7  9  0 12]


In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), ['numeric_feature']),
        ('categorical', OneHotEncoder(), ['categorical_feature'])
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', clf)])

# Fit the pipeline to your data
pipeline.fit(X, y)

In [274]:
column_name = 'food_supply_stage'
if column_name in df.columns:
    print(f"Column '{column_name}' exists in the DataFrame.")
else:
    print(f"Column '{column_name}' does not exist in the DataFrame.")

Column 'food_supply_stage' exists in the DataFrame.


In [290]:
dt_pipe.fit(X_train, y_train)

In [286]:
y_train_array = y_train.values

In [291]:
with open('dt_pipe.pkl', 'wb') as f:
    pickle.dump(dt_pipe, f)

In [288]:
X_test_array = X_test.values

In [289]:
np.save('X_test_array.npy', X_test_array)