## Imports

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [36]:
food_waste = pd.read_csv('./data/food_waste.csv')

In [37]:
food_waste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27773 entries, 0 to 27772
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   m49_code                  27773 non-null  int64  
 1   country                   27773 non-null  object 
 2   region                    871 non-null    object 
 3   cpc_code                  27773 non-null  object 
 4   commodity                 27773 non-null  object 
 5   year                      27773 non-null  int64  
 6   loss_percentage           27773 non-null  float64
 7   loss_percentage_original  27773 non-null  object 
 8   loss_quantity             4552 non-null   object 
 9   activity                  20873 non-null  object 
 10  food_supply_stage         27724 non-null  object 
 11  treatment                 881 non-null    object 
 12  cause_of_loss             925 non-null    object 
 13  sample_size               1176 non-null   object 
 14  method

In [38]:
countries = pd.read_csv('https://raw.githubusercontent.com/albertyw/avenews/master/old/data/average-latitude-longitude-countries.csv')

In [39]:
countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ISO 3166 Country Code  239 non-null    object 
 1   Country                240 non-null    object 
 2   Latitude               240 non-null    float64
 3   Longitude              240 non-null    float64
dtypes: float64(2), object(2)
memory usage: 7.6+ KB


## Cleaning Data

In [40]:
# iterate through columns that have more than 1000 null variables
for col in food_waste.columns:
    if food_waste[col].isnull().sum() > 10000:
        print(f'{col}: {food_waste[col].isnull().sum()}')

region: 26902
loss_quantity: 23221
treatment: 26892
cause_of_loss: 26848
sample_size: 26597
reference: 19337
notes: 26423


In [41]:
# dropping columns with many null variables, missing data essentially
for col in food_waste.columns:
    if food_waste[col].isnull().sum() > 10000:
        food_waste.drop(col, axis = 1, inplace = True)

In [42]:
# these columns aren't relevant for us to look at
food_waste.drop(columns = ['m49_code', 'cpc_code', 'method_data_collection', 'url'],
             axis = 1, inplace = True)

In [43]:
food_waste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27773 entries, 0 to 27772
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   27773 non-null  object 
 1   commodity                 27773 non-null  object 
 2   year                      27773 non-null  int64  
 3   loss_percentage           27773 non-null  float64
 4   loss_percentage_original  27773 non-null  object 
 5   activity                  20873 non-null  object 
 6   food_supply_stage         27724 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.5+ MB


In [44]:
food_waste.isnull().sum()

country                        0
commodity                      0
year                           0
loss_percentage                0
loss_percentage_original       0
activity                    6900
food_supply_stage             49
dtype: int64

In [45]:
food_waste['activity'].fillna('Missing', inplace = True)
food_waste['food_supply_stage'].fillna('Missing', inplace = True)

In [46]:
food_waste.isnull().sum()

country                     0
commodity                   0
year                        0
loss_percentage             0
loss_percentage_original    0
activity                    0
food_supply_stage           0
dtype: int64

In [47]:
food_waste.head()

Unnamed: 0,country,commodity,year,loss_percentage,loss_percentage_original,activity,food_supply_stage
0,Myanmar,"Groundnuts, excluding shelled",2009,5.22,5.22%,Missing,Whole supply chain
1,Myanmar,"Groundnuts, excluding shelled",2008,5.43,5.43%,Missing,Whole supply chain
2,Myanmar,"Groundnuts, excluding shelled",2007,5.61,5.61%,Missing,Whole supply chain
3,Myanmar,"Groundnuts, excluding shelled",2006,5.4,5.4%,Missing,Whole supply chain
4,Myanmar,"Groundnuts, excluding shelled",2005,5.0,5%,Missing,Whole supply chain


In [48]:
countries.drop(columns = 'ISO 3166 Country Code', inplace = True)

In [49]:
countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    240 non-null    object 
 1   Latitude   240 non-null    float64
 2   Longitude  240 non-null    float64
dtypes: float64(2), object(1)
memory usage: 5.8+ KB


In [50]:
df = pd.merge(countries, food_waste, left_on = 'Country', right_on='country', how='inner')

In [52]:
df.drop(columns = ['Country',
                   'loss_percentage_original'], inplace = True)

### Baseline Accuracy

In [53]:
df['loss_percentage'].value_counts()

2.500     2038
1.000     1308
3.500     1244
2.650     1116
2.000      754
          ... 
26.400       1
6.920        1
19.480       1
6.610        1
0.395        1
Name: loss_percentage, Length: 1681, dtype: int64

In [54]:
df['food_supply_stage'].value_counts()

Farm                  11193
Whole supply chain     4284
Harvest                3345
Storage                2354
Transport              1711
Wholesale               228
Processing              220
Retail                  213
Trader                   73
Households               51
Export                   50
Post-harvest             44
Missing                  43
Food Services            14
Distribution             10
Pre-harvest              10
Market                    9
Stacking                  1
Name: food_supply_stage, dtype: int64

### EDA

In [55]:
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

### Functions for GridSearch

In [56]:
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

In [57]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)

    gs.fit(X_train, y_train)
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [58]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)
    return gs

In [59]:
def scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23853 entries, 0 to 23852
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Latitude           23853 non-null  float64
 1   Longitude          23853 non-null  float64
 2   country            23853 non-null  object 
 3   commodity          23853 non-null  object 
 4   year               23853 non-null  int64  
 5   loss_percentage    23853 non-null  float64
 6   activity           23853 non-null  object 
 7   food_supply_stage  23853 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 1.6+ MB


In [108]:
random.seed(42)

In [109]:
df_dummy = pd.get_dummies(df, columns = ['country',
                             'commodity',
                             'year',
                             'activity',
                             'food_supply_stage'])

In [110]:
X = df_dummy.drop(columns = 'loss_percentage')
y = df_dummy['loss_percentage']

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Linear Regression
Regression problem with continous variables

In [141]:
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LinearRegression())
])

In [142]:
lr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'lr__fit_intercept': [False, True],
    'lr__n_jobs': [25, 50, 100, 250],
    'lr__positive': [False, True]
}

In [143]:
best_params(lr_pipe, lr_params, X_train, y_train)

KeyboardInterrupt: 

In [None]:
lr_gs = return_gs(lr_pipe, lr_params, X_train, y_train)

In [None]:
scores(lr_gs, X_train, y_train, X_test, y_test)

### Random Forest

In [55]:
rfr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor())
])

In [56]:
rfr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'rfr__n_estimators': [100, 150, 200, 250],
    'rfr__bootstrap': [True, False]
    
}

In [57]:
best_params(rfr_pipe, rfr_params, X_train, y_train)

"Best Score: 0.7214188183619951, Params: {'rfr__bootstrap': True, 'rfr__n_estimators': 100, 'ss__with_mean': False, 'ss__with_std': False}"

In [58]:
rfr_gs = return_gs(rfr_pipe, rfr_params, X_train, y_train)

In [59]:
scores(rfr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9327441610043699, Test Score: 0.7034672482884989'

In [None]:
# yikes so overfit

### AdaBoostRegressor

In [33]:
ada_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('ada', AdaBoostRegressor())
])

In [34]:
ada_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'ada__n_estimators': [25, 50, 100, 150, 200],
    'ada__learning_rate': [1, 0.9, 1.1]
}

In [35]:
best_params(ada_pipe, ada_params, X_train, y_train)

"Best Score: -0.22695060771379144, Params: {'ada__learning_rate': 1.1, 'ada__n_estimators': 50, 'ss__with_mean': False, 'ss__with_std': True}"

In [36]:
ada_gs = return_gs(ada_pipe, ada_params, X_train, y_train)

In [37]:
scores(ada_gs, X_train, y_train, X_test, y_test)

'Train Score: -0.374967855860741, Test Score: -0.4241848078985586'

### Regression: Stacking

In [None]:
level1_estimators =[
    ('rfr_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('rfr', RandomForestRegressor())
    ])), 
    
    ('dt_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('dt', DecisionTreeRegressor())
    ])),  
    ('bag_pipe', Pipeline([
        ('ss', StandardScaler()),
        ('bag', BaggingRegressor())
    ])),       
]

In [None]:
stacked_model = StackingClassifier(estimators = level1_estimators,
                                 final_estimator = LinearRegression())

In [None]:
cross_val_score(stacked_model, X_train, y_train).mean()

In [None]:
stacked_model.fit(X_train, y_train)

In [None]:
stacked_model.score(X_train, y_train), stacked_model.score(X_test, y_test)

In [None]:
stacked_pred = stacked_model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, stacked_pred).ravel()

In [None]:
evaluation_scores(y_test, stacked_pred)

In [None]:
ConfusionMatrixDisplay.from_estimator(stacked_model, X_test, y_test);

### Imbalaced Data Methods

In [112]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

## Classification

In [120]:
from sklearn.tree import DecisionTreeClassifier

In [121]:
df['food_supply_stage'].value_counts(normalize = True)

Farm                  0.469249
Whole supply chain    0.179600
Harvest               0.140234
Storage               0.098688
Transport             0.071731
Wholesale             0.009559
Processing            0.009223
Retail                0.008930
Trader                0.003060
Households            0.002138
Export                0.002096
Post-harvest          0.001845
Missing               0.001803
Food Services         0.000587
Distribution          0.000419
Pre-harvest           0.000419
Market                0.000377
Stacking              0.000042
Name: food_supply_stage, dtype: float64

In [122]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23853 entries, 0 to 23852
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Latitude           23853 non-null  float64
 1   Longitude          23853 non-null  float64
 2   country            23853 non-null  object 
 3   commodity          23853 non-null  object 
 4   year               23853 non-null  int64  
 5   loss_percentage    23853 non-null  float64
 6   activity           23853 non-null  object 
 7   food_supply_stage  23853 non-null  object 
dtypes: float64(3), int64(1), object(4)
memory usage: 1.6+ MB


In [123]:
features = pd.get_dummies(df, columns = ['Longitude', 'Latitude','country',
                              'commodity',
                              'year',
                              'activity'])

In [124]:
X = features.drop(columns = 'food_supply_stage')

In [125]:
y = pd.get_dummies(df['food_supply_stage'], columns = ['food_supply_stage'])

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### DecisionTreeClassifier

In [132]:
dt_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('dt', DecisionTreeClassifier())
])

In [133]:
dt_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'dt__max_depth': [1, 5, 10, 25, 100],
    'dt__min_samples_leaf': [1, 3, 6, 10],
    'dt__min_samples_split': [5, 10, 15, 20]
}

In [134]:
best_params(dt_pipe, dt_params, X_train, y_train)

"Best Score: 0.9547766278345302, Params: {'dt__max_depth': 100, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 5, 'ss__with_mean': True, 'ss__with_std': False}"

In [135]:
dt_gs = return_gs(dt_pipe, dt_params, X_train, y_train)

In [136]:
scores(dt_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9807703057744983, Test Score: 0.9575788061703555'

In [None]:
### Evaluation: omg these are good scores, did I do this right 

In [137]:
dt_ada_pipe = make_pipeline(
    ADASYN(random_state = 34),
    DecisionTreeClassifier()
)

In [138]:
dt_ada_params = {
    'decisiontreeclassifier__max_depth':[100,200,300],
    'decisiontreeclassifier__min_samples_split': [2, 3, 4, 5]
}

In [139]:
dt_ada_gs = GridSearchCV(dt_ada_pipe, 
                         dt_ada_params,
                         scoring = 'balanced_accuracy',
                         n_jobs = -1
                        )

In [140]:
dt_ada_gs.fit(X_train, y_train)

ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 293, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 250, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py", line 422, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/base.py", line 203, in fit_resample
    return super().fit_resample(X, y)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/base.py", line 82, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/base.py", line 155, in _check_X_y
    y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/imblearn/utils/_validation.py", line 152, in check_target_type
    y = y.argmax(axis=1)
  File "/Users/lisaliang/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 5902, in __getattr__
    return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'argmax'


In [None]:
dt_ada_gs.score(X_test_ss, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
ConfusionMatrixDisplay.from_estimator(dt_ada_gs, X_test, y_test, cmap="Blues");

### Stacking