## Imports

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
df = pd.read_csv('./data/food_waste.csv')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27773 entries, 0 to 27772
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   m49_code                  27773 non-null  int64  
 1   country                   27773 non-null  object 
 2   region                    871 non-null    object 
 3   cpc_code                  27773 non-null  object 
 4   commodity                 27773 non-null  object 
 5   year                      27773 non-null  int64  
 6   loss_percentage           27773 non-null  float64
 7   loss_percentage_original  27773 non-null  object 
 8   loss_quantity             4552 non-null   object 
 9   activity                  20873 non-null  object 
 10  food_supply_stage         27724 non-null  object 
 11  treatment                 881 non-null    object 
 12  cause_of_loss             925 non-null    object 
 13  sample_size               1176 non-null   object 
 14  method

## Cleaning Data

In [16]:
# iterate through columns that have more than 1000 null variables
for col in df.columns:
    if df[col].isnull().sum() > 10000:
        print(f'{col}: {df[col].isnull().sum()}')

region: 26902
loss_quantity: 23221
treatment: 26892
cause_of_loss: 26848
sample_size: 26597
reference: 19337
notes: 26423


In [17]:
# dropping columns with many null variables, missing data essentially
for col in df.columns:
    if df[col].isnull().sum() > 10000:
        df.drop(col, axis = 1, inplace = True)

In [18]:
# these columns aren't relevant for us to look at
df.drop(columns = ['m49_code', 'cpc_code', 'method_data_collection', 'url'],
             axis = 1, inplace = True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27773 entries, 0 to 27772
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   27773 non-null  object 
 1   commodity                 27773 non-null  object 
 2   year                      27773 non-null  int64  
 3   loss_percentage           27773 non-null  float64
 4   loss_percentage_original  27773 non-null  object 
 5   activity                  20873 non-null  object 
 6   food_supply_stage         27724 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.5+ MB


In [20]:
df.isnull().sum()

country                        0
commodity                      0
year                           0
loss_percentage                0
loss_percentage_original       0
activity                    6900
food_supply_stage             49
dtype: int64

In [21]:
df['activity'].fillna('Missing', inplace = True)
df['food_supply_stage'].fillna('Missing', inplace = True)

In [22]:
df.isnull().sum()

country                     0
commodity                   0
year                        0
loss_percentage             0
loss_percentage_original    0
activity                    0
food_supply_stage           0
dtype: int64

### Baseline Accuracy

In [23]:
df['loss_percentage'].value_counts()

2.50     2213
1.00     1410
3.50     1317
2.65     1240
2.00      848
         ... 
6.87        1
8.38        1
8.94        1
7.06        1
23.90       1
Name: loss_percentage, Length: 1831, dtype: int64

In [24]:
df['food_supply_stage'].value_counts()

Farm                  12368
Whole supply chain     5918
Harvest                3688
Storage                2525
Transport              1849
Retail                  438
Processing              262
Wholesale               257
Households              167
Trader                   76
Post-harvest             53
Export                   50
Missing                  49
Packing                  17
Food Services            14
Grading                  11
Pre-harvest              11
Distribution             10
Market                    9
Stacking                  1
Name: food_supply_stage, dtype: int64

### EDA

In [29]:
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

### Functions for GridSearch

In [25]:
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

In [26]:
def best_params(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)

    gs.fit(X_train, y_train)
    return f'Best Score: {gs.best_score_}, Params: {gs.best_params_}'

In [27]:
def return_gs(pipeline, params, X_train, y_train):
    gs = GridSearchCV(pipeline,
                      param_grid = params,
                      n_jobs=-1)
    return gs

In [28]:
def scores(gs, X_train, y_train, X_test, y_test):
    gs.fit(X_train, y_train)
    return f'Train Score: {gs.score(X_train, y_train)}, Test Score: {gs.score(X_test, y_test)}'

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27773 entries, 0 to 27772
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   27773 non-null  object 
 1   commodity                 27773 non-null  object 
 2   year                      27773 non-null  int64  
 3   loss_percentage           27773 non-null  float64
 4   loss_percentage_original  27773 non-null  object 
 5   activity                  27773 non-null  object 
 6   food_supply_stage         27773 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.5+ MB


In [30]:
df_dummy = pd.get_dummies(df, columns = ['country',
                             'commodity',
                             'year',
                             'activity',
                             'food_supply_stage'])

In [31]:
X = df_dummy.drop(columns = ['loss_percentage',
                            'loss_percentage_original'])
y = df_dummy['loss_percentage']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Linear Regression
Regression problem with continous variables

In [39]:
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LinearRegression())
])

In [45]:
lr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'lr__fit_intercept': [False, True],
    'lr__n_jobs': [25, 50, 100, 250],
    'lr__positive': [False, True]
}

In [46]:
best_params(lr_pipe, lr_params, X_train, y_train)

"Best Score: 0.5438935237502853, Params: {'lr__fit_intercept': False, 'lr__n_jobs': 25, 'lr__positive': True, 'ss__with_mean': False, 'ss__with_std': True}"

In [47]:
lr_gs = return_gs(lr_pipe, lr_params, X_train, y_train)

In [48]:
scores(lr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.5838162671183593, Test Score: 0.5290938842000981'

### Random Forest

In [55]:
rfr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor())
])

In [56]:
rfr_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'rfr__n_estimators': [100, 150, 200, 250],
    'rfr__bootstrap': [True, False]
    
}

In [57]:
best_params(rfr_pipe, rfr_params, X_train, y_train)

"Best Score: 0.7214188183619951, Params: {'rfr__bootstrap': True, 'rfr__n_estimators': 100, 'ss__with_mean': False, 'ss__with_std': False}"

In [58]:
rfr_gs = return_gs(rfr_pipe, rfr_params, X_train, y_train)

In [59]:
scores(rfr_gs, X_train, y_train, X_test, y_test)

'Train Score: 0.9327441610043699, Test Score: 0.7034672482884989'

In [None]:
# yikes so overfit

### AdaBoostRegressor

In [33]:
ada_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('ada', AdaBoostRegressor())
])

In [34]:
ada_params = {
    'ss__with_mean': [False, True],
    'ss__with_std': [False, True],
    'ada__n_estimators': [25, 50, 100, 150, 200],
    'ada__learning_rate': [1, 0.9, 1.1]
}

In [35]:
best_params(ada_pipe, ada_params, X_train, y_train)

"Best Score: -0.22695060771379144, Params: {'ada__learning_rate': 1.1, 'ada__n_estimators': 50, 'ss__with_mean': False, 'ss__with_std': True}"

In [36]:
ada_gs = return_gs(ada_pipe, ada_params, X_train, y_train)

In [37]:
scores(ada_gs, X_train, y_train, X_test, y_test)

'Train Score: -0.374967855860741, Test Score: -0.4241848078985586'

### Classification?

In [None]:
# classify food categories