In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Read the cacao ratings CSV (path is relative to the notebook's working directory)
file_path = Path("clean_flavors_of_cacao.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
df.head()


Unnamed: 0,Company,Bean_Origin_or_Bar_Name,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin,Ingredients,Most_Memorable_Characteristics,country_code,latitude,longitude,continent
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L","sweet, chocolatey, vegetal",ST,0.18636,6.613081,Africa
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L","burnt wood, earthy, choco",TG,8.619543,0.824782,Africa
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,missing,Togo,"4- B,S,C,L","roasty, acidic, nutty",TG,8.619543,0.824782,Africa
3,A. Morin,Akata,1680,2015,70.0,France,3.5,missing,Togo,"4- B,S,C,L","mild profile, chocolaty, spice",TG,8.619543,0.824782,Africa
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,missing,Peru,"4- B,S,C,L","grainy texture, cocoa, sweet",PE,-9.189967,-75.015152,South America


In [4]:
# Remove the columns that are not carried forward as model features.
# NOTE: `df` is rebound here, so re-running this cell without reloading the
# CSV raises KeyError (the columns are already gone).
df = df.drop(
    columns=[
        "Company",
        "REF",
        "Bean_Origin_or_Bar_Name",
        "Most_Memorable_Characteristics",
        "continent",
    ]
)
df.head()

Unnamed: 0,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin,Ingredients,country_code,latitude,longitude
0,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L",ST,0.18636,6.613081
1,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L",TG,8.619543,0.824782
2,2015,70.0,France,3.0,missing,Togo,"4- B,S,C,L",TG,8.619543,0.824782
3,2015,70.0,France,3.5,missing,Togo,"4- B,S,C,L",TG,8.619543,0.824782
4,2015,70.0,France,3.5,missing,Peru,"4- B,S,C,L",PE,-9.189967,-75.015152


In [5]:
def bin_ratings(rating):
    """Map a numeric rating onto one of four tier labels.

    Tiers are checked from the highest floor down, so each rating gets the
    first tier whose lower bound it reaches:

    * ``rating >= 4.0``  -> ``'Premium_Elite'``
    * ``rating >= 3.0``  -> ``'Satisfactory'``
    * ``rating >= 2.0``  -> ``'Unsatisfactory'``
    * anything else      -> ``"unpleasant"``

    Values for which every comparison is false (e.g. NaN) also fall through
    to ``"unpleasant"``.
    """
    tier_floors = (
        (4.0, 'Premium_Elite'),
        (3.0, 'Satisfactory'),
        (2.0, 'Unsatisfactory'),
    )
    for floor, label in tier_floors:
        if rating >= floor:
            return label
    return "unpleasant"

# Replace each numeric rating with its tier label, element by element.
# NOTE: not re-runnable as-is - once the column holds strings, bin_ratings
# would compare a str against a float and raise TypeError.
df['Rating'] = df['Rating'].map(bin_ratings)

# Split the Data into Training and Testing


In [6]:
# Target is the binned rating; features are every other column, with the
# non-numeric ones expanded into indicator columns by get_dummies.
y = df["Rating"]
X = pd.get_dummies(df.drop(columns="Rating"))

In [7]:
# Check the balance of our target values: the per-class counts show how
# unevenly the rating tiers are represented, which is why the train/test
# split that follows is stratified on y.
y.value_counts()

Satisfactory      1353
Unsatisfactory     484
Premium_Elite      105
unpleasant          18
Name: Rating, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

# 60 % train / 40 % test, stratified on the target so every rating tier keeps
# its proportion in both partitions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.60,
    test_size=0.40,
    stratify=y,
    random_state=1,
)

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then push both partitions
# through that same fitted scaler so the test split never influences the fit.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Ensemble Learners

## Balanced Random Forest Classifier

In [10]:


from imblearn.ensemble import BalancedRandomForestClassifier

# Configure a 500-tree balanced random forest with a fixed seed
brfc = BalancedRandomForestClassifier(
    n_estimators=500,
    random_state=1,
)

# Train on the scaled training split (the fitted estimator is the cell output)
brfc.fit(X_train_scaled, y_train)



BalancedRandomForestClassifier(n_estimators=500, random_state=1)

In [11]:
# Calculate the balanced accuracy score of the balanced random forest on the
# held-out test split (balanced_accuracy_score is imported at the top).
y_pred = brfc.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.4953109843133091

In [12]:
# Display the confusion matrix.
# The row/column order is pinned to the classifier's own class order and the
# axes are labelled with the real class names, so each cell can be read
# against a rating tier instead of an anonymous "1..4" position. Deriving the
# labels from the model also keeps the frame valid if the number of classes
# ever differs from four.
class_labels = list(brfc.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 1,28,7,2,5
Actual 2,193,216,62,70
Actual 3,55,66,39,34
Actual 4,2,0,0,5


In [13]:
# Print the imbalanced classification report for the current predictions
# (the report is a pre-formatted string, hence print rather than rich display).
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                      pre       rec       spe        f1       geo       iba       sup

 Premium_Elite       0.10      0.67      0.66      0.17      0.66      0.44        42
  Satisfactory       0.75      0.40      0.70      0.52      0.53      0.27       541
Unsatisfactory       0.38      0.20      0.89      0.26      0.42      0.17       194
    unpleasant       0.04      0.71      0.86      0.08      0.78      0.61         7

   avg / total       0.62      0.37      0.75      0.43      0.51      0.26       784



In [14]:
# Rank the features by the forest's importance scores, highest first.
# Each score is paired with the column name at the same position in X.
importances = brfc.feature_importances_

features_importance = sorted(zip(importances, X.columns), reverse=True)
for score, name in features_importance:
    print(f"{name}: ({score})")

Cocoa_Percent: (0.12151387268678536)
Review_Date: (0.08621070552112155)
latitude: (0.06442250744544023)
longitude: (0.06272172763236035)
Ingredients_Unknown: (0.03785631752283829)
Company_Location_United States of America: (0.029430667810759266)
Ingredients_3- B,S,C: (0.02820034836572777)
Bean_Type_missing: (0.026461536165387434)
Broad_Bean_Origin_Unknown: (0.023856490672528034)
Ingredients_2- B,S: (0.023549215339757797)
Bean_Type_Trinitario: (0.01921575372541531)
country_code_ZZ: (0.019002170279476132)
Bean_Type_Criollo: (0.018790995287947535)
Bean_Type_Forastero: (0.01702402963313661)
Ingredients_4- B,S,C,L: (0.016723652124121947)
Company_Location_England: (0.01638592640941415)
Company_Location_France: (0.016346284651868583)
Ingredients_5- B,S,C,V,L: (0.014382253288160701)
country_code_VE: (0.014095155440257993)
Company_Location_Canada: (0.01351043182801382)
country_code_EC: (0.013307694430855591)
Broad_Bean_Origin_Venezuela: (0.01282054379004)
Ingredients_4- B,S,C,V: (0.012620345131

## Easy Ensemble AdaBoost Classifier


In [15]:


from imblearn.ensemble import EasyEnsembleClassifier

# Configure a 100-estimator EasyEnsembleClassifier with a fixed seed
eec = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
)

# Train on the scaled training split (the fitted estimator is the cell output)
eec.fit(X_train_scaled, y_train)



EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculate the balanced accuracy score of the easy-ensemble model on the
# held-out test split. Note that y_pred is rebound to this model's predictions.
y_pred = eec.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.4715740546652184

In [17]:
# Display the confusion matrix.
# The row/column order is pinned to the classifier's own class order and the
# axes are labelled with the real class names, so each cell can be read
# against a rating tier instead of an anonymous "1..4" position. Deriving the
# labels from the model also keeps the frame valid if the number of classes
# ever differs from four.
class_labels = list(eec.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 1,25,9,4,4
Actual 2,172,222,105,42
Actual 3,41,70,60,23
Actual 4,2,1,0,4


In [18]:
# Print the imbalanced classification report for the current predictions
# (the report is a pre-formatted string, hence print rather than rich display).
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                      pre       rec       spe        f1       geo       iba       sup

 Premium_Elite       0.10      0.60      0.71      0.18      0.65      0.42        42
  Satisfactory       0.74      0.41      0.67      0.53      0.52      0.27       541
Unsatisfactory       0.36      0.31      0.82      0.33      0.50      0.24       194
    unpleasant       0.05      0.57      0.91      0.10      0.72      0.50         7

   avg / total       0.60      0.40      0.71      0.46      0.53      0.27       784

