In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load the data
file_path = Path('clean_flavors_of_cacao.csv')
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,Company,Bean_Origin_or_Bar_Name,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,missing,Togo
3,A. Morin,Akata,1680,2015,70.0,France,3.5,missing,Togo
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,missing,Peru


In [4]:
df= df.drop(columns=["Company", "REF","Bean_Origin_or_Bar_Name"], axis=1)
df.head()

Unnamed: 0,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin
0,2016,63.0,France,3.75,missing,Sao Tome & Principe
1,2015,70.0,France,2.75,missing,Togo
2,2015,70.0,France,3.0,missing,Togo
3,2015,70.0,France,3.5,missing,Togo
4,2015,70.0,France,3.5,missing,Peru


In [5]:
def bin_ratings(rating):
    if rating == 5.75: return 1
    if rating == 5.50: return 1
    if rating == 5.25: return 1
    if rating == 5.00: return 1
    
    if rating == 4.75: return 1
    if rating == 4.50: return 1
    if rating == 4.25: return 1
    if rating == 4.00: return 1
    
    if rating == 3.75: return 1
    if rating == 3.50: return 1
    if rating == 3.25: return 0
    if rating == 3.00: return 0

    if rating == 2.75: return 0
    if rating == 2.50: return 0
    if rating == 2.25: return 0
    if rating == 2.00: return 0
    
    if rating == 1.75: return 0
    if rating == 1.50: return 0
    if rating == 1.25: return 0
    if rating == 1.00: return 0
    
    #print( f"error: rating={rating} type={type(rating)}" )
    return "2"

df['Rating'] = df['Rating'].apply(bin_ratings)

# Split the Data into Training and Testing


In [6]:
y = df['Rating']
X = df.drop(columns="Rating", axis=1)
X = pd.get_dummies(X)

In [7]:
# Check the balance of our target values
y.value_counts()

0    1087
1     700
Name: Rating, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,test_size= 0.40,train_size=0.60 , random_state=1, stratify=y)

In [9]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

## Balanced Random Forest Classifier

In [10]:


# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=500, random_state=1)

# train
brfc.fit(X_train_scaled, y_train)



BalancedRandomForestClassifier(n_estimators=500, random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfc.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.5804802955665025

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,258,177
Actual low_risk,121,159


In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.68      0.59      0.57      0.63      0.58      0.34       435
          1       0.47      0.57      0.59      0.52      0.58      0.34       280

avg / total       0.60      0.58      0.58      0.59      0.58      0.34       715



In [14]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_ 

features_importance = sorted(zip(brfc.feature_importances_, X.columns), reverse=True)
for feature in features_importance:
    print(f"{feature[1]}: ({feature[0]})")

Cocoa_Percent: (0.2148622807254748)
Review_Date: (0.2041400820593054)
Company_Location_United States of America: (0.02962480774939886)
Bean_Type_missing: (0.024735422038490162)
Broad_Bean_Origin_Venezuela: (0.020333999459708325)
Broad_Bean_Origin_Ecuador: (0.01967335535582589)
Broad_Bean_Origin_Dominican Republic: (0.018841086047878382)
Company_Location_France: (0.017710419511947826)
Broad_Bean_Origin_Peru: (0.017211486554491638)
Bean_Type_Trinitario: (0.0170072859176071)
Company_Location_Canada: (0.016141401564468683)
Broad_Bean_Origin_Madagascar: (0.014580097532880393)
Bean_Type_Criollo: (0.014110190295096895)
Company_Location_England: (0.013710921978579432)
Bean_Type_Forastero: (0.013269264957048481)
Broad_Bean_Origin_Unknown: (0.011466996754231249)
Broad_Bean_Origin_Brazil: (0.011083330029679471)
Broad_Bean_Origin_Papua New Guinea: (0.011029981467895708)
Broad_Bean_Origin_Bolivia: (0.009913230017002965)
Company_Location_Italy: (0.009665945643974072)
Broad_Bean_Origin_Nicaragua: (0.

# Easy Ensemble AdaBoost Classifier


In [15]:


# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)



EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = eec.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.6073686371100164

In [17]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,241,194
Actual low_risk,95,185


In [18]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.72      0.55      0.66      0.63      0.61      0.36       435
          1       0.49      0.66      0.55      0.56      0.61      0.37       280

avg / total       0.63      0.60      0.62      0.60      0.61      0.37       715

