In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load the data
file_path = Path('Resources/clean_flavors_of_cacao.csv')
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,Company,Bean_Origin_or_Bar_Name,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin_Country,Ingredients,Most_Memorable_Characteristics,continent
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L","sweet, chocolatey, vegetal",Africa
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L","burnt wood, earthy, choco",Africa
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,missing,Togo,"4- B,S,C,L","roasty, acidic, nutty",Africa
3,A. Morin,Akata,1680,2015,70.0,France,3.5,missing,Togo,"4- B,S,C,L","mild profile, chocolaty, spice",Africa
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,missing,Peru,"4- B,S,C,L","grainy texture, cocoa, sweet",South America


In [4]:
df= df.drop(columns=["Company","REF","Bean_Origin_or_Bar_Name","Most_Memorable_Characteristics","continent"], axis=1)
df.head()

Unnamed: 0,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin_Country,Ingredients
0,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L"
1,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L"
2,2015,70.0,France,3.0,missing,Togo,"4- B,S,C,L"
3,2015,70.0,France,3.5,missing,Togo,"4- B,S,C,L"
4,2015,70.0,France,3.5,missing,Peru,"4- B,S,C,L"


In [5]:
def bin_ratings(rating):
    if rating == 5.75: return 1
    if rating == 5.50: return 1
    if rating == 5.25: return 1
    if rating == 5.00: return 1
    
    if rating == 4.75: return 1
    if rating == 4.50: return 1
    if rating == 4.25: return 1
    if rating == 4.00: return 1
    
    if rating == 3.75: return 1
    if rating == 3.50: return 0
    if rating == 3.25: return 0
    if rating == 3.00: return 0

    if rating == 2.75: return 0
    if rating == 2.50: return 0
    if rating == 2.25: return 0
    if rating == 2.00: return 0
    
    if rating == 1.75: return 0
    if rating == 1.50: return 0
    if rating == 1.25: return 0
    if rating == 1.00: return 0
    
    #print( f"error: rating={rating} type={type(rating)}" )
    return "2"

df['Rating'] = df['Rating'].apply(bin_ratings)

# Split the Data into Training and Testing


In [6]:
y = df['Rating']
X = df.drop(columns="Rating", axis=1)
X = pd.get_dummies(X)

In [7]:
# Check the balance of our target values
y.value_counts()

0    1634
1     326
Name: Rating, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,test_size= 0.40,train_size=0.60 , random_state=1, stratify=y)

In [9]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

## Balanced Random Forest Classifier

In [10]:


# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=500, random_state=1)

# train
brfc.fit(X_train_scaled, y_train)



BalancedRandomForestClassifier(n_estimators=500, random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfc.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.606504351917196

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,416,238
Actual low_risk,55,75


In [13]:
confusion_matrix(y_test, y_pred)

array([[416, 238],
       [ 55,  75]], dtype=int64)

In [14]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.64      0.58      0.74      0.61      0.37       654
          1       0.24      0.58      0.64      0.34      0.61      0.36       130

avg / total       0.78      0.63      0.59      0.67      0.61      0.37       784



In [15]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_ 

features_importance = sorted(zip(brfc.feature_importances_, X.columns), reverse=True)
for feature in features_importance:
    print(f"{feature[1]}: ({feature[0]})")

Cocoa_Percent: (0.16315402094295392)
Review_Date: (0.1441217996502986)
Company_Location_United States of America: (0.028082357504149093)
Bean_Type_missing: (0.027538156579082627)
Ingredients_3- B,S,C: (0.02558223542282417)
Ingredients_2- B,S: (0.023942842342376505)
Broad_Bean_Origin_Country_Venezuela: (0.022399551509740736)
Ingredients_Unknown: (0.021884300935419272)
Bean_Type_Trinitario: (0.02069535661324876)
Broad_Bean_Origin_Country_Dominican Republic: (0.01960944131687568)
Company_Location_Canada: (0.019382605850541364)
Broad_Bean_Origin_Country_Peru: (0.01913308493572276)
Broad_Bean_Origin_Country_Ecuador: (0.018463629739712138)
Broad_Bean_Origin_Country_Madagascar: (0.018062399763011164)
Company_Location_France: (0.017435862608688922)
Bean_Type_Criollo: (0.016127942678567205)
Ingredients_4- B,S,C,L: (0.015437089492701159)
Ingredients_5- B,S,C,V,L: (0.014350893993902108)
Bean_Type_Forastero: (0.013748300018626383)
Ingredients_4- B,S,C,V: (0.012597000835753715)
Company_Location_Ita

# Easy Ensemble AdaBoost Classifier


In [16]:


# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)



EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [17]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = eec.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.5869560103505058

In [18]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,320,334
Actual low_risk,41,89


In [20]:
confusion_matrix(y_test, y_pred)

array([[320, 334],
       [ 41,  89]], dtype=int64)

In [19]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.49      0.68      0.63      0.58      0.33       654
          1       0.21      0.68      0.49      0.32      0.58      0.34       130

avg / total       0.77      0.52      0.65      0.58      0.58      0.33       784

