In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Load the data
file_path = Path('clean_flavors_of_cacao.csv')
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,Company,Bean_Origin_or_Bar_Name,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,missing,Togo
3,A. Morin,Akata,1680,2015,70.0,France,3.5,missing,Togo
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,missing,Peru


In [4]:
#df['Rating'] = df['Rating'].replace(1.0, 'A')\
#.replace( 1.5, 'B')\
#.replace(1.75,'C')\
#.replace(2.0, 'D')\
#.replace(2.25, 'E')\
#.replace(2.5, 'F')\
#.replace(2.75, 'G')\
#.replace(3.0,'H')\
#.replace(3.25, 'I')\
#.replace(3.5, 'J')\
#.replace(3.75, 'K')\
#.replace(4.0, 'L')\
#.replace(5.0, 'M')

#df['Rating']

In [5]:
df= df.drop(columns=["Company", "REF","Bean_Origin_or_Bar_Name"], axis=1)
df.head()

Unnamed: 0,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin
0,2016,63.0,France,3.75,missing,Sao Tome & Principe
1,2015,70.0,France,2.75,missing,Togo
2,2015,70.0,France,3.0,missing,Togo
3,2015,70.0,France,3.5,missing,Togo
4,2015,70.0,France,3.5,missing,Peru


In [6]:
def bin_ratings(rating):
    if rating == 5.75: return 1
    if rating == 5.50: return 1
    if rating == 5.25: return 1
    if rating == 5.00: return 1
    
    if rating == 4.75: return 1
    if rating == 4.50: return 1
    if rating == 4.25: return 1
    if rating == 4.00: return 1
    
    if rating == 3.75: return 0
    if rating == 3.50: return 0
    if rating == 3.25: return 0
    if rating == 3.00: return 0

    if rating == 2.75: return 0
    if rating == 2.50: return 0
    if rating == 2.25: return 0
    if rating == 2.00: return 0
    
    if rating == 1.75: return 0
    if rating == 1.50: return 0
    if rating == 1.25: return 0
    if rating == 1.00: return 0
    
    #print( f"error: rating={rating} type={type(rating)}" )
    return "2"

df['Rating'] = df['Rating'].apply(bin_ratings)

# Split the Data into Training and Testing


In [7]:
y = df['Rating']
X = df.drop(columns="Rating", axis=1)
X = pd.get_dummies(X)

In [8]:
# Check the balance of our target values
y.value_counts()

0    1688
1      99
Name: Rating, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
   y,test_size= 0.3,train_size=0.7 , random_state=1, stratify=y)

# Ensemble Learners

## Balanced Random Forest Classifier

In [10]:


# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# train
brfc.fit(X_train, y_train)



BalancedRandomForestClassifier(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = brfc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6309664694280079

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,268,239
Actual low_risk,8,22


In [13]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.53      0.73      0.68      0.62      0.38       507
          1       0.08      0.73      0.53      0.15      0.62      0.40        30

avg / total       0.92      0.54      0.72      0.65      0.62      0.38       537



In [14]:
# List the features sorted in descending order by feature importance
importances = brfc.feature_importances_ 

features_importance = sorted(zip(brfc.feature_importances_, X.columns), reverse=True)
for feature in features_importance:
    print(f"{feature[1]}: ({feature[0]})")

Review_Date: (0.1860679289908285)
Cocoa_Percent: (0.15501704956480908)
Company_Location_United States of America: (0.032209278870024366)
Broad_Bean_Origin_Dominican Republic: (0.03199133618108755)
Broad_Bean_Origin_Peru: (0.03148744636496944)
Bean_Type_missing: (0.02964063426739646)
Company_Location_France: (0.02886557352899661)
Bean_Type_Criollo: (0.027574163846396712)
Bean_Type_Trinitario: (0.021396617461518656)
Company_Location_Canada: (0.02129143741722441)
Broad_Bean_Origin_Venezuela: (0.021090069626658728)
Broad_Bean_Origin_Ecuador: (0.020782327727494845)
Broad_Bean_Origin_Papua New Guinea: (0.020288753004611994)
Company_Location_Italy: (0.018080180825062207)
Broad_Bean_Origin_Madagascar: (0.016063264224976656)
Bean_Type_Forastero: (0.016037078776300572)
Company_Location_Ecuador: (0.01474186775581037)
Company_Location_Belgium: (0.012532912417808382)
Company_Location_Australia: (0.012218808750890504)
Broad_Bean_Origin_Brazil: (0.010461537837019788)
Company_Location_England: (0.0104

# Easy Ensemble AdaBoost Classifier


In [15]:


# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)



EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [16]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6269230769230769

In [17]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,247,260
Actual low_risk,7,23


In [18]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.97      0.49      0.77      0.65      0.61      0.36       507
          1       0.08      0.77      0.49      0.15      0.61      0.38        30

avg / total       0.92      0.50      0.75      0.62      0.61      0.36       537

