In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import re
import plotly as pl 
import matplotlib.pyplot as plt



In [2]:
df_chocolate = pd.read_csv("clean_flavors_of_cacao.csv", encoding='utf-8')
df_chocolate.head()

Unnamed: 0,Company,Bean_Origin_or_Bar_Name,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin_Country,Ingredients,Most_Memorable_Characteristics,continent
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L","sweet, chocolatey, vegetal",Africa
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L","burnt wood, earthy, choco",Africa
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,missing,Togo,"4- B,S,C,L","roasty, acidic, nutty",Africa
3,A. Morin,Akata,1680,2015,70.0,France,3.5,missing,Togo,"4- B,S,C,L","mild profile, chocolaty, spice",Africa
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,missing,Peru,"4- B,S,C,L","grainy texture, cocoa, sweet",South America


# Machine Learning 



In [3]:
def bin_ratings(rating):
    if rating == 5.75: return 1
    if rating == 5.50: return 1
    if rating == 5.25: return 1
    if rating == 5.00: return 1
    
    if rating == 4.75: return 1
    if rating == 4.50: return 1
    if rating == 4.25: return 1
    if rating == 4.00: return 1
    
    if rating == 3.75: return 1
    if rating == 3.50: return 0
    if rating == 3.25: return 0
    if rating == 3.00: return 0

    if rating == 2.75: return 0
    if rating == 2.50: return 0
    if rating == 2.25: return 0
    if rating == 2.00: return 0
    
    if rating == 1.75: return 0
    if rating == 1.50: return 0
    if rating == 1.25: return 0
    if rating == 1.00: return 0
    
    #print( f"error: rating={rating} type={type(rating)}" )
    return "2"

In [4]:
df_chocolate['Rating'] = df_chocolate['Rating'].apply(bin_ratings)


In [5]:
def to_string(value):
    other = f"_{value}_"
    return other

In [6]:
def reduce_count_vals(df, colname, threshold):
    counts = df[colname].value_counts()
    replace_list = list(counts[counts < threshold].index)

    # Replace in dataframe
    for item in replace_list:
       df[colname] = df[colname].replace(item,"Other")

In [7]:
df_chocolate['Review_Date']= df_chocolate['Review_Date'].apply(to_string)
reduce_count_vals(df_chocolate, 'Review_Date', 100)

## Neural Network!

In [8]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
df_chocolate= df_chocolate.drop(columns=["Company_Location","REF","Bean_Origin_or_Bar_Name","Most_Memorable_Characteristics","continent"], axis=1)
df_chocolate.head()

Unnamed: 0,Company,Review_Date,Cocoa_Percent,Rating,Bean_Type,Broad_Bean_Origin_Country,Ingredients
0,A. Morin,_2016_,63.0,1,missing,Sao Tome & Principe,"4- B,S,C,L"
1,A. Morin,_2015_,70.0,0,missing,Togo,"4- B,S,C,L"
2,A. Morin,_2015_,70.0,0,missing,Togo,"4- B,S,C,L"
3,A. Morin,_2015_,70.0,0,missing,Togo,"4- B,S,C,L"
4,A. Morin,_2015_,70.0,0,missing,Peru,"4- B,S,C,L"


In [9]:
# Generate our categorical variable lists
chocolate_cat = df_chocolate.dtypes[df_chocolate.dtypes == 'object'].index.tolist()
chocolate_cat

['Company',
 'Review_Date',
 'Bean_Type',
 'Broad_Bean_Origin_Country',
 'Ingredients']

In [10]:


# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_chocolate[chocolate_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(chocolate_cat)
encode_df.head()



Unnamed: 0,Company_A. Morin,Company_AMMA,Company_Acalli,Company_Adi,Company_Aequare (Gianduja),Company_Ah Cacao,Company_Akesson's (Pralus),Company_Alain Ducasse,Company_Alexandre,Company_Altus aka Cao Artisan,...,"Ingredients_4- B,S,V,L","Ingredients_4- B,Sw,C,L","Ingredients_4- B,Sw,C,Sa","Ingredients_4- B,Sw,C,V","Ingredients_4- B,Sw,V,L","Ingredients_5- B,S,C,L,Sa","Ingredients_5- B,S,C,V,L","Ingredients_5-B,S,C,V,Sa","Ingredients_6-B,S,C,V,L,Sa",Ingredients_Unknown
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Merge one-hot encoded features and drop the originals
df_chocolate = df_chocolate.merge(encode_df, left_index=True, right_index=True).drop(chocolate_cat, 1)
df_chocolate.head()

Unnamed: 0,Cocoa_Percent,Rating,Company_A. Morin,Company_AMMA,Company_Acalli,Company_Adi,Company_Aequare (Gianduja),Company_Ah Cacao,Company_Akesson's (Pralus),Company_Alain Ducasse,...,"Ingredients_4- B,S,V,L","Ingredients_4- B,Sw,C,L","Ingredients_4- B,Sw,C,Sa","Ingredients_4- B,Sw,C,V","Ingredients_4- B,Sw,V,L","Ingredients_5- B,S,C,L,Sa","Ingredients_5- B,S,C,V,L","Ingredients_5-B,S,C,V,Sa","Ingredients_6-B,S,C,V,L,Sa",Ingredients_Unknown
0,63.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Split our preprocessed data into our features and target arrays
y = df_chocolate["Rating"]
X = df_chocolate.drop(["Rating"],1) 

# Split the preprocessed data into a training and testing dataset
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

X_train, X_test, y_train, y_test = train_test_split(X,
   y,test_size= 0.4,train_size=0.6 , random_state=1, stratify=y)

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
len(X_train_scaled[0])

514

In [15]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)


# Random Forest

In [16]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")



 Random forest predictive accuracy: 0.901


In [17]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9005102040816326

In [18]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual average chocolate", "Actual high rated chocolate"], columns=["Predicted average rated chocolate", "Predicted high rated chocolate"])
cm_df

Unnamed: 0,Predicted average rated chocolate,Predicted high rated chocolate
Actual average chocolate,598,56
Actual high rated chocolate,22,108


In [19]:
# Print the imbalanced classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94       654
           1       0.66      0.83      0.73       130

    accuracy                           0.90       784
   macro avg       0.81      0.87      0.84       784
weighted avg       0.91      0.90      0.90       784



In [20]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_



In [21]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1416937830855881, 'Cocoa_Percent'),
 (0.02236244490422439, 'Review_Date__2009_'),
 (0.021916454312380856, 'Broad_Bean_Origin_Country_Venezuela'),
 (0.020572533049845677, 'Broad_Bean_Origin_Country_Ecuador'),
 (0.020470043981458786, 'Ingredients_3- B,S,C'),
 (0.019746242650432143, 'Bean_Type_missing'),
 (0.019497926027666, 'Broad_Bean_Origin_Country_Peru'),
 (0.018955778690206433, 'Review_Date__2014_'),
 (0.01890521653790272, 'Review_Date__2015_'),
 (0.0186785933890283, 'Company_Soma'),
 (0.018050670456475192, 'Ingredients_4- B,S,C,V'),
 (0.015907605171824563, 'Review_Date__2012_'),
 (0.015604237955816419, 'Ingredients_Unknown'),
 (0.01500420286712843, 'Bean_Type_Trinitario'),
 (0.014693549616769974, 'Review_Date__2016_'),
 (0.014327113276817727, 'Broad_Bean_Origin_Country_Dominican Republic'),
 (0.014258449063628572, 'Review_Date__2008_'),
 (0.013551670788527896, 'Review_Date__2013_'),
 (0.013521894749143754, 'Ingredients_4- B,S,C,L'),
 (0.013406456283704862, 'Bean_Type_Criollo'),


# Support Vector Machine

In [22]:
# Create the SVM model
svm = SVC(kernel='rbf')

# Train the model
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.829


In [23]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8290816326530612

In [24]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual high_risk", "Actual low_risk"], columns=["Predicted high_risk", "Predicted low_risk"])
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,639,15
Actual low_risk,119,11


In [25]:
# Print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.98      0.91       654
           1       0.42      0.08      0.14       130

    accuracy                           0.83       784
   macro avg       0.63      0.53      0.52       784
weighted avg       0.77      0.83      0.78       784



# Deep Learning Model

In [26]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  256
hidden_nodes_layer2 = 128
hidden_nodes_layer3 = 64
hidden_nodes_layer4 = 32
hidden_nodes_layer5 = 16
hidden_nodes_layer6 = 8
hidden_nodes_layer7 = 4


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer1, input_dim = number_input_features, activation = 'relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer2, activation = 'relu'))

# other hidden layer
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer3, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer4, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer5, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer6, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer7, activation = 'relu'))


# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               131840    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 3

In [27]:
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [28]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics= ['accuracy'])

In [29]:
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq='epoch',
    period=5)



In [30]:
# Train the model
nn.fit(X_train_scaled, y_train, epochs = 100, callbacks=[cp_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: saving model to checkpoints/weights.05.hdf5
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: saving model to checkpoints/weights.10.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: saving model to checkpoints/weights.15.hdf5
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: saving model to checkpoints/weights.20.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: saving model to checkpoints/weights.25.hdf5
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: saving model to checkpoints/weights.30.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: saving model to checkpoints/weights.35.hdf5
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: saving model to checkpoints/weights.40.hdf5
Epoch 41/100
Epoch 42/100

Epoch 75/100

Epoch 00075: saving model to checkpoints/weights.75.hdf5
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100

Epoch 00080: saving model to checkpoints/weights.80.hdf5
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100

Epoch 00085: saving model to checkpoints/weights.85.hdf5
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100

Epoch 00090: saving model to checkpoints/weights.90.hdf5
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100

Epoch 00095: saving model to checkpoints/weights.95.hdf5
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100

Epoch 00100: saving model to checkpoints/weights.100.hdf5


<tensorflow.python.keras.callbacks.History at 0x7fdd6f192a90>

In [31]:


# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")



25/25 - 0s - loss: 4.2260 - accuracy: 0.7793
Loss: 4.2260308265686035, Accuracy: 0.7793367505073547


In [32]:
nn.save("Chocolate_Ratings_ML.h5")