In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

import numpy as np
from collections import Counter
import re
import plotly as pl 
import matplotlib.pyplot as plt
# Import db related librarys
import psycopg2
import sys
# import boto3
import os
import sqlalchemy
import json
import csv
from getpass import getpass


In [2]:
# Configure connection parameters to AWS RDS
def connect_to_db():
    
    connection = None
    #enter_password = getpass('Enter database password')
    
    try:
        print('Connecting to the PostgreSQL database...')
        connection = psycopg2.connect(
            host = "database-chocolate.cafzzay3t2tr.us-east-2.rds.amazonaws.com",
            port = 5432,
            user = 'postgres',
            password = getpass('Enter database password'),
            database = 'postgres'
            )
        print('Connection successful')
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    return connection

In [3]:
# Call the connect_to_db to connect to database
connection = connect_to_db()

Connecting to the PostgreSQL database...
Enter database password········
Connection successful


In [4]:
# Create cursor to perform database operations
cursor = connection.cursor()

# If curser object = 0, then connection was successfully established
cursor

<cursor object at 0x000001A907970BA8; closed: 0>

In [5]:
# Query the location_table from AWS database and store in dataframe - verify there is data in the table.
sql = """
SELECT * FROM location_table
"""

location_table_from_db = pd.read_sql(sql, con=connection)

In [6]:
# Display the location_table queried from database
location_table_from_db

Unnamed: 0,country_code,latitude,longitude,broad_bean_origin_country
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.939110,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla
...,...,...,...,...
242,ZM,-13.133897,27.849332,Zambia
243,ZW,-19.015438,29.154857,Zimbabwe
244,HI,19.898682,-155.665857,Hawaii
245,ZZ,0.000000,0.000000,Unknown


In [7]:
# Query the clean_flavours_table from AWS database and store in dataframe.
sql = """
SELECT * FROM clean_flavors_table
"""

clean_flavors_from_db = pd.read_sql(sql, con=connection)

In [8]:
clean_flavors_from_db

Unnamed: 0,company,bean_origin_or_bar_name,REF,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin_country,ingredients,most_memorable_characteristics,continent
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L","sweet, chocolatey, vegetal",Africa
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L","burnt wood, earthy, choco",Africa
2,A. Morin,Atsane,1676,2015,70.0,France,3.00,missing,Togo,"4- B,S,C,L","roasty, acidic, nutty",Africa
3,A. Morin,Akata,1680,2015,70.0,France,3.50,missing,Togo,"4- B,S,C,L","mild profile, chocolaty, spice",Africa
4,A. Morin,Quilla,1704,2015,70.0,France,3.50,missing,Peru,"4- B,S,C,L","grainy texture, cocoa, sweet",South America
...,...,...,...,...,...,...,...,...,...,...,...,...
1955,Zotter,Peru,647,2011,70.0,Austria,3.75,missing,Peru,"4- B,Sw,C,Sa","creamy, fatty, mild nutty",South America
1956,Zotter,Congo,749,2011,65.0,Austria,3.00,Forastero,Republic of Congo,"4- B,Sw,C,Sa","dairy, salt, caramel",Africa
1957,Zotter,Kerala State,749,2011,65.0,Austria,3.50,Forastero,India,"4- B,Sw,C,Sa","creamy, masculine, earthy",Asia
1958,Zotter,Kerala State,781,2011,62.0,Austria,3.25,missing,India,"4- B,Sw,C,Sa","oily, subdued, caramel, salt",Asia


In [9]:
# Join the 2 tables together on broad_bean_origin_country, select the columns you want to view in df

sql = """ SELECT clean_flavors_table .*, location_table.country_code,location_table.longitude, location_table.latitude
FROM clean_flavors_table
LEFT JOIN location_table
ON clean_flavors_table.broad_bean_origin_country = location_table.broad_bean_origin_country;
"""

# Store the joined tables in dataframe
df_chocolate = pd.read_sql(sql, con=connection)

# View the new dataframe combined from two sql tables
df_chocolate.head(10)

Unnamed: 0,company,bean_origin_or_bar_name,REF,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin_country,ingredients,most_memorable_characteristics,continent,country_code,longitude,latitude
0,A. Morin,Agua Grande,1876,2016,63.0,France,3.75,missing,Sao Tome & Principe,"4- B,S,C,L","sweet, chocolatey, vegetal",Africa,ST,6.613081,0.18636
1,A. Morin,Kpime,1676,2015,70.0,France,2.75,missing,Togo,"4- B,S,C,L","burnt wood, earthy, choco",Africa,TG,0.824782,8.619543
2,A. Morin,Atsane,1676,2015,70.0,France,3.0,missing,Togo,"4- B,S,C,L","roasty, acidic, nutty",Africa,TG,0.824782,8.619543
3,A. Morin,Akata,1680,2015,70.0,France,3.5,missing,Togo,"4- B,S,C,L","mild profile, chocolaty, spice",Africa,TG,0.824782,8.619543
4,A. Morin,Quilla,1704,2015,70.0,France,3.5,missing,Peru,"4- B,S,C,L","grainy texture, cocoa, sweet",South America,PE,-75.015152,-9.189967
5,A. Morin,Carenero,1315,2014,70.0,France,2.75,Criollo,Venezuela,Unknown,missing,South America,VE,-66.58973,6.42375
6,A. Morin,Cuba,1315,2014,70.0,France,3.5,missing,Cuba,"4- B,S,C,L","sliglty dry, papaya",Caribbean,CU,-77.781167,21.521757
7,A. Morin,Sur del Lago,1315,2014,70.0,France,3.5,Criollo,Venezuela,Unknown,missing,South America,VE,-66.58973,6.42375
8,A. Morin,Puerto Cabello,1319,2014,70.0,France,3.75,Criollo,Venezuela,Unknown,missing,South America,VE,-66.58973,6.42375
9,A. Morin,Pablino,1319,2014,70.0,France,4.0,missing,Peru,"4- B,S,C,L","delicate, hazelnut, brownie",South America,PE,-75.015152,-9.189967


In [10]:
# Close connection to RDS
cursor.close()

In [11]:
# Check that connection is closed where object returned = -1
cursor

<cursor object at 0x000001A907970BA8; closed: -1>

# Machine Learning 



In [12]:
def bin_ratings(rating):
    if rating == 5.75: return 1
    if rating == 5.50: return 1
    if rating == 5.25: return 1
    if rating == 5.00: return 1
    
    if rating == 4.75: return 1
    if rating == 4.50: return 1
    if rating == 4.25: return 1
    if rating == 4.00: return 1
    
    if rating == 3.75: return 1
    if rating == 3.50: return 0
    if rating == 3.25: return 0
    if rating == 3.00: return 0

    if rating == 2.75: return 0
    if rating == 2.50: return 0
    if rating == 2.25: return 0
    if rating == 2.00: return 0
    
    if rating == 1.75: return 0
    if rating == 1.50: return 0
    if rating == 1.25: return 0
    if rating == 1.00: return 0
    
    #print( f"error: rating={rating} type={type(rating)}" )
    return "2"

In [13]:
df_chocolate['rating'] = df_chocolate['rating'].apply(bin_ratings)


In [14]:
def to_string(value):
    other = f"_{value}_"
    return other

In [15]:
def reduce_count_vals(df, colname, threshold):
    counts = df[colname].value_counts()
    replace_list = list(counts[counts < threshold].index)

    # Replace in dataframe
    for item in replace_list:
       df[colname] = df[colname].replace(item,"Other")

In [16]:
df_chocolate['review_date']= df_chocolate['review_date'].apply(to_string)
reduce_count_vals(df_chocolate, 'review_date', 100)

## Machine Learning!

In [17]:
# Drop rows where review_date= 2009 or 2010
# indexNames = df_chocolate[(df_chocolate["review_date"]=='_2010_') | (df_chocolate["review_date"]=='_2009_')].index
# df_chocolate.drop(indexNames, inplace=True)
# df_chocolate.head()

In [18]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
df_chocolate= df_chocolate.drop(columns=["company_location","REF","bean_origin_or_bar_name","most_memorable_characteristics","latitude","longitude","country_code"], axis=1)
df_chocolate.head()

Unnamed: 0,company,review_date,cocoa_percent,rating,bean_type,broad_bean_origin_country,ingredients,continent
0,A. Morin,_2016_,63.0,1,missing,Sao Tome & Principe,"4- B,S,C,L",Africa
1,A. Morin,_2015_,70.0,0,missing,Togo,"4- B,S,C,L",Africa
2,A. Morin,_2015_,70.0,0,missing,Togo,"4- B,S,C,L",Africa
3,A. Morin,_2015_,70.0,0,missing,Togo,"4- B,S,C,L",Africa
4,A. Morin,_2015_,70.0,0,missing,Peru,"4- B,S,C,L",South America


In [19]:
df_chocolate.describe()

Unnamed: 0,cocoa_percent,rating
count,1960.0,1960.0
mean,71.760204,0.166327
std,6.657419,0.372469
min,42.0,0.0
25%,70.0,0.0
50%,70.0,0.0
75%,75.0,0.0
max,100.0,1.0


In [20]:
# Generate our categorical variable lists
chocolate_cat = df_chocolate.dtypes[df_chocolate.dtypes == 'object'].index.tolist()
chocolate_cat

['company',
 'review_date',
 'bean_type',
 'broad_bean_origin_country',
 'ingredients',
 'continent']

In [21]:


# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_chocolate[chocolate_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(chocolate_cat)
encode_df.head()



Unnamed: 0,company_A. Morin,company_AMMA,company_Acalli,company_Adi,company_Aequare (Gianduja),company_Ah Cacao,company_Akesson's (Pralus),company_Alain Ducasse,company_Alexandre,company_Altus aka Cao Artisan,...,ingredients_Unknown,continent_Africa,continent_Asia,continent_Caribbean,continent_Central America,continent_North America,continent_Oceania,continent_Other,continent_South America,continent_Unknown
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# Merge one-hot encoded features and drop the originals
df_chocolate = df_chocolate.merge(encode_df, left_index=True, right_index=True).drop(chocolate_cat, 1)
df_chocolate.head()

Unnamed: 0,cocoa_percent,rating,company_A. Morin,company_AMMA,company_Acalli,company_Adi,company_Aequare (Gianduja),company_Ah Cacao,company_Akesson's (Pralus),company_Alain Ducasse,...,ingredients_Unknown,continent_Africa,continent_Asia,continent_Caribbean,continent_Central America,continent_North America,continent_Oceania,continent_Other,continent_South America,continent_Unknown
0,63.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,70.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
# Split our preprocessed data into our features and target arrays
y = df_chocolate["rating"]
X = df_chocolate.drop(["rating"],1) 

# Split the preprocessed data into a training and testing dataset
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

X_train, X_test, y_train, y_test = train_test_split(X,
   y,test_size= 0.2,train_size=0.8 , random_state=42, stratify=y)


In [24]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
len(X_train_scaled[0])

523

In [26]:
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)


# Logistic Regression

In [27]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=400,
   random_state=42)

In [28]:
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=400, random_state=42)

In [29]:
y_pred = classifier.predict(X_test_scaled)

In [30]:
print(accuracy_score(y_test, y_pred))

0.8163265306122449


In [31]:
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual average chocolate", "Actual high rated chocolate"], columns=["Predicted average rated chocolate", "Predicted high rated chocolate"])
cm_df

Unnamed: 0,Predicted average rated chocolate,Predicted high rated chocolate
Actual average chocolate,294,33
Actual high rated chocolate,39,26


In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       327
           1       0.44      0.40      0.42        65

    accuracy                           0.82       392
   macro avg       0.66      0.65      0.66       392
weighted avg       0.81      0.82      0.81       392



# Naive Random Oversampling


In [33]:


# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
# Instantiate the model
ros = RandomOverSampler(random_state=42)
# Resample the targets
X_resampled_Oversampling, y_resampled_Oversampling = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled_Oversampling)



Counter({0: 1307, 1: 1307})

In [34]:
# from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_resampled_Oversampling, y_resampled_Oversampling)
# Calculate predictions
y_pred = model.predict(X_test_scaled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [35]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.700846859562456

In [36]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[237,  90],
       [ 21,  44]], dtype=int64)

In [37]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.72      0.68      0.81      0.70      0.49       327
          1       0.33      0.68      0.72      0.44      0.70      0.49        65

avg / total       0.82      0.72      0.68      0.75      0.70      0.49       392



# Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
 max_depth=None, max_features='auto', max_leaf_nodes=None,
 min_impurity_split=1e-07, min_samples_leaf=1,
 min_samples_split=2, min_weight_fraction_leaf=0.0,
 n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
 verbose=0, warm_start=False)
y_pred = rf.predict(X_test_scaled)

In [39]:
from sklearn.metrics import roc_curve, auc
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.6293813220418726

## N_estimators

In [40]:
# Create a random forest classifier.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]

train_results = []
test_results = []
for estimator in n_estimators:
   rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1)
   rf.fit(X_train, y_train)
   train_pred = rf.predict(X_train)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   train_results.append(roc_auc)
   y_pred = rf.predict(X_test)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   test_results.append(roc_auc)   
    
    
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")




 Random forest predictive accuracy: 0.842


In [41]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual average chocolate", "Actual high rated chocolate"], columns=["Predicted average rated chocolate", "Predicted high rated chocolate"])
cm_df

Unnamed: 0,Predicted average rated chocolate,Predicted high rated chocolate
Actual average chocolate,312,15
Actual high rated chocolate,47,18


In [42]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.95      0.28      0.91      0.51      0.28       327
          1       0.55      0.28      0.95      0.37      0.51      0.25        65

avg / total       0.82      0.84      0.39      0.82      0.51      0.28       392



##  max_depth

In [43]:
max_depths = np.linspace(1, 32, 32, endpoint=True)
train_results = []
test_results = []
for max_depth in max_depths:
   rf = RandomForestClassifier(max_depth=max_depth, n_jobs=-1)
   rf.fit(X_train, y_train)
   train_pred = rf.predict(X_train)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   train_results.append(roc_auc)
   y_pred = rf.predict(X_test)
   false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
   roc_auc = auc(false_positive_rate, true_positive_rate)
   test_results.append(roc_auc)

In [44]:
# Calculated the balanced accuracy score
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")


 Random forest predictive accuracy: 0.865


In [45]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual average chocolate", "Actual high rated chocolate"], columns=["Predicted average rated chocolate", "Predicted high rated chocolate"])
cm_df

Unnamed: 0,Predicted average rated chocolate,Predicted high rated chocolate
Actual average chocolate,324,3
Actual high rated chocolate,50,15


In [46]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.99      0.23      0.92      0.48      0.25       327
          1       0.83      0.23      0.99      0.36      0.48      0.21        65

avg / total       0.86      0.86      0.36      0.83      0.48      0.24       392



In [47]:
# Calculate feature importance in the Random Forest model.
importances = rf.feature_importances_



In [48]:
# We can sort the features by their importance.
sorted(zip(rf.feature_importances_, X.columns), reverse=True)

[(0.08748869969751757, 'cocoa_percent'),
 (0.024820466436019206, 'company_Soma'),
 (0.017971410607565824, 'bean_type_Trinitario'),
 (0.01700691093808583, 'bean_type_missing'),
 (0.015263632748597497, 'continent_South America'),
 (0.015198014168234485, 'company_Idilio (Felchlin)'),
 (0.015032417420421688, 'bean_type_Criollo'),
 (0.014554853486997154, 'company_AMMA'),
 (0.014222447378910255, 'review_date_Other'),
 (0.014085285626708289, 'company_Bonnat'),
 (0.014034819005956649, 'review_date__2011_'),
 (0.01380991582648132, 'broad_bean_origin_country_Venezuela'),
 (0.013790756829960997, 'review_date__2014_'),
 (0.013673335211697207, 'review_date__2013_'),
 (0.013473005173487209, 'ingredients_Unknown'),
 (0.01339001497363524, 'review_date__2015_'),
 (0.012915933847163878, 'review_date__2012_'),
 (0.012832521823240552, 'continent_Caribbean'),
 (0.012828588943619197, 'ingredients_3- B,S,C'),
 (0.012483764227782706, 'ingredients_2- B,S'),
 (0.012425469833530232, 'company_Madecasse (Cinagra)'

# Balanced RandomForest Classifier

In [49]:
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model= BalancedRandomForestClassifier(n_estimators=500, random_state=42, criterion='entropy', min_samples_leaf=1, max_features=30)
brf_model.fit(X_train_scaled, y_train)
y_pred_brf = brf_model.predict(X_test_scaled)

In [50]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test,y_pred_brf)

0.7134556574923547

In [51]:
# Display the confusion matrix
matrix= confusion_matrix(y_test, y_pred_brf)
print(matrix)

[[205 122]
 [ 13  52]]


In [52]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brf))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.63      0.80      0.75      0.71      0.49       327
          1       0.30      0.80      0.63      0.44      0.71      0.51        65

avg / total       0.83      0.66      0.77      0.70      0.71      0.50       392



In [53]:
importances = brf_model.feature_importances_
importances
from operator import itemgetter
ordered_importances= sorted(zip(X.columns,importances), key=itemgetter(1), reverse=True)
ordered_importances

[('cocoa_percent', 0.1062502618036808),
 ('company_Soma', 0.017263593898580294),
 ('bean_type_missing', 0.017076335026303197),
 ('ingredients_3- B,S,C', 0.01635672085815606),
 ('ingredients_2- B,S', 0.015309044630044464),
 ('review_date__2015_', 0.0147159404323561),
 ('ingredients_Unknown', 0.013357531612318323),
 ('review_date__2014_', 0.013267569166245089),
 ('review_date__2016_', 0.013128614932686656),
 ('broad_bean_origin_country_Venezuela', 0.012926699815736924),
 ('bean_type_Trinitario', 0.012903579921003763),
 ('bean_type_Criollo', 0.01258706381094171),
 ('review_date__2012_', 0.012410348328120836),
 ('review_date__2013_', 0.012247404903270668),
 ('continent_South America', 0.012089794132481381),
 ('review_date__2011_', 0.011572619414724757),
 ('review_date_Other', 0.011192577557542901),
 ('ingredients_4- B,S,C,L', 0.011031409886970012),
 ('review_date__2009_', 0.011026903260207143),
 ('review_date__2010_', 0.010580233075815964),
 ('broad_bean_origin_country_Peru', 0.01031052368

# Easy Ensemble AdaBoost Classifier


In [54]:


# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=42)
eec.fit(X_train_scaled, y_train)



EasyEnsembleClassifier(n_estimators=100, random_state=42)

In [55]:
y_pred_eac = brf_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred_eac)

0.7134556574923547

In [56]:


# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_eac)



array([[205, 122],
       [ 13,  52]], dtype=int64)

In [57]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_eac))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.63      0.80      0.75      0.71      0.49       327
          1       0.30      0.80      0.63      0.44      0.71      0.51        65

avg / total       0.83      0.66      0.77      0.70      0.71      0.50       392



# Support Vector Machine

In [58]:
# Create the SVM model
svm = SVC(kernel='rbf')

# Train the model
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.837


In [59]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8367346938775511

In [60]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual average chocolate", "Actual high rated chocolate"], columns=["Predicted average rated chocolate", "Predicted high rated chocolate"])
cm_df

Unnamed: 0,Predicted average rated chocolate,Predicted high rated chocolate
Actual average chocolate,316,11
Actual high rated chocolate,53,12


In [61]:
# Print the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91       327
           1       0.52      0.18      0.27        65

    accuracy                           0.84       392
   macro avg       0.69      0.58      0.59       392
weighted avg       0.80      0.84      0.80       392



# GradientBoost

In [62]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
    learning_rate=learning_rate,
    max_features=5,
    max_depth=3,
    random_state=42)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
           classifier.score(
               X_train_scaled,
               y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
           classifier.score(
               X_test_scaled,
               y_test)))

Learning rate:  0.05
Accuracy score (training): 0.834
Accuracy score (validation): 0.834
Learning rate:  0.1
Accuracy score (training): 0.834
Accuracy score (validation): 0.834
Learning rate:  0.25
Accuracy score (training): 0.839
Accuracy score (validation): 0.839
Learning rate:  0.5
Accuracy score (training): 0.858
Accuracy score (validation): 0.834
Learning rate:  0.75
Accuracy score (training): 0.856
Accuracy score (validation): 0.819
Learning rate:  1
Accuracy score (training): 0.857
Accuracy score (validation): 0.814


In [63]:
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=42)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [64]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8188775510204082


In [65]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
display(cm_df)
print("Classification Report")
print(classification_report(y_test, predictions))

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,303,24
Actual 1,47,18


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.93      0.90       327
           1       0.43      0.28      0.34        65

    accuracy                           0.82       392
   macro avg       0.65      0.60      0.62       392
weighted avg       0.79      0.82      0.80       392



# Deep Learning Model

In [66]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  256
hidden_nodes_layer2 = 128
hidden_nodes_layer3 = 64
hidden_nodes_layer4 = 32
hidden_nodes_layer5 = 16
hidden_nodes_layer6 = 8
hidden_nodes_layer7 = 4


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer1, input_dim = number_input_features, activation = 'relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer2, activation = 'relu'))

# other hidden layer
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer3, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer4, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer5, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer6, activation = 'relu'))
nn.add(tf.keras.layers.Dense(units = hidden_nodes_layer7, activation = 'relu'))


# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               134144    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_6 (Dense)              (None, 4)                 3

In [67]:
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [68]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics= ['accuracy'])

In [69]:
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq='epoch',
    period=5)



In [70]:
# Train the model
nn.fit(X_train_scaled, y_train, epochs = 100, callbacks=[cp_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

Epoch 00005: saving model to checkpoints\weights.05.hdf5
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: saving model to checkpoints\weights.10.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100

Epoch 00015: saving model to checkpoints\weights.15.hdf5
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

Epoch 00020: saving model to checkpoints\weights.20.hdf5
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

Epoch 00025: saving model to checkpoints\weights.25.hdf5
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100

Epoch 00030: saving model to checkpoints\weights.30.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: saving model to checkpoints\weights.35.hdf5
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: saving model to checkpoints\weights.40.hdf5
Epoch 41/100
Epoch 42/100

<tensorflow.python.keras.callbacks.History at 0x1a90ab45288>

In [71]:


# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")



13/13 - 0s - loss: 1.9284 - accuracy: 0.7398
Loss: 1.928389549255371, Accuracy: 0.7397959232330322


In [72]:
nn.save("Chocolate_Ratings_ML.h5")