In [55]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import plot_confusion_matrix, auc, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
# from imblearn.over_sampling import SMOTE

df = pd.read_csv('../../../data/chocolate.csv')
df.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
3,797,A. Morin,France,2012,Peru,Peru,63%,"4- B,S,C,L","fruity, melon, roasty",3.75
4,797,A. Morin,France,2012,Bolivia,Bolivia,70%,"4- B,S,C,L","vegetal, nutty",3.5


## Took out the percent sign in the cocoa column, took out the , and - in the ingredients ##

In [63]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

#cocoa_percent
df = df.replace('%',' ', regex=True)

#ingredients
df = df.replace(',',' ', regex=True)
df = df.replace('-',' ', regex=True)

df.head()

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,most_memorable_characteristics,rating
0,2454,5150,U.S.A.,2019,Madagascar,Bejofo Estate batch 1,76,3 B S C,cocoa blackberry full body,3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,Zorzal batch 1,76,3 B S C,cocoa vegetal savory,3.5
2,2454,5150,U.S.A.,2019,Tanzania,Kokoa Kamili batch 1,76,3 B S C,rich cocoa fatty bready,3.25
3,797,A. Morin,France,2012,Peru,Peru,63,4 B S C L,fruity melon roasty,3.75
4,797,A. Morin,France,2012,Bolivia,Bolivia,70,4 B S C L,vegetal nutty,3.5


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2362 entries, 0 to 2361
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ref                               2362 non-null   int64  
 1   company_(manufacturer)            2362 non-null   object 
 2   company_location                  2362 non-null   object 
 3   review_date                       2362 non-null   int64  
 4   country_of_bean_origin            2362 non-null   object 
 5   specific_bean_origin_or_bar_name  2362 non-null   object 
 6   cocoa_percent                     2362 non-null   object 
 7   ingredients                       2274 non-null   object 
 8   most_memorable_characteristics    2362 non-null   object 
 9   rating                            2362 non-null   float64
dtypes: float64(1), int64(2), object(7)
memory usage: 184.7+ KB


### Changing Number Columns into Floats ###

In [68]:
df["cocoa_percent"] = df.cocoa_percent.astype(float)
df["review_date"] = df.review_date.astype(float)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2362 entries, 0 to 2361
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   ref                               2362 non-null   int64  
 1   company_(manufacturer)            2362 non-null   object 
 2   company_location                  2362 non-null   object 
 3   review_date                       2362 non-null   float64
 4   country_of_bean_origin            2362 non-null   object 
 5   specific_bean_origin_or_bar_name  2362 non-null   object 
 6   cocoa_percent                     2362 non-null   float64
 7   ingredients                       2274 non-null   object 
 8   most_memorable_characteristics    2362 non-null   object 
 9   rating                            2362 non-null   float64
dtypes: float64(3), int64(1), object(6)
memory usage: 184.7+ KB


In [37]:
features = df.drop('rating', axis =1)
X = features
y = df.rating


In [38]:
#performing train test split for test set (subsets 1/3)
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=.15, random_state=42)


#performing a train test split for train and validation set (subsets - 3/3)
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size=.15, random_state=42) 

In [39]:
#ingredients have 88 rows that have no imputs.
df.isna().sum()

ref                                  0
company_(manufacturer)               0
company_location                     0
review_date                          0
country_of_bean_origin               0
specific_bean_origin_or_bar_name     0
cocoa_percent                        0
ingredients                         88
most_memorable_characteristics       0
rating                               0
dtype: int64

In [40]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1705 entries, 108 to 624
Data columns (total 9 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   ref                               1705 non-null   int64 
 1   company_(manufacturer)            1705 non-null   object
 2   company_location                  1705 non-null   object
 3   review_date                       1705 non-null   int64 
 4   country_of_bean_origin            1705 non-null   object
 5   specific_bean_origin_or_bar_name  1705 non-null   object
 6   cocoa_percent                     1705 non-null   object
 7   ingredients                       1636 non-null   object
 8   most_memorable_characteristics    1705 non-null   object
dtypes: int64(2), object(7)
memory usage: 133.2+ KB


In [41]:
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 34 to 1630
Data columns (total 9 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   ref                               302 non-null    int64 
 1   company_(manufacturer)            302 non-null    object
 2   company_location                  302 non-null    object
 3   review_date                       302 non-null    int64 
 4   country_of_bean_origin            302 non-null    object
 5   specific_bean_origin_or_bar_name  302 non-null    object
 6   cocoa_percent                     302 non-null    object
 7   ingredients                       292 non-null    object
 8   most_memorable_characteristics    302 non-null    object
dtypes: int64(2), object(7)
memory usage: 23.6+ KB


In [60]:
ohe = OneHotEncoder(sparse=False)