In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [2]:
# Libraries to import so I can find important features
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

In [3]:
foc_original = pd.read_csv('flavors_of_cacao.csv',
    skipinitialspace=True,
    header=0
)

In [4]:
foc_original.head()

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [5]:
# Rename column names
# df.rename(index=str, columns={"A": "a", "C": "c"})
foc_v1 = foc_original.rename(index=str, columns={'Company \n(Maker-if known)': 'company',
                            'Specific Bean Origin\nor Bar Name': 'specific_bean_origin', 'Review\nDate':'review_date',
                            'Cocoa\nPercent':'cocoa_percent','Company\nLocation':'company_loc','Rating':'rating',
                            'Bean\nType':'bean_type','Broad Bean\nOrigin':'broad_bean_origin'
                            })

In [6]:
# Change everything to numeric now
foc_v1['cocoa_percent'] = pd.to_numeric(foc_v1['cocoa_percent'].str.strip('%'), errors='coerce')

In [7]:
# Run the RF model again, it should work
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# The below is OK if Rating is a float

In [8]:
# This is the model we'll be using.
from sklearn import tree

# A convenience for displaying visualizations.
from IPython.display import Image

# Packages for rendering our tree.
import pydotplus
import graphviz

# Try the Regressor First

In [9]:
rfg = ensemble.RandomForestRegressor(max_depth = 10, n_estimators = 50, max_features = 'sqrt',
                                    min_samples_split=.01) #if a float, auto converts to %
# a list with two elements, rating and bean_origin
X = foc_v1.drop(['rating','specific_bean_origin'],1)  # drop rating from X
Y = foc_v1['rating']
X = pd.get_dummies(X) 

# Put a fit in here
rfg.fit(X,Y) # drop rating from X

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=0.01,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [10]:
print(rfg.score(X,Y))
cross_val_score(rfg, X, Y, cv=5)

0.16097021148234336


array([0.04275149, 0.07099904, 0.04108923, 0.02643539, 0.03661763])

# These values are terrible!

# Try Classifier next...

In [11]:
rfc = ensemble.RandomForestClassifier(max_depth = 10, n_estimators = 50, max_features = 'sqrt',
                                    min_samples_split=.01) #if a float, auto converts to %
# a list with two elements, rating and bean_origin
X = foc_v1.drop(['rating','specific_bean_origin'],1)  # drop rating from X
Y = foc_v1['rating']
# convert Y to strings
#Y = Y.astype(str)

# Try a Boolean Y value
Y = (foc_v1['rating'] >= 4)
X = pd.get_dummies(X) 

# Put a fit in here
rfc.fit(X,Y) # drop rating from X
print(rfc.score(X,Y))
cross_val_score(rfc, X, Y, cv=5)

0.9442896935933147


array([0.94428969, 0.94428969, 0.94428969, 0.94428969, 0.94428969])

In [12]:
# A heck of a lot better! Because the Regressor ratings look like this:
foc_v1['rating'].value_counts()

3.50    392
3.00    341
3.25    303
2.75    259
3.75    210
2.50    127
4.00     98
2.00     32
2.25     14
1.50     10
1.00      4
1.75      3
5.00      2
Name: rating, dtype: int64

In [13]:
# Try to make the best decision tree and the most simple random forest (max_depth = 2, max_features = 1)
# Initialize and train our tree.
decision_tree = tree.DecisionTreeClassifier(
    criterion='gini',
    max_features='auto',
    max_depth=None,
    random_state = None
)
decision_tree.fit(X, Y)
print(decision_tree.score(X,Y))
cross_val_score(decision_tree, X, Y, cv=5)

0.9983286908077994


array([0.8356546 , 0.92479109, 0.91922006, 0.92200557, 0.92479109])

In [14]:
feature_importances = pd.DataFrame(rfg.feature_importances_,
                                   index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)

                                                    importance
cocoa_percent                                         0.128389
REF                                                   0.056485
Company \n(Maker-if known)_Soma                       0.035773
Company \n(Maker-if known)_Cote d' Or (Kraft)         0.029431
review_date                                           0.029392
Company \n(Maker-if known)_Machu Picchu Trading...    0.028857
Company \n(Maker-if known)_Amedei                     0.028378
bean_type_                                            0.027732
Company \n(Maker-if known)_Idilio (Felchlin)          0.023770
broad_bean_origin_                                    0.022689
Company \n(Maker-if known)_Neuhaus (Callebaut)        0.022342
Company \n(Maker-if known)_Callebaut                  0.015641
company_loc_Italy                                     0.015215
company_loc_Belgium                                   0.013089
company_loc_U.K.                                      0