# Setup

In [143]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as stats

# Link to kaggle dataset: https://www.kaggle.com/uciml/mushroom-classification
data = pd.read_csv('C:/Users/keger/Documents/Thinkful_dataScience/supervisedLearning_randomForest/mushrooms.csv')

This dataset is about mushroom classification. For the purpose of this model, we want to predict whether or not the mushroom is poisonous or edible, represented by the class column.

# Exploratory Data Analysis

In [144]:
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [145]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
class                       8124 non-null object
cap-shape                   8124 non-null object
cap-surface                 8124 non-null object
cap-color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill-attachment             8124 non-null object
gill-spacing                8124 non-null object
gill-size                   8124 non-null object
gill-color                  8124 non-null object
stalk-shape                 8124 non-null object
stalk-root                  8124 non-null object
stalk-surface-above-ring    8124 non-null object
stalk-surface-below-ring    8124 non-null object
stalk-color-above-ring      8124 non-null object
stalk-color-below-ring      8124 non-null object
veil-type                   8124 non-null object
veil-color                  8124 non-null object
ring-number

Lucky for us, there are no null values in any of the columns, so we do not need to drop or interpolate any values. All of our columns are categorical however, so let's check to see how many unique values are in each column to ensure that there is not an abnormally large amount in one. 

In [146]:
for col in data.columns : 
    print(col, data[col].nunique())

class 2
cap-shape 6
cap-surface 4
cap-color 10
bruises 2
odor 9
gill-attachment 2
gill-spacing 2
gill-size 2
gill-color 12
stalk-shape 2
stalk-root 5
stalk-surface-above-ring 4
stalk-surface-below-ring 4
stalk-color-above-ring 9
stalk-color-below-ring 9
veil-type 1
veil-color 4
ring-number 3
ring-type 5
spore-print-color 9
population 6
habitat 7


So, we can see that the column with the largest number of unique variables in the gill-color at 12, so not too large of a number to handle. 

In [147]:
data['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

We can also see that the number of edible versus the nubmer of poisonous mushrooms is relatively similar, so we do not have to worry aboout not having enough data in one class versus the other. 

# Feature Engineering

In [148]:
data_dummies = pd.get_dummies(data, drop_first=True)
data_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 96 columns):
class_p                       8124 non-null uint8
cap-shape_c                   8124 non-null uint8
cap-shape_f                   8124 non-null uint8
cap-shape_k                   8124 non-null uint8
cap-shape_s                   8124 non-null uint8
cap-shape_x                   8124 non-null uint8
cap-surface_g                 8124 non-null uint8
cap-surface_s                 8124 non-null uint8
cap-surface_y                 8124 non-null uint8
cap-color_c                   8124 non-null uint8
cap-color_e                   8124 non-null uint8
cap-color_g                   8124 non-null uint8
cap-color_n                   8124 non-null uint8
cap-color_p                   8124 non-null uint8
cap-color_r                   8124 non-null uint8
cap-color_u                   8124 non-null uint8
cap-color_w                   8124 non-null uint8
cap-color_y                   8124 non-null

In [149]:
corr_mat = data_dummies.corr().abs()
s = corr_mat.unstack()
so = s.sort_values(kind='quicksort', ascending=False)
so_df = pd.DataFrame(so)
so_90_df = so_df.loc[so_df[0] > .9]
so_90_df = so_90_df.loc[so_df[0] < 1]
so_90_df.head(20)

Unnamed: 0,Unnamed: 1,0
stalk-color-above-ring_o,veil-color_w,0.979302
veil-color_w,stalk-color-below-ring_o,0.979302
veil-color_w,stalk-color-above-ring_o,0.979302
stalk-color-below-ring_o,veil-color_w,0.979302
ring-number_t,ring-number_o,0.968959
ring-number_o,ring-number_t,0.968959
stalk-color-above-ring_o,gill-attachment_f,0.955097
gill-attachment_f,stalk-color-above-ring_o,0.955097
stalk-color-below-ring_o,gill-attachment_f,0.955097
gill-attachment_f,stalk-color-below-ring_o,0.955097


So we can see that some of the columns are very highly correlated, so we don.t need to keep all of these highly correlated variables. 

In [150]:
drop_list = ['stalk-color-above-ring_o', 'stalk-color-below-ring_o', 'ring-number_o', 'gill-attachment_f']
data_dummies.drop(drop_list, axis=1, inplace=True)

In [152]:
data_dummies.corrwith(data_dummies['class_p']).sort_values(ascending=False).head(30)

class_p                       1.000000
odor_f                        0.623842
stalk-surface-above-ring_k    0.587658
stalk-surface-below-ring_k    0.573524
gill-size_n                   0.540024
spore-print-color_h           0.490229
ring-type_l                   0.451619
population_v                  0.443722
spore-print-color_w           0.357384
habitat_p                     0.323346
odor_s                        0.286360
odor_y                        0.286360
stalk-color-above-ring_n      0.233164
stalk-color-above-ring_p      0.230277
stalk-color-below-ring_p      0.230277
stalk-color-below-ring_n      0.203966
odor_p                        0.186984
cap-shape_k                   0.163565
odor_c                        0.161278
habitat_l                     0.155150
gill-color_h                  0.150694
veil-color_w                  0.140541
gill-color_g                  0.120285
cap-color_y                   0.113014
habitat_u                     0.112078
spore-print-color_r      

In [169]:
keep_list = ['odor_f', 'stalk-surface-above-ring_k', 'stalk-surface-below-ring_k', 'gill-size_n', 
            'spore-print-color_h', 'ring-type_l', 'population_v', 'spore-print-color_w', 'habitat_p',
            'odor_s', 'odor_y', 'stalk-color-above-ring_n', 'stalk-color-above-ring_p', 'stalk-color-below-ring_p', 
            'stalk-color-below-ring_n', 'odor_p', 'cap-shape_k', 'odor_c', 'habitat_l', 'gill-color_h', 
            'veil-color_w', 'gill-color_g', 'cap-color_y', 'habitat_u', 'spore-print-color_r', 'cap-color_e', 
            'cap-surface_s', 'cap-surface_y', 'ring-type_n']
mushroom_df = data_dummies[keep_list]
mushroom_df['class_p'] = data_dummies['class_p']
mushroom_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,odor_f,stalk-surface-above-ring_k,stalk-surface-below-ring_k,gill-size_n,spore-print-color_h,ring-type_l,population_v,spore-print-color_w,habitat_p,odor_s,...,veil-color_w,gill-color_g,cap-color_y,habitat_u,spore-print-color_r,cap-color_e,cap-surface_s,cap-surface_y,ring-type_n,class_p
0,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


# Model

In [190]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import cross_val_score
import time
start_time = time.time()
#main()


X = mushroom_df.drop('class_p', axis=1)
Y = mushroom_df['class_p']

rfc = RandomForestClassifier(max_depth=3, random_state=0)
rfc.fit(X,Y)
print(cross_val_score(rfc, X, Y, cv=10))

print("--- %s seconds ---" % (time.time() - start_time))



[0.6199262  0.84132841 0.97785978 0.96063961 0.81057811 0.80073801
 0.99876847 0.99630542 0.9161529  0.97533909]
--- 0.31415796279907227 seconds ---


In [197]:
from sklearn.tree import DecisionTreeClassifier

start_time = time.time()

dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X, Y)
print(cross_val_score(dtc, X, Y, cv=10))

print("--- %s seconds ---" % (time.time() - start_time))

[0.68511685 1.         1.         1.         1.         1.
 1.         1.         1.         1.        ]
--- 0.14062285423278809 seconds ---


So based on the 2 models, the Decision Tree Classifier is the faster. 