In [1]:
import pandas as pd

## Import data sets and perform ETL to clean and combine

In [2]:
# Import North American mushroom data

NA_data = pd.read_csv("north_american_secondary_data_shuffled.csv")
NA_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,[e],7.34,f,d,n,f,e,c,w,16.77,...,,,w,,,f,f,,g,s
1,[e],5.39,x,d,o,f,x,c,u,7.06,...,,,w,,,t,e,,d,s
2,[p],5.36,f,d,n,t,x,c,b,4.29,...,,,n,,,f,f,,d,a
3,[e],10.62,c,t,o,f,e,c,o,12.18,...,,,y,,,t,r,,d,u
4,[p],9.66,x,s,n,t,d,c,w,4.96,...,,,b,,,f,f,,d,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34589,[e],3.23,f,d,n,t,d,d,w,15.35,...,,,n,,,f,f,,d,a
34590,[e],2.85,f,s,p,f,f,f,f,2.57,...,,,p,,,f,f,,d,a
34591,[p],5.69,x,s,b,f,a,d,b,5.16,...,,,b,,,f,f,,d,w
34592,[e],6.25,x,s,y,f,d,d,y,4.01,...,,,y,,,f,f,,d,a


In [3]:
#ETL

NA_data = NA_data.replace(
    {
    'class': {'[p]': 'na_poisonous', '[e]': 'na_edible'},
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 's': 'sunken', 'p': 'spherical', 'o': 'others'},
    'cap-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'd': 'dry', 'h': 'shiny', 'l': 'leathery', 'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy'},
    'cap-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black'},
    'gill-attachment': {'a': 'adnate', 'x': 'adnexed', 'd': 'decurrent', 'e': 'free', 's': 'sinuate', 'p': 'pores', 'f': 'none', '?': 'unknown'},
    'gill-spacing': {'c': 'close', 'd': 'distant', 'f': 'none'},
    'gill-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', 'f': 'none'},
    'stem-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', 'f': 'none'},
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'r': 'flaring', 'g': 'grooved', 'l': 'large', 'p': 'pendant', 's': 'sheathing', 'z': 'zone', 'y': 'scaly', 'm': 'movable', 'f': 'none', '?': 'unknown'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'h': 'heaths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
    'season': {'s': 'spring', 'u': 'summer', 'a': 'autumn', 'w': 'winter'}
    })

# convert t/f columns to boolean.
NA_data['does-bruise-or-bleed'] = NA_data['does-bruise-or-bleed']=='t'
NA_data['has-ring'] = NA_data['has-ring']=='t'

NA_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,na_edible,7.34,flat,dry,brown,False,free,close,white,16.77,...,,,white,,,False,none,,grasses,spring
1,na_edible,5.39,convex,dry,orange,False,adnexed,close,purple,7.06,...,,,white,,,True,evanescent,,woods,spring
2,na_poisonous,5.36,flat,dry,brown,True,adnexed,close,buff,4.29,...,,,brown,,,False,none,,woods,autumn
3,na_edible,10.62,conical,sticky,orange,False,free,close,orange,12.18,...,,,yellow,,,True,flaring,,woods,summer
4,na_poisonous,9.66,convex,smooth,brown,True,decurrent,close,white,4.96,...,,,buff,,,False,none,,woods,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34589,na_edible,3.23,flat,dry,brown,True,decurrent,distant,white,15.35,...,,,brown,,,False,none,,woods,autumn
34590,na_edible,2.85,flat,smooth,pink,False,none,none,none,2.57,...,,,pink,,,False,none,,woods,autumn
34591,na_poisonous,5.69,convex,smooth,buff,False,adnate,distant,buff,5.16,...,,,buff,,,False,none,,woods,winter
34592,na_edible,6.25,convex,smooth,yellow,False,decurrent,distant,yellow,4.01,...,,,yellow,,,False,none,,woods,autumn


In [4]:
# Drop unneeded columns

NA_data.drop(columns=['stem-root', 'stem-surface', 'veil-type', 'veil-color', 'spore-print-color'], inplace=True)
NA_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,na_edible,7.34,flat,dry,brown,False,free,close,white,16.77,15.82,white,False,none,grasses,spring
1,na_edible,5.39,convex,dry,orange,False,adnexed,close,purple,7.06,6.89,white,True,evanescent,woods,spring
2,na_poisonous,5.36,flat,dry,brown,True,adnexed,close,buff,4.29,11.90,brown,False,none,woods,autumn
3,na_edible,10.62,conical,sticky,orange,False,free,close,orange,12.18,12.25,yellow,True,flaring,woods,summer
4,na_poisonous,9.66,convex,smooth,brown,True,decurrent,close,white,4.96,21.26,buff,False,none,woods,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34589,na_edible,3.23,flat,dry,brown,True,decurrent,distant,white,15.35,13.08,brown,False,none,woods,autumn
34590,na_edible,2.85,flat,smooth,pink,False,none,none,none,2.57,10.61,pink,False,none,woods,autumn
34591,na_poisonous,5.69,convex,smooth,buff,False,adnate,distant,buff,5.16,4.77,buff,False,none,woods,winter
34592,na_edible,6.25,convex,smooth,yellow,False,decurrent,distant,yellow,4.01,14.04,yellow,False,none,woods,autumn


In [5]:
#Import EU data

EU_data = pd.read_csv('../secondary_data_no_miss.csv', sep = ';')
EU_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,e,1.26,x,g,y,f,d,c,w,5.04,1.73,y,f,f,d,a
1,e,10.32,f,e,b,f,a,c,b,4.68,19.44,w,t,f,d,a
2,p,0.92,x,g,p,f,a,c,p,4.59,1.15,k,f,f,d,u
3,p,4.27,x,t,p,f,x,c,w,4.55,6.52,w,f,f,d,a
4,e,3.08,f,s,w,f,d,d,w,2.67,5.18,w,f,f,m,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,12.79,x,e,n,t,p,c,e,9.60,25.61,y,f,f,d,u
61065,p,2.42,x,d,w,f,a,d,p,3.52,2.20,w,f,f,g,u
61066,e,12.33,s,t,u,f,s,c,u,7.71,21.99,u,f,f,d,a
61067,p,3.85,s,w,u,f,a,c,u,5.32,5.59,u,f,f,l,a


In [6]:
#ETL

EU_data = EU_data.replace(
    {
    'class': {'p': 'eu_poisonous', 'e': 'eu_edible'},
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 's': 'sunken', 'p': 'spherical', 'o': 'others'},
    'cap-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'd': 'dry', 'h': 'shiny', 'l': 'leathery', 'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy'},
    'cap-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black'},
    'gill-attachment': {'a': 'adnate', 'x': 'adnexed', 'd': 'decurrent', 'e': 'free', 's': 'sinuate', 'p': 'pores', 'f': 'none', '?': 'unknown'},
    'gill-spacing': {'c': 'close', 'd': 'distant', 'f': 'none'},
    'gill-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', 'f': 'none'},
    'stem-color': {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black', 'f': 'none'},
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'r': 'flaring', 'g': 'grooved', 'l': 'large', 'p': 'pendant', 's': 'sheathing', 'z': 'zone', 'y': 'scaly', 'm': 'movable', 'f': 'none', '?': 'unknown'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'h': 'heaths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
    'season': {'s': 'spring', 'u': 'summer', 'a': 'autumn', 'w': 'winter'}
    })

# convert t/f columns to boolean.
EU_data['does-bruise-or-bleed'] = EU_data['does-bruise-or-bleed']=='t'
EU_data['has-ring'] = EU_data['has-ring']=='t'

EU_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,eu_edible,1.26,convex,grooves,yellow,False,decurrent,close,white,5.04,1.73,yellow,False,none,woods,autumn
1,eu_edible,10.32,flat,fleshy,buff,False,adnate,close,buff,4.68,19.44,white,True,none,woods,autumn
2,eu_poisonous,0.92,convex,grooves,pink,False,adnate,close,pink,4.59,1.15,black,False,none,woods,summer
3,eu_poisonous,4.27,convex,sticky,pink,False,adnexed,close,white,4.55,6.52,white,False,none,woods,autumn
4,eu_edible,3.08,flat,smooth,white,False,decurrent,distant,white,2.67,5.18,white,False,none,meadows,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,eu_poisonous,12.79,convex,fleshy,brown,True,pores,close,red,9.60,25.61,yellow,False,none,woods,summer
61065,eu_poisonous,2.42,convex,dry,white,False,adnate,distant,pink,3.52,2.20,white,False,none,grasses,summer
61066,eu_edible,12.33,sunken,sticky,purple,False,sinuate,close,purple,7.71,21.99,purple,False,none,woods,autumn
61067,eu_poisonous,3.85,sunken,wrinkled,purple,False,adnate,close,purple,5.32,5.59,purple,False,none,leaves,autumn


In [7]:
# Combine data sets

combined_data = EU_data.append(NA_data, ignore_index=True)

In [8]:
# Export data to csv so it can be shuffled
 combined_data.to_csv("four_output_combined_data.csv", sep = ";")

In [9]:
# Import shuffled data

shuffled_combined = pd.read_csv("four_output_shuffled_data.csv")
shuffled_combined

Unnamed: 0.1,Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,65139,na_poisonous,5.78,flat,sticky,red,True,adnexed,close,red,5.33,14.16,white,False,none,woods,autumn
1,60522,eu_poisonous,13.53,convex,silky,brown,True,pores,close,yellow,9.17,31.69,yellow,False,none,woods,summer
2,32876,eu_poisonous,3.74,bell,sticky,blue,False,sinuate,close,brown,6.39,7.67,white,True,evanescent,meadows,autumn
3,93704,na_edible,4.71,others,scaly,gray,False,free,close,black,7.84,4.82,white,True,zone,woods,autumn
4,34324,eu_edible,8.31,convex,shiny,brown,True,pores,close,brown,6.63,17.59,brown,True,pendant,woods,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95658,21440,eu_poisonous,6.14,flat,dry,brown,True,decurrent,close,white,6.12,9.52,red,False,none,woods,summer
95659,73349,na_edible,10.13,convex,fleshy,brown,True,decurrent,close,white,7.41,14.41,buff,False,none,woods,autumn
95660,50057,eu_edible,6.88,flat,shiny,orange,False,sinuate,close,yellow,9.68,12.55,brown,False,none,woods,autumn
95661,5192,eu_poisonous,6.58,convex,scaly,white,True,adnate,close,gray,7.47,13.66,white,True,large,woods,autumn


In [10]:
# Remove old index

shuffled_combined.drop(columns=['Unnamed: 0'], inplace=True)
shuffled_combined.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,na_poisonous,5.78,flat,sticky,red,True,adnexed,close,red,5.33,14.16,white,False,none,woods,autumn
1,eu_poisonous,13.53,convex,silky,brown,True,pores,close,yellow,9.17,31.69,yellow,False,none,woods,summer
2,eu_poisonous,3.74,bell,sticky,blue,False,sinuate,close,brown,6.39,7.67,white,True,evanescent,meadows,autumn
3,na_edible,4.71,others,scaly,gray,False,free,close,black,7.84,4.82,white,True,zone,woods,autumn
4,eu_edible,8.31,convex,shiny,brown,True,pores,close,brown,6.63,17.59,brown,True,pendant,woods,autumn


## Set up data for Random Forest model

In [11]:
shuffled_combined_encoded = pd.get_dummies(data=shuffled_combined, columns=[
                                                'cap-surface',
                                                'cap-color',
                                                'does-bruise-or-bleed',
                                                'gill-attachment',
                                                'gill-spacing',
                                                'gill-color',
                                                'cap-shape',
                                                'stem-color',
                                                'has-ring',
                                                'ring-type',
                                                'habitat',
                                                'season'])
shuffled_combined_encoded

Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-surface_dry,cap-surface_fibrous,cap-surface_fleshy,cap-surface_grooves,cap-surface_leathery,cap-surface_scaly,...,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods,season_autumn,season_spring,season_summer,season_winter
0,na_poisonous,5.78,5.33,14.16,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,eu_poisonous,13.53,9.17,31.69,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,eu_poisonous,3.74,6.39,7.67,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,na_edible,4.71,7.84,4.82,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
4,eu_edible,8.31,6.63,17.59,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95658,eu_poisonous,6.14,6.12,9.52,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
95659,na_edible,10.13,7.41,14.41,0,0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
95660,eu_edible,6.88,9.68,12.55,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
95661,eu_poisonous,6.58,7.47,13.66,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0


In [12]:
from sklearn.preprocessing import LabelEncoder
  
le = LabelEncoder()
shuffled_combined_encoded['class']= le.fit_transform(shuffled_combined_encoded['class'])

In [13]:
shuffled_combined_encoded

Unnamed: 0,class,cap-diameter,stem-height,stem-width,cap-surface_dry,cap-surface_fibrous,cap-surface_fleshy,cap-surface_grooves,cap-surface_leathery,cap-surface_scaly,...,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods,season_autumn,season_spring,season_summer,season_winter
0,3,5.78,5.33,14.16,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,1,13.53,9.17,31.69,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,1,3.74,6.39,7.67,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,2,4.71,7.84,4.82,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
4,0,8.31,6.63,17.59,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95658,1,6.14,6.12,9.52,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
95659,2,10.13,7.41,14.41,0,0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0
95660,0,6.88,9.68,12.55,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
95661,1,6.58,7.47,13.66,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0


In [14]:
y = shuffled_combined_encoded["class"]
target_names = ["eu_edible", "eu_poisonous", "na_edible", "na_poisonous"]

X = shuffled_combined_encoded.drop("class", axis=1)
X.head()

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-surface_dry,cap-surface_fibrous,cap-surface_fleshy,cap-surface_grooves,cap-surface_leathery,cap-surface_scaly,cap-surface_shiny,...,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods,season_autumn,season_spring,season_summer,season_winter
0,5.78,5.33,14.16,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,13.53,9.17,31.69,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,3.74,6.39,7.67,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,4.71,7.84,4.82,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
4,8.31,6.63,17.59,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0


In [15]:
#Import RF dependencies

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
# Scale data

X_scaler = MinMaxScaler().fit(X_train)
#y_scaler = MinMaxScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
#y_train_scaled = y_scaler.transform(y_train)
#y_test_scaled = y_scaler.transform(y_test)

In [18]:
# Run Random Forests

rf = RandomForestClassifier(n_estimators=50)
rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

0.9990383007191838

In [19]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.09305043761731925, 'stem-width'),
 (0.06747565164329612, 'stem-height'),
 (0.0651689968908607, 'cap-diameter'),
 (0.023439251804376546, 'stem-color_white'),
 (0.022794721807692195, 'gill-attachment_pores'),
 (0.02174156842518752, 'gill-color_white'),
 (0.021702710606762213, 'gill-attachment_adnate'),
 (0.020598216392712712, 'gill-spacing_close'),
 (0.019070646429525266, 'cap-surface_sticky'),
 (0.01900722566902756, 'gill-attachment_decurrent'),
 (0.018922617198090866, 'gill-spacing_distant'),
 (0.017468673403166737, 'does-bruise-or-bleed_True'),
 (0.017263654463511673, 'cap-surface_smooth'),
 (0.016670554921802395, 'gill-attachment_free'),
 (0.01653704250258093, 'stem-color_brown'),
 (0.01646164355140553, 'cap-shape_convex'),
 (0.01590473619158693, 'habitat_woods'),
 (0.015819053367187218, 'cap-surface_dry'),
 (0.014572364650848626, 'cap-color_brown'),
 (0.014428156007825172, 'does-bruise-or-bleed_False'),
 (0.01399207690255602, 'has-ring_True'),
 (0.013706539312971902, 'gill-color

In [20]:
from sklearn.metrics import mean_squared_error

predictions = rf.predict(X_test_scaled)
MSE = mean_squared_error(y_test, predictions)
r2 = rf.score(X_test_scaled, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.002508780732563974, R2: 0.9990383007191838


## Set up data to run neural network

In [21]:
# Check data
print(y)

0        3
1        1
2        1
3        2
4        0
        ..
95658    1
95659    2
95660    0
95661    1
95662    3
Name: class, Length: 95663, dtype: int32


In [22]:
X

Unnamed: 0,cap-diameter,stem-height,stem-width,cap-surface_dry,cap-surface_fibrous,cap-surface_fleshy,cap-surface_grooves,cap-surface_leathery,cap-surface_scaly,cap-surface_shiny,...,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods,season_autumn,season_spring,season_summer,season_winter
0,5.78,5.33,14.16,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,13.53,9.17,31.69,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,3.74,6.39,7.67,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,4.71,7.84,4.82,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
4,8.31,6.63,17.59,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95658,6.14,6.12,9.52,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
95659,10.13,7.41,14.41,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
95660,6.88,9.68,12.55,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,0
95661,6.58,7.47,13.66,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0


In [23]:
# Import dependencies

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [24]:
# Train / test split

X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X, y, random_state=42)

In [25]:
# Scale X

X_scaler_nn = MinMaxScaler().fit(X_train_nn)
X_train_scaled_nn = X_scaler_nn.transform(X_train_nn)
X_test_scaled_nn = X_scaler_nn.transform(X_test_nn)

In [26]:
# One Hot Encode y

y_train_categorical = to_categorical(y_train_nn)
y_test_categorical = to_categorical(y_test_nn)

In [27]:
# Create the Neural Network with a 6/4 distribution

# Make a sequential model
model = Sequential()
# Add the first hidden layer with 6 nodes
model.add(Dense(units=6, activation='relu', input_dim=92))
# Add the output layer with 2 outputs
model.add(Dense(units=4, activation='softmax'))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [28]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 558       
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 28        
Total params: 586
Trainable params: 586
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [30]:
# Fit the model to the training data
model.fit(
    X_train_scaled_nn,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
71747/71747 - 2s - loss: 0.9089 - acc: 0.6196
Epoch 2/100
71747/71747 - 2s - loss: 0.7109 - acc: 0.7127
Epoch 3/100
71747/71747 - 2s - loss: 0.6492 - acc: 0.7413
Epoch 4/100
71747/71747 - 2s - loss: 0.6025 - acc: 0.7634
Epoch 5/100
71747/71747 - 2s - loss: 0.5729 - acc: 0.7763
Epoch 6/100
71747/71747 - 2s - loss: 0.5537 - acc: 0.7851
Epoch 7/100
71747/71747 - 2s - loss: 0.5403 - acc: 0.7930
Epoch 8/100
71747/71747 - 2s - loss: 0.5297 - acc: 0.7975
Epoch 9/100
71747/71747 - 2s - loss: 0.5215 - acc: 0.8019
Epoch 10/100
71747/71747 - 2s - loss: 0.5160 - acc: 0.8050
Epoch 11/100
71747/71747 - 2s - loss: 0.5105 - acc: 0.8052
Epoch 12/100
71747/71747 - 2s - loss: 0.5061 - acc: 0.8065
Epoch 13/100
71747/71747 - 2s - loss: 0.5025 - acc: 0.8080
Epoch 14/100
71747/71747 - 2s - loss: 0.4993 - acc: 0.8081
Epoch 15/100
71747/71747 - 2s - loss: 0.4964 - acc: 0.8086
Epoch 16/100
71747/71747 - 2s - loss: 0.4930 - acc: 0.8118
Epoch 17/100
71747/71747 - 2s - loss: 0.4902 - acc: 0.8125
Epoch 

<tensorflow.python.keras.callbacks.History at 0x15cdba589e8>

In [31]:
# Ading another layer to try to improve model
# Create the Neural Network with a 6/6/4 distribution

# Make a sequential model
model_2 = Sequential()
# Add the first hidden layer with 6 nodes
model_2.add(Dense(units=6, activation='relu', input_dim=92))
# Add the second hidden layer
model_2.add(Dense(units=6, activation='relu', input_dim=6))
# Add the output layer with 2 outputs
model_2.add(Dense(units=4, activation='softmax'))

In [32]:
# Compile the model
model_2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [33]:
# Fit the model to the training data
model_2.fit(
    X_train_scaled_nn,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
71747/71747 - 3s - loss: 0.9022 - acc: 0.6040
Epoch 2/100
71747/71747 - 3s - loss: 0.6648 - acc: 0.7190
Epoch 3/100
71747/71747 - 3s - loss: 0.6031 - acc: 0.7497
Epoch 4/100
71747/71747 - 3s - loss: 0.5628 - acc: 0.7701
Epoch 5/100
71747/71747 - 3s - loss: 0.5286 - acc: 0.7874
Epoch 6/100
71747/71747 - 3s - loss: 0.5023 - acc: 0.8018
Epoch 7/100
71747/71747 - 3s - loss: 0.4851 - acc: 0.8090
Epoch 8/100
71747/71747 - 3s - loss: 0.4711 - acc: 0.8167
Epoch 9/100
71747/71747 - 3s - loss: 0.4605 - acc: 0.8215
Epoch 10/100
71747/71747 - 3s - loss: 0.4510 - acc: 0.8256
Epoch 11/100
71747/71747 - 3s - loss: 0.4434 - acc: 0.8284
Epoch 12/100
71747/71747 - 3s - loss: 0.4363 - acc: 0.8314
Epoch 13/100
71747/71747 - 3s - loss: 0.4295 - acc: 0.8315
Epoch 14/100
71747/71747 - 3s - loss: 0.4254 - acc: 0.8343
Epoch 15/100
71747/71747 - 3s - loss: 0.4200 - acc: 0.8348
Epoch 16/100
71747/71747 - 3s - loss: 0.4165 - acc: 0.8360
Epoch 17/100
71747/71747 - 3s - loss: 0.4129 - acc: 0.8373
Epoch 

<tensorflow.python.keras.callbacks.History at 0x15cdbdab1d0>

In [34]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled_nn, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

23916/23916 - 0s - loss: 0.4338 - acc: 0.8312
Normal Neural Network - Loss: 0.4338340621904538, Accuracy: 0.8311590552330017


In [36]:
model_loss, model_accuracy = model_2.evaluate(
    X_test_scaled_nn, y_test_categorical, verbose=2)
print(
    f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

23916/23916 - 0s - loss: 0.3471 - acc: 0.8704
Deep Neural Network - Loss: 0.3470588101354166, Accuracy: 0.8703796863555908


In [50]:
# Expand test
import numpy as np
test = np.expand_dims(X_test_scaled_nn[2], axis=0)
test.shape

(1, 92)

In [51]:
# Make a prediction
print(f"Predicted class: {model.predict_classes(test)}, {model_2.predict_classes(test)}")

Predicted class: [3], [3]


In [52]:
X_test_scaled_nn[2]

array([0.03917759, 0.00279408, 0.07083414, 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.     

In [53]:
y_test_categorical

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]], dtype=float32)

In [56]:
# Pickle and export models for website

import pickle
from pickle import dump

In [57]:
# Pickle scalers

dump(X_scaler, open('four_output_rf_scaler.pkl', 'wb'))
dump(X_scaler_nn, open('four_outpul_nn_scaler.pkl', 'wb'))

In [58]:
# Pickle Random Forests model

dump(rf, open('four_output_rf_model.pkl', 'wb'))

In [59]:
# Export deep neural network

model_2.save("four_output_deepnn_model.h5")