In [1]:
import pandas as pd

## Import the data sets and perform ETL to clean and combine them

In [2]:
# Load the North American mushroom records from a CSV in the working directory.

na_source_csv = "north_american_secondary_data_shuffled.csv"
NA_data = pd.read_csv(na_source_csv)
NA_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,[e],7.34,f,d,n,f,e,c,w,16.77,...,,,w,,,f,f,,g,s
1,[e],5.39,x,d,o,f,x,c,u,7.06,...,,,w,,,t,e,,d,s
2,[p],5.36,f,d,n,t,x,c,b,4.29,...,,,n,,,f,f,,d,a
3,[e],10.62,c,t,o,f,e,c,o,12.18,...,,,y,,,t,r,,d,u
4,[p],9.66,x,s,n,t,d,c,w,4.96,...,,,b,,,f,f,,d,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34589,[e],3.23,f,d,n,t,d,d,w,15.35,...,,,n,,,f,f,,d,a
34590,[e],2.85,f,s,p,f,f,f,f,2.57,...,,,p,,,f,f,,d,a
34591,[p],5.69,x,s,b,f,a,d,b,5.16,...,,,b,,,f,f,,d,w
34592,[e],6.25,x,s,y,f,d,d,y,4.01,...,,,y,,,f,f,,d,a


In [3]:
# ETL: decode the single-letter category codes of the North American data
# into readable labels, and turn the t/f flag columns into booleans.

# The three colour columns share one code palette; gill and stem colour
# additionally use 'f' to mean "none".
na_color_labels = {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black'}
na_color_or_none_labels = {**na_color_labels, 'f': 'none'}

# Nested dict form: {column: {code: label}}. Values that are not listed as a
# code are left unchanged by DataFrame.replace.
NA_data = NA_data.replace(
    {
    'class': {'[p]': 'na_poisonous', '[e]': 'na_edible'},
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 's': 'sunken', 'p': 'spherical', 'o': 'others'},
    'cap-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'd': 'dry', 'h': 'shiny', 'l': 'leathery', 'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy'},
    'cap-color': na_color_labels,
    'gill-attachment': {'a': 'adnate', 'x': 'adnexed', 'd': 'decurrent', 'e': 'free', 's': 'sinuate', 'p': 'pores', 'f': 'none', '?': 'unknown'},
    'gill-spacing': {'c': 'close', 'd': 'distant', 'f': 'none'},
    'gill-color': na_color_or_none_labels,
    'stem-color': na_color_or_none_labels,
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'r': 'flaring', 'g': 'grooved', 'l': 'large', 'p': 'pendant', 's': 'sheathing', 'z': 'zone', 'y': 'scaly', 'm': 'movable', 'f': 'none', '?': 'unknown'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'h': 'heaths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
    'season': {'s': 'spring', 'u': 'summer', 'a': 'autumn', 'w': 'winter'}
    })

# Convert t/f columns to boolean.
# Only convert while the column is not boolean yet: comparing an
# already-converted column with 't' would turn every value into False,
# so an unguarded conversion silently corrupts the flags when this cell is re-run.
for na_flag_column in ('does-bruise-or-bleed', 'has-ring'):
    if not pd.api.types.is_bool_dtype(NA_data[na_flag_column]):
        NA_data[na_flag_column] = NA_data[na_flag_column] == 't'

NA_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,na_edible,7.34,flat,dry,brown,False,free,close,white,16.77,...,,,white,,,False,none,,grasses,spring
1,na_edible,5.39,convex,dry,orange,False,adnexed,close,purple,7.06,...,,,white,,,True,evanescent,,woods,spring
2,na_poisonous,5.36,flat,dry,brown,True,adnexed,close,buff,4.29,...,,,brown,,,False,none,,woods,autumn
3,na_edible,10.62,conical,sticky,orange,False,free,close,orange,12.18,...,,,yellow,,,True,flaring,,woods,summer
4,na_poisonous,9.66,convex,smooth,brown,True,decurrent,close,white,4.96,...,,,buff,,,False,none,,woods,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34589,na_edible,3.23,flat,dry,brown,True,decurrent,distant,white,15.35,...,,,brown,,,False,none,,woods,autumn
34590,na_edible,2.85,flat,smooth,pink,False,none,none,none,2.57,...,,,pink,,,False,none,,woods,autumn
34591,na_poisonous,5.69,convex,smooth,buff,False,adnate,distant,buff,5.16,...,,,buff,,,False,none,,woods,winter
34592,na_edible,6.25,convex,smooth,yellow,False,decurrent,distant,yellow,4.01,...,,,yellow,,,False,none,,woods,autumn


In [4]:
# Drop unneeded columns.
# The frame is reassigned instead of mutated in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# KeyError.

na_unneeded_columns = ['stem-root', 'stem-surface', 'veil-type', 'veil-color', 'spore-print-color']
NA_data = NA_data.drop(columns=na_unneeded_columns, errors='ignore')
NA_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,na_edible,7.34,flat,dry,brown,False,free,close,white,16.77,15.82,white,False,none,grasses,spring
1,na_edible,5.39,convex,dry,orange,False,adnexed,close,purple,7.06,6.89,white,True,evanescent,woods,spring
2,na_poisonous,5.36,flat,dry,brown,True,adnexed,close,buff,4.29,11.90,brown,False,none,woods,autumn
3,na_edible,10.62,conical,sticky,orange,False,free,close,orange,12.18,12.25,yellow,True,flaring,woods,summer
4,na_poisonous,9.66,convex,smooth,brown,True,decurrent,close,white,4.96,21.26,buff,False,none,woods,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34589,na_edible,3.23,flat,dry,brown,True,decurrent,distant,white,15.35,13.08,brown,False,none,woods,autumn
34590,na_edible,2.85,flat,smooth,pink,False,none,none,none,2.57,10.61,pink,False,none,woods,autumn
34591,na_poisonous,5.69,convex,smooth,buff,False,adnate,distant,buff,5.16,4.77,buff,False,none,woods,winter
34592,na_edible,6.25,convex,smooth,yellow,False,decurrent,distant,yellow,4.01,14.04,yellow,False,none,woods,autumn


In [5]:
# Load the EU mushroom data: a semicolon-delimited CSV one directory up.

eu_source_csv = '../secondary_data_no_miss.csv'
EU_data = pd.read_csv(eu_source_csv, sep=';')
EU_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,e,1.26,x,g,y,f,d,c,w,5.04,1.73,y,f,f,d,a
1,e,10.32,f,e,b,f,a,c,b,4.68,19.44,w,t,f,d,a
2,p,0.92,x,g,p,f,a,c,p,4.59,1.15,k,f,f,d,u
3,p,4.27,x,t,p,f,x,c,w,4.55,6.52,w,f,f,d,a
4,e,3.08,f,s,w,f,d,d,w,2.67,5.18,w,f,f,m,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,12.79,x,e,n,t,p,c,e,9.60,25.61,y,f,f,d,u
61065,p,2.42,x,d,w,f,a,d,p,3.52,2.20,w,f,f,g,u
61066,e,12.33,s,t,u,f,s,c,u,7.71,21.99,u,f,f,d,a
61067,p,3.85,s,w,u,f,a,c,u,5.32,5.59,u,f,f,l,a


In [6]:
# ETL: decode the single-letter category codes of the EU data into readable
# labels, and turn the t/f flag columns into booleans.

# The three colour columns share one code palette; gill and stem colour
# additionally use 'f' to mean "none".
eu_color_labels = {'n': 'brown', 'b': 'buff', 'g': 'gray', 'r': 'green', 'p': 'pink', 'u': 'purple', 'e': 'red', 'w': 'white', 'y': 'yellow', 'l': 'blue', 'o': 'orange', 'k': 'black'}
eu_color_or_none_labels = {**eu_color_labels, 'f': 'none'}

# Nested dict form: {column: {code: label}}. Values that are not listed as a
# code are left unchanged by DataFrame.replace.
EU_data = EU_data.replace(
    {
    'class': {'p': 'eu_poisonous', 'e': 'eu_edible'},
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 's': 'sunken', 'p': 'spherical', 'o': 'others'},
    'cap-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'd': 'dry', 'h': 'shiny', 'l': 'leathery', 'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy'},
    'cap-color': eu_color_labels,
    'gill-attachment': {'a': 'adnate', 'x': 'adnexed', 'd': 'decurrent', 'e': 'free', 's': 'sinuate', 'p': 'pores', 'f': 'none', '?': 'unknown'},
    'gill-spacing': {'c': 'close', 'd': 'distant', 'f': 'none'},
    'gill-color': eu_color_or_none_labels,
    'stem-color': eu_color_or_none_labels,
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'r': 'flaring', 'g': 'grooved', 'l': 'large', 'p': 'pendant', 's': 'sheathing', 'z': 'zone', 'y': 'scaly', 'm': 'movable', 'f': 'none', '?': 'unknown'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'h': 'heaths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
    'season': {'s': 'spring', 'u': 'summer', 'a': 'autumn', 'w': 'winter'}
    })

# Convert t/f columns to boolean.
# Only convert while the column is not boolean yet: comparing an
# already-converted column with 't' would turn every value into False,
# so an unguarded conversion silently corrupts the flags when this cell is re-run.
for eu_flag_column in ('does-bruise-or-bleed', 'has-ring'):
    if not pd.api.types.is_bool_dtype(EU_data[eu_flag_column]):
        EU_data[eu_flag_column] = EU_data[eu_flag_column] == 't'

EU_data

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,eu_edible,1.26,convex,grooves,yellow,False,decurrent,close,white,5.04,1.73,yellow,False,none,woods,autumn
1,eu_edible,10.32,flat,fleshy,buff,False,adnate,close,buff,4.68,19.44,white,True,none,woods,autumn
2,eu_poisonous,0.92,convex,grooves,pink,False,adnate,close,pink,4.59,1.15,black,False,none,woods,summer
3,eu_poisonous,4.27,convex,sticky,pink,False,adnexed,close,white,4.55,6.52,white,False,none,woods,autumn
4,eu_edible,3.08,flat,smooth,white,False,decurrent,distant,white,2.67,5.18,white,False,none,meadows,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,eu_poisonous,12.79,convex,fleshy,brown,True,pores,close,red,9.60,25.61,yellow,False,none,woods,summer
61065,eu_poisonous,2.42,convex,dry,white,False,adnate,distant,pink,3.52,2.20,white,False,none,grasses,summer
61066,eu_edible,12.33,sunken,sticky,purple,False,sinuate,close,purple,7.71,21.99,purple,False,none,woods,autumn
61067,eu_poisonous,3.85,sunken,wrinkled,purple,False,adnate,close,purple,5.32,5.59,purple,False,none,leaves,autumn


In [7]:
# Combine data sets: EU rows first, then NA rows, renumbered with a fresh
# 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.

combined_data = pd.concat([EU_data, NA_data], ignore_index=True)

In [8]:
# Export data to csv so it can be shuffled.
# The file is semicolon-delimited and, because to_csv writes the index by
# default, its first column is the row index of combined_data.
# (The stray leading space before this statement was removed; at top level it
# is an "unexpected indent" error.)
combined_data.to_csv("four_output_combined_data.csv", sep = ";")

In [10]:
# Load the shuffled version of the combined data set.

shuffled_csv = "four_output_shuffled_data.csv"
shuffled_combined = pd.read_csv(shuffled_csv)
shuffled_combined

Unnamed: 0.1,Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,65139,na_poisonous,5.78,flat,sticky,red,True,adnexed,close,red,5.33,14.16,white,False,none,woods,autumn
1,60522,eu_poisonous,13.53,convex,silky,brown,True,pores,close,yellow,9.17,31.69,yellow,False,none,woods,summer
2,32876,eu_poisonous,3.74,bell,sticky,blue,False,sinuate,close,brown,6.39,7.67,white,True,evanescent,meadows,autumn
3,93704,na_edible,4.71,others,scaly,gray,False,free,close,black,7.84,4.82,white,True,zone,woods,autumn
4,34324,eu_edible,8.31,convex,shiny,brown,True,pores,close,brown,6.63,17.59,brown,True,pendant,woods,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95658,21440,eu_poisonous,6.14,flat,dry,brown,True,decurrent,close,white,6.12,9.52,red,False,none,woods,summer
95659,73349,na_edible,10.13,convex,fleshy,brown,True,decurrent,close,white,7.41,14.41,buff,False,none,woods,autumn
95660,50057,eu_edible,6.88,flat,shiny,orange,False,sinuate,close,yellow,9.68,12.55,brown,False,none,woods,autumn
95661,5192,eu_poisonous,6.58,convex,scaly,white,True,adnate,close,gray,7.47,13.66,white,True,large,woods,autumn


In [11]:
# Remove old index.
# The frame is reassigned instead of mutated in place, and errors='ignore'
# skips the column if it is already gone, so re-running this cell does not
# raise KeyError.

shuffled_combined = shuffled_combined.drop(columns=['Unnamed: 0'], errors='ignore')
shuffled_combined.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,na_poisonous,5.78,flat,sticky,red,True,adnexed,close,red,5.33,14.16,white,False,none,woods,autumn
1,eu_poisonous,13.53,convex,silky,brown,True,pores,close,yellow,9.17,31.69,yellow,False,none,woods,summer
2,eu_poisonous,3.74,bell,sticky,blue,False,sinuate,close,brown,6.39,7.67,white,True,evanescent,meadows,autumn
3,na_edible,4.71,others,scaly,gray,False,free,close,black,7.84,4.82,white,True,zone,woods,autumn
4,eu_edible,8.31,convex,shiny,brown,True,pores,close,brown,6.63,17.59,brown,True,pendant,woods,autumn


## Set up data for Random Forest model