In [1]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
from joblib import dump, load

In [2]:
# Read the raw mushroom table; the path is resolved relative to the working directory.
mushrooms = pd.read_csv(filepath_or_buffer='mushrooms.csv')
# Preview the first five rows (the default for head()).
mushrooms.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Show every column label in the raw table so the feature subset can be chosen below.
mushrooms.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [4]:
# Keep the target column ('class') plus six categorical attributes used as features.
df = mushrooms[
    [
        'class',
        'cap-shape',
        'cap-surface',
        'cap-color',
        'stalk-shape',
        'ring-number',
        'habitat',
    ]
]
# Preview the first five rows of the reduced table.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,stalk-shape,ring-number,habitat
0,p,x,s,n,e,o,u
1,e,x,s,y,e,o,g
2,e,b,s,w,e,o,m
3,p,x,y,w,e,o,u
4,e,x,s,g,t,o,g


In [5]:
# One-hot encode every categorical column (including the target) into 0/1 indicator
# columns named '<column>_<value>'.
# The dtype is pinned to uint8 so the encoded values stay integer 0/1 — as in the
# recorded outputs of this notebook — instead of following the installed pandas
# version's default (newer releases default to bool).
df_encoded = pd.get_dummies(df, dtype='uint8')
# Preview the first five encoded rows.
df_encoded.head()

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,ring-number_n,ring-number_o,ring-number_t,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0


In [6]:
# List the indicator columns produced by the one-hot encoding (target columns included).
df_encoded.columns

Index(['class_e', 'class_p', 'cap-shape_b', 'cap-shape_c', 'cap-shape_f',
       'cap-shape_k', 'cap-shape_s', 'cap-shape_x', 'cap-surface_f',
       'cap-surface_g', 'cap-surface_s', 'cap-surface_y', 'cap-color_b',
       'cap-color_c', 'cap-color_e', 'cap-color_g', 'cap-color_n',
       'cap-color_p', 'cap-color_r', 'cap-color_u', 'cap-color_w',
       'cap-color_y', 'stalk-shape_e', 'stalk-shape_t', 'ring-number_n',
       'ring-number_o', 'ring-number_t', 'habitat_d', 'habitat_g', 'habitat_l',
       'habitat_m', 'habitat_p', 'habitat_u', 'habitat_w'],
      dtype='object')

In [7]:
# Feature matrix: every indicator column except the two derived from the target.
X = df_encoded.drop(columns=['class_e', 'class_p'])
# Preview the first five feature rows.
X.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,ring-number_n,ring-number_o,ring-number_t,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0


In [8]:
# Target vector: 1 where the original 'class' value is 'e', 0 where it is 'p'.
y = df_encoded.loc[:, 'class_e']
# Preview the first five labels.
y.head()

0    0
1    1
2    1
3    0
4    1
Name: class_e, dtype: uint8

In [9]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [10]:
# Support vector classifier with an RBF kernel, C=1 and gamma=1.
# NOTE(review): scikit-learn documents `degree` as used only by the 'poly' kernel and
# `decision_function_shape` as ignored for binary classification, so both look like
# no-ops for this model — confirm before relying on them.
model = svm.SVC(
    kernel='rbf',
    C=1,
    gamma=1,
    degree=1,
    decision_function_shape='ovo',
)

In [11]:
# Fit the classifier on the training split; the cell displays the fitted estimator.
model.fit(X=X_train, y=y_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=1, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [12]:
# Score the fitted model on the same rows it was trained on.
train_score = model.score(X=X_train, y=y_train)
train_score

0.9397391144589381

In [13]:
# Score the fitted model on the held-out rows it has not seen during fitting.
test_score = model.score(X=X_test, y=y_test)
test_score

0.9399477806788512

In [14]:
# Persist the fitted classifier to disk; the cell displays the list of written file names.
dump(value=model, filename='model.joblib')

['model.joblib']