In [179]:
# Kaggle Project: https://www.kaggle.com/uciml/mushroom-classification
# Spark + Python

# Name: Jianlei (John) Sun


# Data Exploration

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

In [136]:
# Reuse the active SparkSession if the kernel already provides one, otherwise
# start one, so this cell does not depend on a pre-injected `spark` variable.
spark = SparkSession.builder.getOrCreate()

# Read the CSV with pandas, then convert the pandas frame to a Spark DataFrame.
df_pd = pd.read_csv('./mushrooms.csv')
df_sp = spark.createDataFrame(df_pd)

In [137]:
# Look at the first row to check the column names and how values are coded.
df_sp.head(1)

[Row(class='p', cap-shape='x', cap-surface='s', cap-color='n', bruises='t', odor='p', gill-attachment='f', gill-spacing='c', gill-size='n', gill-color='k', stalk-shape='e', stalk-root='e', stalk-surface-above-ring='s', stalk-surface-below-ring='s', stalk-color-above-ring='w', stalk-color-below-ring='w', veil-type='p', veil-color='w', ring-number='o', ring-type='p', spore-print-color='k', population='s', habitat='u')]

# Feature Extraction

In [168]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [140]:
# Index the string target column "class" into a numeric "label" column.
stringIndexer = StringIndexer(outputCol="label", inputCol="class")
labelIndexerModel = stringIndexer.fit(df_sp)
df_sp = labelIndexerModel.transform(df_sp)

In [141]:
# Print the first three indexed labels as a sanity check.
df_sp.select('label').show(n=3)

+-----+
|label|
+-----+
|  1.0|
|  0.0|
|  0.0|
+-----+
only showing top 3 rows



In [142]:
# Drop "veil-type": it has only one unique value, so it carries no information.
df_sp = df_sp.drop('veil-type')

In [None]:
# one-hot-encode the feature variables

In [143]:
# Feature columns: every column except the raw target and its indexed label.
cols = [name for name in df_sp.columns if name not in ('label', 'class')]

In [144]:
# For every feature column: index the strings into "<col>_SI", then one-hot
# encode that index into "<col>_SI_OHE".
for i in cols:
    indexed_col = str(i) + '_SI'
    encoded_col = indexed_col + '_OHE'

    stringIndexer = StringIndexer(inputCol=i, outputCol=indexed_col)
    df_sp = stringIndexer.fit(df_sp).transform(df_sp)

    encoder = OneHotEncoder(inputCol=indexed_col, outputCol=encoded_col)
    # In Spark >= 3.0 OneHotEncoder is an Estimator and must be fitted before it
    # can transform; in Spark 2.x it is a plain Transformer with no `fit`.
    if hasattr(encoder, 'fit'):
        encoder = encoder.fit(df_sp)
    df_sp = encoder.transform(df_sp)

In [156]:
# Collect the one-hot encoded columns by their 'OHE' suffix
# ('label' does not carry the suffix, so it is never selected).
cols_new = [name for name in df_sp.columns if name.endswith('OHE')]

In [166]:
# Display the encoded column names that will be assembled into the feature vector.
cols_new

['cap-shape_SI_OHE',
 'cap-surface_SI_OHE',
 'cap-color_SI_OHE',
 'bruises_SI_OHE',
 'odor_SI_OHE',
 'gill-attachment_SI_OHE',
 'gill-spacing_SI_OHE',
 'gill-size_SI_OHE',
 'gill-color_SI_OHE',
 'stalk-shape_SI_OHE',
 'stalk-root_SI_OHE',
 'stalk-surface-above-ring_SI_OHE',
 'stalk-surface-below-ring_SI_OHE',
 'stalk-color-above-ring_SI_OHE',
 'stalk-color-below-ring_SI_OHE',
 'veil-color_SI_OHE',
 'ring-number_SI_OHE',
 'ring-type_SI_OHE',
 'spore-print-color_SI_OHE',
 'population_SI_OHE',
 'habitat_SI_OHE']

In [169]:
# Concatenate all one-hot encoded columns into a single "features" vector column.
vecAssembler = VectorAssembler(outputCol="features", inputCols=cols_new)
df_sp = vecAssembler.transform(df_sp)

# Preview the assembled features next to the label.
df_sp.select(['features', 'label']).show(n=5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(95,[0,6,8,24,26,...|  1.0|
|(95,[0,6,11,22,26...|  0.0|
|(95,[3,6,12,23,26...|  0.0|
|(95,[0,5,12,24,26...|  1.0|
|(95,[0,6,9,17,18,...|  0.0|
+--------------------+-----+
only showing top 5 rows



# Model

In [180]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [181]:
# Split into a training set (80%) and a testing set (20%).
# A fixed seed makes the split repeatable across re-runs on the same input
# DataFrame, so the reported metric can be reproduced.
df_train, df_test = df_sp.randomSplit([.8, .2], seed=42)

In [182]:
# Fit a decision tree classifier on the training split.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model = dt.fit(df_train)

In [183]:
# Score the held-out split; only the feature vector and the label are passed on.
df_predicted = model.transform(df_test.select(['features', 'label']))
df_predicted.show(n=3)

+--------------------+-----+-------------+-----------+----------+
|            features|label|rawPrediction|probability|prediction|
+--------------------+-----+-------------+-----------+----------+
|(95,[3,6,12,22,26...|  0.0|  [413.0,0.0]|  [1.0,0.0]|       0.0|
|(95,[3,6,12,22,26...|  0.0|  [413.0,0.0]|  [1.0,0.0]|       0.0|
|(95,[3,6,12,22,26...|  0.0|  [413.0,0.0]|  [1.0,0.0]|       0.0|
+--------------------+-----+-------------+-----------+----------+
only showing top 3 rows



In [184]:
# Measure performance on the test predictions.
# The evaluator's defaults are spelled out: it compares "label" with
# "prediction" and reports the F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(df_predicted)

0.9993788683121182