# Classifier les champignons avec PySpark

In [27]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MushroomClassifier").getOrCreate()

## 2. Creating the spark session

In [28]:
# see what it in spark variable
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000017FAC533670>


In [29]:
sc = spark.sparkContext
sc


## 3. Loading the data

In [30]:
mushroom_df = spark.read.csv("mushrooms.csv", header=True)

In [31]:
# Showing the 5 rows
mushroom_df.show(5)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [32]:
# Inspecting the 5 first rows
mushroom_df.take(5)

[Row(class='p', cap-shape='x', cap-surface='s', cap-color='n', bruises='t', odor='p', gill-attachment='f', gill-spacing='c', gill-size='n', gill-color='k', stalk-shape='e', stalk-root='e', stalk-surface-above-ring='s', stalk-surface-below-ring='s', stalk-color-above-ring='w', stalk-color-below-ring='w', veil-type='p', veil-color='w', ring-number='o', ring-type='p', spore-print-color='k', population='s', habitat='u'),
 Row(class='e', cap-shape='x', cap-surface='s', cap-color='y', bruises='t', odor='a', gill-attachment='f', gill-spacing='c', gill-size='b', gill-color='k', stalk-shape='e', stalk-root='c', stalk-surface-above-ring='s', stalk-surface-below-ring='s', stalk-color-above-ring='w', stalk-color-below-ring='w', veil-type='p', veil-color='w', ring-number='o', ring-type='p', spore-print-color='n', population='n', habitat='g'),
 Row(class='e', cap-shape='b', cap-surface='s', cap-color='w', bruises='t', odor='l', gill-attachment='f', gill-spacing='c', gill-size='b', gill-color='n', st

In [33]:
# showing the columns
mushroom_df.columns

['class',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat']

In [34]:
# Schema of the dataframe
mushroom_df.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-root: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-type: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string 

## 4. Data Exploration

In [35]:
# run a sample selection
mushroom_df.select('class', 'cap-surface', 'cap-color').show(10)

+-----+-----------+---------+
|class|cap-surface|cap-color|
+-----+-----------+---------+
|    p|          s|        n|
|    e|          s|        y|
|    e|          s|        w|
|    p|          y|        w|
|    e|          s|        g|
|    e|          y|        y|
|    e|          s|        w|
|    e|          y|        w|
|    p|          y|        w|
|    e|          s|        y|
+-----+-----------+---------+
only showing top 10 rows



In [36]:
mushroom_df.describe()

DataFrame[summary: string, class: string, cap-shape: string, cap-surface: string, cap-color: string, bruises: string, odor: string, gill-attachment: string, gill-spacing: string, gill-size: string, gill-color: string, stalk-shape: string, stalk-root: string, stalk-surface-above-ring: string, stalk-surface-below-ring: string, stalk-color-above-ring: string, stalk-color-below-ring: string, veil-type: string, veil-color: string, ring-number: string, ring-type: string, spore-print-color: string, population: string, habitat: string]

In [37]:
mushroom_df.dtypes

[('class', 'string'),
 ('cap-shape', 'string'),
 ('cap-surface', 'string'),
 ('cap-color', 'string'),
 ('bruises', 'string'),
 ('odor', 'string'),
 ('gill-attachment', 'string'),
 ('gill-spacing', 'string'),
 ('gill-size', 'string'),
 ('gill-color', 'string'),
 ('stalk-shape', 'string'),
 ('stalk-root', 'string'),
 ('stalk-surface-above-ring', 'string'),
 ('stalk-surface-below-ring', 'string'),
 ('stalk-color-above-ring', 'string'),
 ('stalk-color-below-ring', 'string'),
 ('veil-type', 'string'),
 ('veil-color', 'string'),
 ('ring-number', 'string'),
 ('ring-type', 'string'),
 ('spore-print-color', 'string'),
 ('population', 'string'),
 ('habitat', 'string')]

In [38]:
# looking for duplicating values or missing value
mushroom_df.drop_duplicates

<bound method dropDuplicates of DataFrame[class: string, cap-shape: string, cap-surface: string, cap-color: string, bruises: string, odor: string, gill-attachment: string, gill-spacing: string, gill-size: string, gill-color: string, stalk-shape: string, stalk-root: string, stalk-surface-above-ring: string, stalk-surface-below-ring: string, stalk-color-above-ring: string, stalk-color-below-ring: string, veil-type: string, veil-color: string, ring-number: string, ring-type: string, spore-print-color: string, population: string, habitat: string]>

In [50]:
# importing librairies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score

In [40]:
# loading the labelencoder
le = LabelEncoder()

In [41]:
# setting the dataframe as panda dataframe in order to train a model
mushroom_df = mushroom_df.toPandas()

In [42]:
mushroom_df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [43]:

for columns in mushroom_df.columns:
    mushroom_df[columns] = le.fit_transform(mushroom_df[columns])

In [46]:
mushroom_df.shape

(8124, 23)

In [47]:
X_train,X_test,Y_train,Y_test = train_test_split(mushroom_df.drop('class',axis = 1),mushroom_df['class'],random_state = 42,test_size = 0.2)

In [51]:
lr = LogisticRegression()
neighbors_classifier = KNeighborsClassifier(n_neighbors = int(np.round(math.sqrt(len(X_train)))))
SVM = SVC(kernel = 'linear',C = 1)
dt = tree.DecisionTreeClassifier()

In [52]:
algos = [lr,neighbors_classifier,SVM,dt]
accu_score = {}

In [53]:
for algo in algos:
    algo.fit(X_train,Y_train)
    Y_pred = algo.predict(X_test)
    accu_score[algo] = accuracy_score(Y_pred,Y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
print(accu_score)

{LogisticRegression(): 0.947076923076923, KNeighborsClassifier(n_neighbors=81): 0.9489230769230769, SVC(C=1, kernel='linear'): 0.9729230769230769, DecisionTreeClassifier(): 1.0}
