# Classifier les champignons avec PySpark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MushroomClassifier").getOrCreate()

## 2. Creating the spark session

In [3]:
# see what it in spark variable
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000151A638E6E0>


In [4]:
sc = spark.sparkContext
sc


## 3. Loading the data

In [38]:
mushroom_df = spark.read.csv("mushrooms.csv")

In [41]:
mushroom_df.dropna(-1)

TypeError: can only concatenate str (not "int") to str

In [39]:
# Inspecting the 5 first rows
mushroom_df.take(5)

[Row(_c0='class', _c1='cap-shape', _c2='cap-surface', _c3='cap-color', _c4='bruises', _c5='odor', _c6='gill-attachment', _c7='gill-spacing', _c8='gill-size', _c9='gill-color', _c10='stalk-shape', _c11='stalk-root', _c12='stalk-surface-above-ring', _c13='stalk-surface-below-ring', _c14='stalk-color-above-ring', _c15='stalk-color-below-ring', _c16='veil-type', _c17='veil-color', _c18='ring-number', _c19='ring-type', _c20='spore-print-color', _c21='population', _c22='habitat'),
 Row(_c0='p', _c1='x', _c2='s', _c3='n', _c4='t', _c5='p', _c6='f', _c7='c', _c8='n', _c9='k', _c10='e', _c11='e', _c12='s', _c13='s', _c14='w', _c15='w', _c16='p', _c17='w', _c18='o', _c19='p', _c20='k', _c21='s', _c22='u'),
 Row(_c0='e', _c1='x', _c2='s', _c3='y', _c4='t', _c5='a', _c6='f', _c7='c', _c8='b', _c9='k', _c10='e', _c11='c', _c12='s', _c13='s', _c14='w', _c15='w', _c16='p', _c17='w', _c18='o', _c19='p', _c20='n', _c21='n', _c22='g'),
 Row(_c0='e', _c1='b', _c2='s', _c3='w', _c4='t', _c5='l', _c6='f', 

In [40]:
# Showing the 5 rows
mushroom_df.show(5)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+--------------------+--------------------+--------------------+--------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|  _c0|      _c1|        _c2|      _c3|    _c4| _c5|            _c6|         _c7|      _c8|       _c9|       _c10|      _c11|                _c12|                _c13|                _c14|                _c15|     _c16|      _c17|       _c18|     _c19|             _c20|      _c21|   _c22|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+--------------------+--------------------+--------------------+--------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk

In [8]:
# showing the columns
mushroom_df.columns

['_c0',
 '_c1',
 '_c2',
 '_c3',
 '_c4',
 '_c5',
 '_c6',
 '_c7',
 '_c8',
 '_c9',
 '_c10',
 '_c11',
 '_c12',
 '_c13',
 '_c14',
 '_c15',
 '_c16',
 '_c17',
 '_c18',
 '_c19',
 '_c20',
 '_c21',
 '_c22']

In [11]:
# Schema of the dataframe
mushroom_df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: string (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: string (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)



## 4. Data Exploration

In [12]:
# run a sample selection
mushroom_df.select('_c10', '_c11', '_c12').show(10)

+-----------+----------+--------------------+
|       _c10|      _c11|                _c12|
+-----------+----------+--------------------+
|stalk-shape|stalk-root|stalk-surface-abo...|
|          e|         e|                   s|
|          e|         c|                   s|
|          e|         c|                   s|
|          e|         e|                   s|
|          t|         e|                   s|
|          e|         c|                   s|
|          e|         c|                   s|
|          e|         c|                   s|
|          e|         e|                   s|
+-----------+----------+--------------------+
only showing top 10 rows



In [16]:
# showing as a panda dataframe
mushroom_df.toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,...,_c13,_c14,_c15,_c16,_c17,_c18,_c19,_c20,_c21,_c22
0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
2,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
3,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
4,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8120,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8121,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8122,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8123,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [19]:
mushroom_df.describe()

DataFrame[summary: string, _c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string]

In [23]:
mushroom_df.dtypes

[('_c0', 'string'),
 ('_c1', 'string'),
 ('_c2', 'string'),
 ('_c3', 'string'),
 ('_c4', 'string'),
 ('_c5', 'string'),
 ('_c6', 'string'),
 ('_c7', 'string'),
 ('_c8', 'string'),
 ('_c9', 'string'),
 ('_c10', 'string'),
 ('_c11', 'string'),
 ('_c12', 'string'),
 ('_c13', 'string'),
 ('_c14', 'string'),
 ('_c15', 'string'),
 ('_c16', 'string'),
 ('_c17', 'string'),
 ('_c18', 'string'),
 ('_c19', 'string'),
 ('_c20', 'string'),
 ('_c21', 'string'),
 ('_c22', 'string')]

In [27]:
# looking for duplicating values or missing value
mushroom_df.drop_duplicates

<bound method dropDuplicates of DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string]>

In [28]:
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.metrics import accuracy_score

In [29]:
le = LabelEncoder()

for columns in mushroom_df.columns:
    mushroom_df[columns] = le.fit_transform(mushroom_df[columns])

RecursionError: maximum recursion depth exceeded in comparison

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(mushroom_df.drop('class',axis = 1),mushroom_df['class'],random_state = 42,test_size = 0.2)

In [None]:
lr = LogisticRegression()
neighbors_classifier = KNeighborsClassifier(n_neighbors = int(np.round(math.sqrt(len(X_train)))))
SVM = SVC(kernel = 'linear',C = 1)
dt = tree.DecisionTreeClassifier()

In [None]:
algos = [lr,neighbors_classifier,SVM,dt]
accu_score = {}

In [None]:
for algo in algos:
    algo.fit(X_train,Y_train)
    Y_pred = algo.predict(X_test)
    accu_score[algo] = accuracy_score(Y_pred,Y_test)

In [None]:
print(accu_score)