In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print(pyspark.__version__)

3.5.0


In [2]:
# Initialiser la session Spark
spark = SparkSession.builder.appName("Practice").getOrCreate()

# Lire le fichier CSV
df_pyspark = spark.read.csv("mushroom.csv", inferSchema=True, header=True)

# Afficher les premières lignes et le schéma
df_pyspark.show(5)
df_pyspark.printSchema()

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [3]:
# Nettoyage des données
categoricalColumns = df_pyspark.columns
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "_encoded").fit(df_pyspark)
    df_pyspark = stringIndexer.transform(df_pyspark)
    df_pyspark = df_pyspark.withColumn(categoricalCol + "_encoded", df_pyspark[categoricalCol + "_encoded"].cast('int'))

encoded_df = df_pyspark.select([x + "_encoded" for x in categoricalColumns])

# Assemblage des features
featureAssembler = VectorAssembler(inputCols=[x + "_encoded" for x in categoricalColumns[1:]], outputCol="features")
output = featureAssembler.transform(encoded_df)


# Afficher les premières lignes et le schéma
output.show(5)
output.printSchema()

+-------------+-----------------+-------------------+-----------------+---------------+------------+-----------------------+--------------------+-----------------+------------------+-------------------+------------------+--------------------------------+--------------------------------+------------------------------+------------------------------+-----------------+------------------+-------------------+-----------------+-------------------------+------------------+---------------+--------------------+
|class_encoded|cap-shape_encoded|cap-surface_encoded|cap-color_encoded|bruises_encoded|odor_encoded|gill-attachment_encoded|gill-spacing_encoded|gill-size_encoded|gill-color_encoded|stalk-shape_encoded|stalk-root_encoded|stalk-surface-above-ring_encoded|stalk-surface-below-ring_encoded|stalk-color-above-ring_encoded|stalk-color-below-ring_encoded|veil-type_encoded|veil-color_encoded|ring-number_encoded|ring-type_encoded|spore-print-color_encoded|population_encoded|habitat_encoded|        

In [4]:
# Séparer les données en train et test
train, test = output.randomSplit([0.8, 0.2], seed=17)

print("Données Train : ", train.count())
print("Données Test : ", test.count())

Données Train :  6471
Données Test :  1653


In [5]:
# Entraîner le modèle avec un arbre de décision
dt = DecisionTreeClassifier(featuresCol='features', labelCol='class_encoded', maxDepth=3)
dtModel = dt.fit(train)

# Prédictions
predictions = dtModel.transform(test)

# Évaluation des performances
evaluator = MulticlassClassificationEvaluator(labelCol="class_encoded", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("TEST AREA UNDER ROC : ", accuracy)

TEST AREA UNDER ROC :  0.9872947089637973


In [6]:
# Enregistrer le modèle d'arbre de décision
model_path = "decision_tree_model"
dtModel.write().overwrite().save(model_path)

In [7]:
# Arrêter la session Spark
spark.stop()