In [None]:
# Report which PySpark release this notebook is running against.
import pyspark

spark_version = pyspark.__version__
print(spark_version)


4.0.1


In [None]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook (an existing one is reused, otherwise a new
# one is created) under a descriptive application name.
app_name = "Big Data Analysis & Machine Learning with Apache Spark (PySpark)"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset is read from under
# this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Load the iris CSV from the mounted Drive. The first row supplies the column
# names and the column types are inferred from the data.
iris_csv_path = "/content/drive/MyDrive/Project/iris.csv"
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv(iris_csv_path, **csv_options)

# Preview the first five rows.
df.show(5)


+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows


In [None]:
# Read the same file again, this time spelling out the comma delimiter, and
# rebind `df` to the result.
csv_reader = spark.read
df = csv_reader.csv(
    path="/content/drive/MyDrive/Project/iris.csv",
    sep=",",
    inferSchema=True,
    header=True,
)

# Preview the first ten rows.
df.show(n=10)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 10 rows


In [None]:
import os

# Check what is stored in the project folder on the mounted Drive.
project_entries = os.listdir("/content/drive/MyDrive/Project/")
print(project_entries)

['Big data.ipynb', 'iris.csv']


In [None]:
# (column name, Spark type name) pairs for the loaded frame.
column_types = df.dtypes
print(column_types)

[('sepal.length', 'double'), ('sepal.width', 'double'), ('petal.length', 'double'), ('petal.width', 'double'), ('variety', 'string')]


In [None]:
# Print the inferred schema as a tree: column name, type, and nullability.
df.printSchema()

root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)



In [None]:
from pyspark.sql.functions import col

# Spark reads a dot inside a column reference as nested-field access, so dotted
# names need backtick quoting. Swap the dots for underscores to avoid that.
new_column_names = []
for original_name in df.columns:
    new_column_names.append(original_name.replace('.', '_'))

# toDF keeps the data and column order; only the names change.
df_cleaned = df.toDF(*new_column_names)

# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df_cleaned.describe()
summary_stats.show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|     NULL|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|     NULL|
|    min|               4.3|                2.0|               1.0|               0.1|   Setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [None]:
# Summary statistics (count, mean, stddev, min, max) for each column of the
# renamed frame.
cleaned_summary = df_cleaned.describe()
cleaned_summary.show()

+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  variety|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|     NULL|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|     NULL|
|    min|               4.3|                2.0|               1.0|               0.1|   Setosa|
|    max|               7.9|                4.4|               6.9|               2.5|Virginica|
+-------+------------------+-------------------+------------------+------------------+---------+



In [None]:
# Column names of the raw frame; these still contain dots, unlike `df_cleaned`.
raw_column_names = df.columns
print(raw_column_names)

['sepal.length', 'sepal.width', 'petal.length', 'petal.width', 'variety']


In [None]:
# Register the cleaned frame as a global temporary view so it can be queried
# with SQL as `global_temp.iris_table`.
# The cleaned frame is used rather than the raw `df` because the raw column
# names contain dots (e.g. `sepal.length`) and would need backtick quoting in
# every SQL statement that touches a measurement column. The `variety` column
# is identical in both frames.
df_cleaned.createOrReplaceGlobalTempView("iris_table")

In [None]:
# Unique class labels present in the registered view.
distinct_species_query = "SELECT DISTINCT variety FROM global_temp.iris_table"
distinct_species = spark.sql(distinct_species_query)
distinct_species.show()

+----------+
|   variety|
+----------+
| Virginica|
|    Setosa|
|Versicolor|
+----------+



In [None]:
# Number of rows per class label, computed through the SQL view.
species_counts_query = "SELECT variety, COUNT(*) AS count FROM global_temp.iris_table GROUP BY variety"
species_counts = spark.sql(species_counts_query)
species_counts.show()

+----------+-----+
|   variety|count|
+----------+-----+
| Virginica|   50|
|    Setosa|   50|
|Versicolor|   50|
+----------+-----+



In [None]:
# Keep only the rows whose sepal length is strictly greater than 5 and print
# them (`show()` with no argument prints at most 20 rows).
# `col` is imported in the column-renaming cell above.
long_sepal_rows = df_cleaned.where(col("sepal_length") > 5)
long_sepal_rows.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         5.4|        3.7|         1.5|        0.2| Setosa|
|         5.8|        4.0|         1.2|        0.2| Setosa|
|         5.7|        4.4|         1.5|        0.4| Setosa|
|         5.4|        3.9|         1.3|        0.4| Setosa|
|         5.1|        3.5|         1.4|        0.3| Setosa|
|         5.7|        3.8|         1.7|        0.3| Setosa|
|         5.1|        3.8|         1.5|        0.3| Setosa|
|         5.4|        3.4|         1.7|        0.2| Setosa|
|         5.1|        3.7|         1.5|        0.4| Setosa|
|         5.1|        3.3|         1.7|        0.5| Setosa|
|         5.2|        3.5|         1.5|        0.2| Setosa|
|         5.2|        3.4|         1.4| 

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string label `variety` as a numeric `variety_index` column, which
# the classifiers below use as their label.
indexer = StringIndexer(
    inputCol="variety",
    outputCol="variety_index",
)
indexer_model = indexer.fit(df_cleaned)
df_indexed = indexer_model.transform(df_cleaned)
df_indexed.show()

+------------+-----------+------------+-----------+-------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_index|
+------------+-----------+------------+-----------+-------+-------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0.0|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0.0|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0.0|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0.0|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0.0|
|         5.4|        3.9|         1.7|        0.4| Setosa|          0.0|
|         4.6|        3.4|         1.4|        0.3| Setosa|          0.0|
|         5.0|        3.4|         1.5|        0.2| Setosa|          0.0|
|         4.4|        2.9|         1.4|        0.2| Setosa|          0.0|
|         4.9|        3.1|         1.5|        0.1| Setosa|          0.0|
|         5.4|        3.7|         1.5

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the four measurement columns into one vector column named `features`,
# the input format the ML estimators below are configured to read.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
df_assembled = assembler.transform(df_indexed)

# Preview the first five assembled rows.
df_assembled.show(5)


+------------+-----------+------------+-----------+-------+-------------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|variety|variety_index|         features|
+------------+-----------+------------+-----------+-------+-------------+-----------------+
|         5.1|        3.5|         1.4|        0.2| Setosa|          0.0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| Setosa|          0.0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| Setosa|          0.0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| Setosa|          0.0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| Setosa|          0.0|[5.0,3.6,1.4,0.2]|
+------------+-----------+------------+-----------+-------+-------------+-----------------+
only showing top 5 rows


In [None]:
# Random train/test partition with weights 0.8 and 0.2; the seed is fixed so the
# split can be repeated.
split_weights = [0.8, 0.2]
train_data, test_data = df_assembled.randomSplit(split_weights, seed=42)

In [None]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes classifier with library defaults, trained on the training split.
nb = NaiveBayes(
    labelCol="variety_index",
    featuresCol="features",
)
model = nb.fit(train_data)

In [None]:
# Score the held-out rows with the most recently fitted model.
predictions = model.transform(test_data)

# Show the true label, its numeric index, and the predicted index side by side.
comparison_columns = ["variety", "variety_index", "prediction"]
predictions.select(*comparison_columns).show(5)

+-------+-------------+----------+
|variety|variety_index|prediction|
+-------+-------------+----------+
| Setosa|          0.0|       0.0|
| Setosa|          0.0|       0.0|
| Setosa|          0.0|       0.0|
| Setosa|          0.0|       0.0|
| Setosa|          0.0|       0.0|
+-------+-------------+----------+
only showing top 5 rows


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Shared evaluator, reused by the later model cells. The metric computed is
# accuracy, although the printed label reads "Precision".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="variety_index",
    predictionCol="prediction",
)

accuracy = evaluator.evaluate(predictions)
print(f"Precision du modele Naive Bayes :{accuracy:.2f}")

Precision du modele Naive Bayes :1.00


In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network shape: 4 inputs (one per assembled feature), hidden layers of 5 and 4
# units, and 3 outputs (one per variety).
layers = [4, 5, 4, 3]
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="variety_index",
    layers=layers,
    seed=1234,
)

# Rebinds `model` to the fitted perceptron.
model = mlp.fit(train_data)

In [None]:
# Score the test split with the current `model` and report accuracy using the
# shared evaluator.
predictions = model.transform(test_data)
accuracy = evaluator.evaluate(predictions)

print(f"Precision du modele MLP :{accuracy:.2f}")

Precision du modele MLP :0.96


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
dt=DecisionTreeClassifier(featuresCol="features",labelCol="variety_index")
model=dt.fit(train_data)
predictions=model.transform(test_data)

In [None]:
# Accuracy of the current `predictions` (produced by the decision tree cell).
accuracy = evaluator.evaluate(predictions)

print(f"Precision du modele Decision Tree :{accuracy:.2f}")

Precision du modele Decision Tree :1.00


In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest with library defaults; `model` and `predictions` are rebound to
# this classifier's results.
rf = RandomForestClassifier(
    labelCol="variety_index",
    featuresCol="features",
)
model = rf.fit(train_data)
predictions = model.transform(test_data)

In [None]:
# Accuracy of the current `predictions` (produced by the random forest cell).
accuracy = evaluator.evaluate(predictions)

print(f"Precision du modele Random Forest :{accuracy:.2f}")

Precision du modele Random Forest :0.96
