In [25]:
!pip install findspark



In [26]:
# Initialise findspark first; pyspark itself is imported in the next cell.
import findspark

findspark.init()

In [27]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise create it against the
# "local[*]" master under the application name below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ML_classification_Example")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [43]:
import seaborn as sns
import pandas as pd
import numpy as np

In [29]:
# Load the 'iris' dataset through seaborn and preview its first rows.
df = sns.load_dataset("iris")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [30]:
# Write the pandas frame to CSV (index column omitted); the next cell reads
# this file back through Spark.
df.to_csv(path_or_buf="spark_iris_1.csv", index=False)

In [31]:
# Read the CSV into a Spark DataFrame: the first row is the header and the
# column types are inferred from the values.
data = spark.read.csv(
    "spark_iris_1.csv",
    header=True,
    inferSchema=True,
)

data.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [32]:
# counting number of nulls for each of the columns
from pyspark.sql.functions import col, count, when

# One aggregate per column: count() only tallies the rows for which when()
# yields a value, i.e. the rows where that column is null.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in data.columns
]
data.select(null_counts).show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|           0|          0|           0|          0|      0|
+------------+-----------+------------+-----------+-------+



In [33]:
# encoding categorical variables
from pyspark.ml.feature import StringIndexer

# Map the string column 'species' to the numeric column 'label_species'.
label_indexer = StringIndexer(
    inputCol="species",
    outputCol="label_species",
)

# Fit the indexer on the full frame, then append the indexed column to `data`.
label_indexer_model = label_indexer.fit(data)
data = label_indexer_model.transform(data)

data.show()

+------------+-----------+------------+-----------+-------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|species|label_species|
+------------+-----------+------------+-----------+-------+-------------+
|         5.1|        3.5|         1.4|        0.2| setosa|          0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|          0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|          0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|          0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|          0.0|
|         5.4|        3.9|         1.7|        0.4| setosa|          0.0|
|         4.6|        3.4|         1.4|        0.3| setosa|          0.0|
|         5.0|        3.4|         1.5|        0.2| setosa|          0.0|
|         4.4|        2.9|         1.4|        0.2| setosa|          0.0|
|         4.9|        3.1|         1.5|        0.1| setosa|          0.0|
|         5.4|        3.7|         1.5

In [34]:
from pyspark.ml.feature import VectorAssembler

# The four numeric measurement columns used as model inputs.
feature_cols = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
]

# Pack the measurement columns into a single vector column, 'features_1'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_1")
data = assembler.transform(data)

data.show()

+------------+-----------+------------+-----------+-------+-------------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|label_species|       features_1|
+------------+-----------+------------+-----------+-------+-------------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|          0.0|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|          0.0|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|          0.0|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|          0.0|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|          0.0|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| setosa|          0.0|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| setosa|          0.0|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| setosa|          0.0|[5.0,3.

In [36]:
from pyspark.ml.feature import StandardScaler

# Scale 'features_1' by standard deviation only; no mean-centering.
scaler = StandardScaler(
    inputCol="features_1",
    outputCol="scaled_features",
    withMean=False,
    withStd=True,
)

# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split performed in a later cell, so the test rows contribute
# to the scaling statistics.
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

# Preview the scaled vectors next to the indexed label.
data.select("scaled_features", "label_species").show(5)


+--------------------+-------------+
|     scaled_features|label_species|
+--------------------+-------------+
|[6.15892840883878...|          0.0|
|[5.9174018045706,...|          0.0|
|[5.67587520030241...|          0.0|
|[5.55511189816831...|          0.0|
|[6.03816510670469...|          0.0|
+--------------------+-------------+
only showing top 5 rows



**withStd=True**

This parameter specifies that each feature should be scaled to unit standard deviation: every value is divided by the standard deviation of its feature. With withStd=True, each feature in the output vector has a standard deviation of 1 (and therefore unit variance).

**withMean=False**

This parameter determines whether the data should be centered by subtracting the mean of each feature. withMean=False means the mean of each feature will not be subtracted. This is often kept as False when working with sparse data because centering would result in dense data, which consumes more memory.

In [37]:
# split into train & test
# 80/20 random split; the fixed seed keeps the partition repeatable.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=123)

In [38]:
from pyspark.ml.classification import LogisticRegression

# Classifier reading the scaled feature vector and the indexed species label.
lr = LogisticRegression(
    labelCol="label_species",
    featuresCol="scaled_features",
)

In [39]:
# train the data
# Fit the logistic-regression estimator on the training split.
model = lr.fit(train_data)


In [40]:
# Apply the fitted model to the held-out split.
predictions = model.transform(test_data)


In [41]:
# Show inputs, true label, predicted label and class probabilities without
# truncating the cell contents.
shown_cols = ["scaled_features", "label_species", "prediction", "probability"]
predictions.select(*shown_cols).show(truncate=False)

+-----------------------------------------------------------------------------+-------------+----------+-----------------------------------------------------+
|scaled_features                                                              |label_species|prediction|probability                                          |
+-----------------------------------------------------------------------------+-------------+----------+-----------------------------------------------------+
|[5.313585293900131,6.882844816583367,0.7364194760449431,0.26238535320983253] |0.0          |0.0       |[1.0,1.2457789207319175E-219,0.0]                    |
|[5.555111898168318,7.341701137688926,0.7930671280484002,0.26238535320983253] |0.0          |0.0       |[1.0,2.7694139977059496E-208,0.0]                    |
|[5.7966385024365055,6.882844816583367,0.7930671280484002,0.3935780298147488] |0.0          |0.0       |[1.0,3.043953542294356E-129,0.0]                     |
|[5.7966385024365055,7.112272977136146,0.90636

In [44]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing 'prediction' against 'label_species'.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label_species",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)

# Report the accuracy rounded to two decimals.
print(f"accuracy of the model for iris is {np.round(accuracy,2)}")

accuracy of the model for iris is 0.97


In [45]:
# Unfitted stages for the Pipeline built in the next cell. Each stage's input
# column must be produced by the raw data or by an earlier stage.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# species (string) -> label_species (numeric index)
label_indexer = StringIndexer(inputCol='species', outputCol='label_species')
# four measurement columns -> features_1 (vector)
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features_1')
# The scaler has to read the assembler's output column, 'features_1'.
# The previous inputCol='features' named a column that no stage produces,
# so fitting the pipeline would fail on the missing column.
scaler = StandardScaler(inputCol='features_1', outputCol='scaled_features', withStd=True, withMean=False)
# scaled_features + label_species -> prediction
lr = LogisticRegression(featuresCol='scaled_features', labelCol='label_species')


In [46]:
from pyspark.ml import Pipeline

# Chain the stages in execution order:
# index label -> assemble features -> scale -> classify.
pipeline_stages = Pipeline(
    stages=[
        label_indexer,
        assembler,
        scaler,
        lr,
    ]
)


In [47]:
# `data` already carries the columns added by the manual steps earlier in the
# notebook (label_species, features_1, scaled_features). The pipeline stages
# write to those same output columns, so fitting the pipeline on a frame that
# already contains them fails. Drop the derived columns so the pipeline
# receives only the raw measurements and the species string.
pipeline_input = data.drop('label_species', 'features_1', 'scaled_features')

# 80/20 random split with a fixed seed for repeatability.
iris_tr_da, iris_te_da = pipeline_input.randomSplit([0.8, 0.2], seed=124)

In [49]:
# model=pipeline_stages.fit(iris_tr_da)