In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
import pandas as pd

In [None]:
spark=SparkSession.builder.appName('BinaryClass').getOrCreate()

In [None]:
df=spark.read.csv('diabetes.csv',header=True,inferSchema=True)
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [None]:
df.groupBy('Outcome').count().show()

+-------+-----+
|Outcome|count|
+-------+-----+
|      1|  268|
|      0|  500|
+-------+-----+



In [None]:
inp_features=[col for col in df.columns if col!='Outcome']

assembler=VectorAssembler(inputCols=inp_features,outputCol='Features')
assembled_df=assembler.transform(df)

final_df=assembled_df.select('Features','Outcome')
final_df.show()

+--------------------+-------+
|            Features|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|      1|
|[1.0,85.0,66.0,29...|      0|
|[8.0,183.0,64.0,0...|      1|
|[1.0,89.0,66.0,23...|      0|
|[0.0,137.0,40.0,3...|      1|
|[5.0,116.0,74.0,0...|      0|
|[3.0,78.0,50.0,32...|      1|
|[10.0,115.0,0.0,0...|      0|
|[2.0,197.0,70.0,4...|      1|
|[8.0,125.0,96.0,0...|      1|
|[4.0,110.0,92.0,0...|      0|
|[10.0,168.0,74.0,...|      1|
|[10.0,139.0,80.0,...|      0|
|[1.0,189.0,60.0,2...|      1|
|[5.0,166.0,72.0,1...|      1|
|[7.0,100.0,0.0,0....|      1|
|[0.0,118.0,84.0,4...|      1|
|[7.0,107.0,74.0,0...|      1|
|[1.0,103.0,30.0,3...|      0|
|[1.0,115.0,70.0,3...|      1|
+--------------------+-------+
only showing top 20 rows



In [None]:
(train_df,test_df)=final_df.randomSplit([0.8,0.2])
print(train_df.count(),test_df.count())

602 166


In [None]:
lr=LogisticRegression(featuresCol='Features',labelCol='Outcome')
lr_model=lr.fit(train_df)

In [None]:
lr_pred=lr_model.transform(test_df)

prec_eval=MulticlassClassificationEvaluator(labelCol='Outcome',metricName='weightedPrecision')
rec_eval=MulticlassClassificationEvaluator(labelCol='Outcome',metricName='weightedRecall')

print('Precision:',prec_eval.evaluate(lr_pred))
print('Recall:',rec_eval.evaluate(lr_pred))

Precision: 0.8084327441344324
Recall: 0.8072289156626506


In [10]:
!apt-get install openjdk-11-jdk -y
!pip install pyspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-11-openjdk-amd64/bin"


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  ca-certificates-java fonts-dejavu-core fonts-dejavu-extra java-common
  libatk-wrapper-java libatk-wrapper-java-jni libpcsclite1 libxt-dev libxtst6
  libxxf86dga1 openjdk-11-jdk-headless openjdk-11-jre openjdk-11-jre-headless
  x11-utils
Suggested packages:
  default-jre pcscd libxt-doc openjdk-11-demo openjdk-11-source visualvm
  libnss-mdns fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  | fonts-wqy-zenhei fonts-indic mesa-utils
The following NEW packages will be installed:
  ca-certificates-java fonts-dejavu-core fonts-dejavu-extra java-common
  libatk-wrapper-java libatk-wrapper-java-jni libpcsclite1 libxt-dev libxtst6
  libxxf86dga1 openjdk-11-jdk openjdk-11-jdk-headless openjdk-11-jre
  openjdk-11-jre-headless x11-utils
0 upgraded, 15 newly installed, 0 to remove and 41 not upgraded.
Need to get 122 MB of archives.


In [11]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,LogisticRegression

In [25]:
spark=SparkSession.builder.appName('demo').getOrCreate()

In [34]:
df=spark.read.csv('/content/diabetes.csv',inferSchema=True,header=True)
inp_cols=[col for col in df.columns if col!='Outcome']

assembler=VectorAssembler(inputCols=inp_cols,outputCol='Features')
assembled_df=assembler.transform(df)
assembled_df.show(5)

final_df=assembled_df.select('Features','Outcome')
final_df.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            Features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|[0.0,137.0,40.0,3...|
+-----------+-------+-----------

In [35]:
(train_data,test_data)=final_df.randomSplit([0.8,0.2])

In [41]:
lr=LogisticRegression(featuresCol='Features',labelCol='Outcome')
lr_model=lr.fit(train_data)
lr_pred=lr_model.transform(test_data)

prec_eval=MulticlassClassificationEvaluator(labelCol='Outcome',metricName='weightedPrecision')
rec_eval=MulticlassClassificationEvaluator(labelCol='Outcome',metricName='weightedRecall')


print(rec_eval.evaluate(lr_pred))
print(prec_eval.evaluate(lr_pred))

0.8059701492537313
0.8042545891233488


**EXP-12**

In [46]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
df=spark.read.csv('/content/Exp12.csv',inferSchema=True,header=True)

input_cols = ["Age", "Income", "SpendingScore"]
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
assembled_df = assembler.transform(df)
assembled_df.show(5)

+----------+---+------+-------------+-------------------+
|CustomerID|Age|Income|SpendingScore|           features|
+----------+---+------+-------------+-------------------+
|         1| 25| 35000|           39|[25.0,35000.0,39.0]|
|         2| 45| 60000|           81|[45.0,60000.0,81.0]|
|         3| 23| 25000|            6| [23.0,25000.0,6.0]|
|         4| 31| 40000|           77|[31.0,40000.0,77.0]|
|         5| 35| 80000|           40|[35.0,80000.0,40.0]|
+----------+---+------+-------------+-------------------+
only showing top 5 rows



In [48]:
kmeans=KMeans(k=3,featuresCol='features',predictionCol='cluster')
kmeans_model=kmeans.fit(assembled_df)

In [50]:
predictions = kmeans_model.transform(assembled_df)
predictions.show()

+----------+---+------+-------------+-------------------+-------+
|CustomerID|Age|Income|SpendingScore|           features|cluster|
+----------+---+------+-------------+-------------------+-------+
|         1| 25| 35000|           39|[25.0,35000.0,39.0]|      1|
|         2| 45| 60000|           81|[45.0,60000.0,81.0]|      0|
|         3| 23| 25000|            6| [23.0,25000.0,6.0]|      1|
|         4| 31| 40000|           77|[31.0,40000.0,77.0]|      1|
|         5| 35| 80000|           40|[35.0,80000.0,40.0]|      2|
|         6| 40| 65000|           70|[40.0,65000.0,70.0]|      0|
|         7| 29| 32000|           30|[29.0,32000.0,30.0]|      1|
|         8| 48| 90000|           85|[48.0,90000.0,85.0]|      2|
|         9| 52| 70000|           20|[52.0,70000.0,20.0]|      0|
|        10| 36| 50000|           60|[36.0,50000.0,60.0]|      0|
+----------+---+------+-------------+-------------------+-------+



In [51]:
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster", metricName="silhouette", distanceMeasure="squaredEuclidean")
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score = {silhouette:.3f}")

Silhouette score = 0.745


In [53]:
print("Cluster Centers:")
for center in kmeans_model.clusterCenters():
    print(center)


Cluster Centers:
[4.325e+01 6.125e+04 5.775e+01]
[2.7e+01 3.3e+04 3.8e+01]
[4.15e+01 8.50e+04 6.25e+01]
