In [1]:
#importing pyspark
import findspark
findspark.init()
import pyspark

In [2]:
#creating sparksession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Classifier').getOrCreate()

In [3]:
#importing required libraries
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import NaiveBayes, LogisticRegression, GBTClassifier, LinearSVC 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
#loading the dataset
diabetes_df=spark.read.csv('Diabetes.csv',inferSchema=True,header=True)

In [6]:
diabetes_df.printSchema()

root
 |-- preg: integer (nullable = true)
 |-- plas: integer (nullable = true)
 |-- pres: integer (nullable = true)
 |-- skin: integer (nullable = true)
 |-- insu: integer (nullable = true)
 |-- mass: double (nullable = true)
 |-- pedi: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- class: string (nullable = true)



In [7]:
#total data instances in the dataset
diabetes_df.count()

768

In [8]:
#statistical summary
diabetes_df.describe().show(5)

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------+
|summary|              preg|             plas|              pres|              skin|              insu|              mass|              pedi|               age|          class|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+---------------+
|  count|               768|              768|               768|               768|               768|               768|               768|               768|            768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|0.4718763020833327|33.240885416666664|           null|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.952217567727642|115.24400235133803| 7.884160320

In [9]:
#fitting string indexer for target label
labelIndexer = StringIndexer(inputCol="class", outputCol="indexedLabel").fit(diabetes_df)
df=labelIndexer.transform(diabetes_df)
df.show(5)

+----+----+----+----+----+----+-----+---+---------------+------------+
|preg|plas|pres|skin|insu|mass| pedi|age|          class|indexedLabel|
+----+----+----+----+----+----+-----+---+---------------+------------+
|   6| 148|  72|  35|   0|33.6|0.627| 50|tested_positive|         1.0|
|   1|  85|  66|  29|   0|26.6|0.351| 31|tested_negative|         0.0|
|   8| 183|  64|   0|   0|23.3|0.672| 32|tested_positive|         1.0|
|   1|  89|  66|  23|  94|28.1|0.167| 21|tested_negative|         0.0|
|   0| 137|  40|  35| 168|43.1|2.288| 33|tested_positive|         1.0|
+----+----+----+----+----+----+-----+---+---------------+------------+
only showing top 5 rows



In [10]:
feature_list=['preg','plas','pres','skin','insu','mass','pedi','age']
#fitting vector assembler for features
featureAssembler=VectorAssembler(inputCols=feature_list,outputCol='indexedFeatures')
output_df=featureAssembler.transform(df)
output_df.show(5)

+----+----+----+----+----+----+-----+---+---------------+------------+--------------------+
|preg|plas|pres|skin|insu|mass| pedi|age|          class|indexedLabel|     indexedFeatures|
+----+----+----+----+----+----+-----+---+---------------+------------+--------------------+
|   6| 148|  72|  35|   0|33.6|0.627| 50|tested_positive|         1.0|[6.0,148.0,72.0,3...|
|   1|  85|  66|  29|   0|26.6|0.351| 31|tested_negative|         0.0|[1.0,85.0,66.0,29...|
|   8| 183|  64|   0|   0|23.3|0.672| 32|tested_positive|         1.0|[8.0,183.0,64.0,0...|
|   1|  89|  66|  23|  94|28.1|0.167| 21|tested_negative|         0.0|[1.0,89.0,66.0,23...|
|   0| 137|  40|  35| 168|43.1|2.288| 33|tested_positive|         1.0|[0.0,137.0,40.0,3...|
+----+----+----+----+----+----+-----+---+---------------+------------+--------------------+
only showing top 5 rows



In [11]:
#diabetes dataset with feature and output vector
diabetes_data=output_df.select('indexedFeatures','indexedLabel')

In [12]:
#splitting the data into 75/25 ratio for training and testing set
train_df,test_df=diabetes_data.randomSplit([0.7,0.3])

## Naive Bayes Classifier

In [15]:
#creating NaiveBayesClassifier instance
nb=NaiveBayes(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [16]:
#fitting the model
nb_model=nb.fit(train_df)

In [17]:
#predicting for test dataset
nb_pred=nb_model.transform(test_df)

In [18]:
#viewing predictions
nb_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|     indexedFeatures|
+----------+------------+--------------------+
|       1.0|         0.0|(8,[0,1,6,7],[2.0...|
|       1.0|         0.0|(8,[1,5,6,7],[99....|
|       1.0|         1.0|(8,[1,5,6,7],[119...|
|       1.0|         1.0|(8,[1,5,6,7],[131...|
|       1.0|         1.0|(8,[1,5,6,7],[167...|
+----------+------------+--------------------+
only showing top 5 rows



In [19]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(nb_pred, {evaluator.metricName: "accuracy"})

In [20]:
print('Accuracy of Naive Bayes classifier model:',round(acc*100,3))

Accuracy of Naive Bayes classifier model: 59.211


## Logistic Regression Classifier

In [21]:
#creating LogisticRegression instance
lr=LogisticRegression(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [22]:
#fitting the model
lr_model=lr.fit(train_df)

In [23]:
#predicting for test dataset
lr_pred=lr_model.transform(test_df)

In [24]:
#viewing predictions
lr_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|     indexedFeatures|
+----------+------------+--------------------+
|       0.0|         0.0|(8,[0,1,6,7],[2.0...|
|       0.0|         0.0|(8,[1,5,6,7],[99....|
|       0.0|         1.0|(8,[1,5,6,7],[119...|
|       1.0|         1.0|(8,[1,5,6,7],[131...|
|       1.0|         1.0|(8,[1,5,6,7],[167...|
+----------+------------+--------------------+
only showing top 5 rows



In [25]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(lr_pred, {evaluator.metricName: "accuracy"})

In [26]:
print('Accuracy of Logistic Regression model:',round(acc*100,3))

Accuracy of Logistic Regression model: 74.123


## Support Vector Machine Classifier

In [27]:
#creating SVMClassifier instance
svm=LinearSVC(labelCol="indexedLabel", featuresCol="indexedFeatures")

In [28]:
#fitting the model
svm_model=svm.fit(train_df)

In [29]:
#predicting for test dataset
svm_pred=svm_model.transform(test_df)

In [30]:
#viewing predictions
svm_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|     indexedFeatures|
+----------+------------+--------------------+
|       0.0|         0.0|(8,[0,1,6,7],[2.0...|
|       0.0|         0.0|(8,[1,5,6,7],[99....|
|       0.0|         1.0|(8,[1,5,6,7],[119...|
|       1.0|         1.0|(8,[1,5,6,7],[131...|
|       1.0|         1.0|(8,[1,5,6,7],[167...|
+----------+------------+--------------------+
only showing top 5 rows



In [31]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(svm_pred, {evaluator.metricName: "accuracy"})

In [32]:
print('Accuracy of SVM Classifier model:',round(acc*100,3))

Accuracy of SVM Classifier model: 75.439


# Ensemble Model

## Gradient Boosting Classifier

In [33]:
#creating GradientBoostClassifier instance
gbt=GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=200)

In [34]:
#fitting the model
gbt_model=gbt.fit(train_df)

In [35]:
#predicting for test dataset
gbt_pred=gbt_model.transform(test_df)

In [36]:
#viewing predictions
gbt_pred.select("prediction", "indexedLabel", "indexedFeatures").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|     indexedFeatures|
+----------+------------+--------------------+
|       0.0|         0.0|(8,[0,1,6,7],[2.0...|
|       0.0|         0.0|(8,[1,5,6,7],[99....|
|       0.0|         1.0|(8,[1,5,6,7],[119...|
|       1.0|         1.0|(8,[1,5,6,7],[131...|
|       1.0|         1.0|(8,[1,5,6,7],[167...|
+----------+------------+--------------------+
only showing top 5 rows



In [37]:
#evaluating the model
evaluator=MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction")
acc=evaluator.evaluate(gbt_pred, {evaluator.metricName: "accuracy"})

In [38]:
print('Accuracy of Gradient Boost Classifier model:',round(acc*100,3))

Accuracy of Gradient Boost Classifier model: 69.737
