In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('diabetes').getOrCreate()
df = spark.read.csv('Datasets/04DataTransformation.csv', inferSchema=True, header=True)
df.show()

+------+---------------+------+--------+---+-------+------+------+--------------------+------------+------+-------+-----------------+---+---+---------+------+
|    ID|Diabetes_binary|HighBP|HighChol|BMI|BMIBand|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|Sex|Age|Education|Income|
+------+---------------+------+--------+---+-------+------+------+--------------------+------------+------+-------+-----------------+---+---+---------+------+
|131805|              0|     1|       0| 42|      6|     0|     1|                   0|           0|     1|      0|                0|  0| 12|        2|     2|
|229519|              0|     0|       0| 23|      2|     0|     0|                   0|           1|     1|      1|                0|  0|  9|        5|     7|
|125271|              0|     0|       1| 25|      3|     1|     0|                   0|           1|     0|      0|                0|  1| 12|        5|     7|
|183133|              0|     0|       0| 24|  

In [2]:
# Split the data into training and test sets (20% held out for testing).
(trainingData, testData) = df.randomSplit([0.8, 0.2])
print(f"Training Data count : {trainingData.count()}")
print(f"Testing Data count : {testData.count()}")

# Check that training set has around 80% of records
training_ratio = trainingData.count() / df.count()
print(f"Training Ratio : {training_ratio}")

Training Data count : 63964
Testing Data count : 15988
Training Ratio : 0.8000300180108065


In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer

assembler = VectorAssembler(
  inputCols=['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
          'PhysActivity', 'Fruits', 'HvyAlcoholConsump', 'Veggies', 'Sex', 'Age', 'Education', 'Income'],
              outputCol="features")
output = assembler.transform(df)
indexer = StringIndexer(inputCol="Diabetes_binary", outputCol="DiabetesIndex")
output_fixed = indexer.fit(output).transform(output)
final_data = output_fixed.select("features",'DiabetesIndex')
train_data,test_data = final_data.randomSplit([0.8,0.2])
print(f"Training Data count : {train_data.count()}")
print(f"Testing Data count : {test_data.count()}")

# Check that training set has around 80% of records
training_ratio = train_data.count() / df.count()
print(f"Training Ratio : {training_ratio}")

Training Data count : 64017
Testing Data count : 15935
Training Ratio : 0.8006929157494497


In [4]:
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(labelCol='DiabetesIndex',featuresCol='features')
dtc_model = dtc.fit(train_data)
fields=['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
          'PhysActivity', 'Fruits', 'HvyAlcoholConsump', 'Veggies', 'Sex', 'Age', 'Education', 'Income']
dtc_predictions = dtc_model.transform(test_data)
dtc_predictions.select('prediction', 'DiabetesIndex','features').show(5)

+----------+-------------+--------------------+
|prediction|DiabetesIndex|            features|
+----------+-------------+--------------------+
|       1.0|          1.0|(14,[0,1,2,3,4,11...|
|       1.0|          1.0|(14,[0,1,2,3,4,11...|
|       1.0|          1.0|(14,[0,1,2,3,4,11...|
|       1.0|          0.0|(14,[0,1,2,3,4,11...|
|       1.0|          0.0|(14,[0,1,2,3,4,11...|
+----------+-------------+--------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
diabetes_binary_eval = BinaryClassificationEvaluator(labelCol = 'DiabetesIndex')
print(f"Correctness of the Decision Tree Model is : \
      {diabetes_binary_eval.evaluate(dtc_predictions)}")

Correctness of the Decision Tree Model is :       0.7067573591104074


In [6]:
rfc = RandomForestClassifier(labelCol='DiabetesIndex',featuresCol='features')
rfc_model = rfc.fit(train_data)
rfc_predictions = rfc_model.transform(test_data)
rfc_predictions.select('prediction', 'DiabetesIndex','features').show(5)
print(f"Correctness of the Random Forest Model is : \
      {diabetes_binary_eval.evaluate(rfc_predictions)}")

+----------+-------------+--------------------+
|prediction|DiabetesIndex|            features|
+----------+-------------+--------------------+
|       1.0|          1.0|(14,[0,1,2,3,4,11...|
|       1.0|          1.0|(14,[0,1,2,3,4,11...|
|       1.0|          1.0|(14,[0,1,2,3,4,11...|
|       1.0|          0.0|(14,[0,1,2,3,4,11...|
|       1.0|          0.0|(14,[0,1,2,3,4,11...|
+----------+-------------+--------------------+
only showing top 5 rows

Correctness of the Random Forest Model is :       0.7891685318018347


In [7]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

log_reg = LogisticRegression(featuresCol='features',labelCol='Diabetes_binary')
train_data, test_data = df.randomSplit([0.8,0.2])

pipeline = Pipeline(stages=[assembler,log_reg])
fit_model = pipeline.fit(train_data)
lrc_predictions = fit_model.transform(test_data)

diabetes_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='Diabetes_binary')
lrc_predictions.select('Diabetes_binary','prediction', 'HighBP', 'HighChol','Stroke', 'BMI', 'Smoker', \
               'PhysActivity', 'Fruits', 'Veggies', 'Sex', 'Age', 'Education', 'Income').show()

+---------------+----------+------+--------+------+---+------+------------+------+-------+---+---+---------+------+
|Diabetes_binary|prediction|HighBP|HighChol|Stroke|BMI|Smoker|PhysActivity|Fruits|Veggies|Sex|Age|Education|Income|
+---------------+----------+------+--------+------+---+------+------------+------+-------+---+---+---------+------+
|              0|       1.0|     1|       1|     0| 25|     1|           1|     0|      1|  0| 11|        4|     4|
|              1|       1.0|     1|       0|     0| 31|     0|           0|     1|      0|  0| 13|        4|     4|
|              0|       0.0|     1|       0|     0| 28|     0|           1|     0|      1|  1|  8|        4|     8|
|              1|       1.0|     0|       1|     0| 32|     0|           1|     0|      0|  0| 13|        2|     2|
|              0|       1.0|     1|       0|     0| 39|     0|           0|     1|      0|  1|  8|        4|     8|
|              1|       1.0|     0|       1|     1| 29|     1|          

In [8]:
print(f"Correctness of the Logistical Regerssion Model is : \
      {diabetes_eval.evaluate(lrc_predictions)}")

Correctness of the Logistical Regerssion Model is :       0.7250848888585888


In [12]:
assembler = VectorAssembler(
  inputCols=['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
          'PhysActivity', 'Fruits', 'HvyAlcoholConsump', 'Veggies', 'Sex', 'Age', 'Education', 'Income'],
              outputCol="features")
output = assembler.transform(df)
indexer = StringIndexer(inputCol="Diabetes_binary", outputCol="DiabetesIndex")
output_fixed = indexer.fit(output).transform(output)
final_data = output_fixed.select("features",'DiabetesIndex')
train_data,test_data = final_data.randomSplit([0.9,0.1])

from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(labelCol='DiabetesIndex',featuresCol='features')
dtc_model = dtc.fit(train_data)
fields=['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
          'PhysActivity', 'Fruits', 'HvyAlcoholConsump', 'Veggies', 'Sex', 'Age', 'Education', 'Income']
dtc_predictions = dtc_model.transform(test_data)
diabetes_binary_eval = BinaryClassificationEvaluator(labelCol = 'DiabetesIndex')
print(f"Correctness of the Decision Tree Model is : \
      {diabetes_binary_eval.evaluate(dtc_predictions)}")

Correctness of the Decision Tree Model is :       0.6889840488607429


In [15]:
assembler = VectorAssembler(
  inputCols=[ 'HighChol', 'HvyAlcoholConsump', 'Sex', 'Age', 'Education', 'Income'],
              outputCol="features")
output = assembler.transform(df)
indexer = StringIndexer(inputCol="Diabetes_binary", outputCol="DiabetesIndex")
output_fixed = indexer.fit(output).transform(output)
final_data = output_fixed.select("features",'DiabetesIndex')
train_data,test_data = final_data.randomSplit([0.9,0.1])

from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml import Pipeline

dtc = DecisionTreeClassifier(labelCol='DiabetesIndex',featuresCol='features')
dtc_model = dtc.fit(train_data)
dtc_predictions = dtc_model.transform(test_data)
diabetes_binary_eval = BinaryClassificationEvaluator(labelCol = 'DiabetesIndex')
print(f"Correctness of the Decision Tree Model is : \
      {diabetes_binary_eval.evaluate(dtc_predictions)}")

Correctness of the Decision Tree Model is :       0.7030121084120815
