In [3]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, Normalizer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
sc = SparkContext()
spark = SparkSession(sc)

In [6]:
file_location ="gs://dataproc-staging-us-central1-820790181286-dl4jzwmr/diabetes.csv"
df = spark.read.format("csv").option("header",
True).load(file_location).toDF("Pregnancies", "Glucose", "BloodPressure",
"SkinThickness", "Insulin", "BMI","DiabetesPedigreeFunction","Age","Outcome")

In [7]:
#Convert the columns from string datatypes to their respective datatypes
df = df.withColumn("Pregnancies",
df["Pregnancies"].cast("integer")).withColumn("Glucose",
df["Glucose"].cast("float")).withColumn("BloodPressure",
df["BloodPressure"].cast("integer")).withColumn("SkinThickness",
df["SkinThickness"].cast("float")).withColumn('Insulin',df['Insulin'].cast('integer')).withColumn('BMI',df['BMI'].cast('float')).withColumn('DiabetesPedigreeFunction',
df['DiabetesPedigreeFunction'].cast('integer')).withColumn('Age',
df['Age'].cast('integer'))

In [8]:
df.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|  148.0|           72|         35.0|      0|33.6|                       0| 50|      1|
|          1|   85.0|           66|         29.0|      0|26.6|                       0| 31|      0|
|          8|  183.0|           64|          0.0|      0|23.3|                       0| 32|      1|
|          1|   89.0|           66|         23.0|     94|28.1|                       0| 21|      0|
|          0|  137.0|           40|         35.0|    168|43.1|                       2| 33|      1|
|          5|  116.0|           74|          0.0|      0|25.6|                       0| 30|      0|
|          3|   78.0|           50|         32.0|     88|31.0|                       0| 26|      1|


In [9]:
input_df = df.drop('Outcome').columns

In [10]:
assembler = VectorAssembler(inputCols=input_df, outputCol = 'features')
labelIndexer = StringIndexer(inputCol="Outcome", outputCol="indexedLabel").fit(df)

In [11]:
scaler_ss = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=False, withMean=True)
scaler = Normalizer(inputCol="features", outputCol="scaledFeatures")

In [12]:
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="scaledFeatures", numTrees=10)
lr = LogisticRegression(labelCol="indexedLabel", featuresCol="scaledFeatures")
dtc = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="scaledFeatures")

## Using Normalizer

In [None]:
pipeline_lr = Pipeline(stages=[labelIndexer, assembler,scaler,lr])

In [None]:
(trainingData, testData) = df.randomSplit([0.8, 0.2])
model = pipeline_lr.fit(trainingData)

23/04/06 06:52:12 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/04/06 06:52:12 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [None]:
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy for logistic regression  = %g" % (accuracy))

Test set accuracy for logistic regression  = 0.611842


In [None]:
pipeline_rf = Pipeline(stages=[labelIndexer, assembler,scaler, rf])
pipeline_dtc = Pipeline(stages=[labelIndexer, assembler,scaler, dtc])

In [None]:
model_rf = pipeline_rf.fit(trainingData)
model_dtc = pipeline_dtc.fit(trainingData)

In [None]:
predictions = model_rf.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy for Random Forest Classifier  = %g" % (accuracy))

Test set accuracy for Random Forest Classifier  = 0.657895


In [None]:
predictions = model_dtc.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy for Decision Tree Classifier  = %g" % (accuracy))

Test set accuracy for Decision Tree Classifier  = 0.644737


## Using Standard Scaler

In [13]:
pipeline_lr = Pipeline(stages=[labelIndexer, assembler,scaler_ss,lr])
(trainingData, testData) = df.randomSplit([0.8, 0.2])
model = pipeline_lr.fit(trainingData)

In [14]:
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy for logistic regression  = %g" % (accuracy))

Test set accuracy for logistic regression  = 0.793333


In [15]:
pipeline_rf = Pipeline(stages=[labelIndexer, assembler,scaler_ss, rf])
pipeline_dtc = Pipeline(stages=[labelIndexer, assembler,scaler_ss, dtc])

model_rf = pipeline_rf.fit(trainingData)
model_dtc = pipeline_dtc.fit(trainingData)

In [16]:
predictions = model_rf.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy for Random Forest Classifier  = %g" % (accuracy))

Test set accuracy for Random Forest Classifier  = 0.786667


In [17]:
predictions = model_dtc.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy for Decision Tree Classifier  = %g" % (accuracy))

Test set accuracy for Decision Tree Classifier  = 0.713333
