In [31]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local") \
        .appName("Spark ML") \
        .config("spark.ui.port", "4050") \
        .getOrCreate()

sc = spark.sparkContext

In [32]:
raw_data = spark.read.format('csv').option('header', 'true').load('../../data/input5/diabetes.csv')
raw_data.printSchema()

root
 |-- Pregnancies: string (nullable = true)
 |-- Glucose: string (nullable = true)
 |-- BloodPressure: string (nullable = true)
 |-- SkinThickness: string (nullable = true)
 |-- Insulin: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- DiabetesPedigreeFunction: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Outcome: string (nullable = true)



In [33]:
from pyspark.sql.types import *

schema = StructType([
    StructField('Pregnancies', FloatType(), True),
    StructField('Glucose', FloatType(), True),
    StructField('BloodPressure', FloatType(), True),
    StructField('SkinThickness', FloatType(), True),
    StructField('Insulin', FloatType(), True),
    StructField('BMI', FloatType(), True),
    StructField('DiabetesPedigreeFunction', FloatType(), True),
    StructField('Age', IntegerType(), True),
    StructField('Outcome', IntegerType(), True),
])

raw_data = spark.read.format('csv').option('header', 'true').schema(schema).load('../../data/input5/diabetes.csv')
raw_data.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [34]:
raw_data.describe(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']).show()

+-------+-----------------+------------------+------------------+------------------+-----------------+
|summary|          Glucose|     BloodPressure|     SkinThickness|           Insulin|              BMI|
+-------+-----------------+------------------+------------------+------------------+-----------------+
|  count|              768|               768|               768|               768|              768|
|   mean|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.99257813890775|
| stddev|31.97261819513622|19.355807170644777|15.952217567727642|115.24400235133803|7.884160293010772|
|    min|              0.0|               0.0|               0.0|               0.0|              0.0|
|    max|            199.0|             122.0|              99.0|             846.0|             67.1|
+-------+-----------------+------------------+------------------+------------------+-----------------+



In [35]:
raw_data.select('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI').summary().show()

+-------+-----------------+------------------+------------------+------------------+-----------------+
|summary|          Glucose|     BloodPressure|     SkinThickness|           Insulin|              BMI|
+-------+-----------------+------------------+------------------+------------------+-----------------+
|  count|              768|               768|               768|               768|              768|
|   mean|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.99257813890775|
| stddev|31.97261819513622|19.355807170644777|15.952217567727642|115.24400235133803|7.884160293010772|
|    min|              0.0|               0.0|               0.0|               0.0|              0.0|
|    25%|             99.0|              62.0|               0.0|               0.0|             27.3|
|    50%|            117.0|              72.0|              23.0|              29.0|             32.0|
|    75%|            140.0|              80.0|              32.0|        

In [36]:
raw_data.rdd.map(lambda row: 1 if sum([c == None for c in row]) > 0 else 0)\
            .reduce(lambda x, y: x + y)

0

In [37]:
prep_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
raw_data.rdd.map(lambda row: 1 if sum([row[c] == 0 for c in prep_cols]) > 0 else 0)\
            .reduce(lambda x, y: x + y)

376

In [38]:
import pyspark.sql.functions as fn

for c in prep_cols:
    raw_data = raw_data.withColumn(c, fn.when(fn.col(c) == 0, None).otherwise(fn.col(c)))

raw_data.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|        6.0|  148.0|         72.0|         35.0|   null|33.6|                   0.627| 50|      1|
|        1.0|   85.0|         66.0|         29.0|   null|26.6|                   0.351| 31|      0|
|        8.0|  183.0|         64.0|         null|   null|23.3|                   0.672| 32|      1|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167| 21|      0|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [39]:
raw_data.select(*[(fn.count(c) / fn.count('*')).alias(c + '_missing') for c in prep_cols]).show()

+------------------+---------------------+---------------------+------------------+------------------+
|   Glucose_missing|BloodPressure_missing|SkinThickness_missing|   Insulin_missing|       BMI_missing|
+------------------+---------------------+---------------------+------------------+------------------+
|0.9934895833333334|   0.9544270833333334|   0.7044270833333334|0.5130208333333334|0.9856770833333334|
+------------------+---------------------+---------------------+------------------+------------------+



In [40]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=prep_cols, outputCols=prep_cols)
model = imputer.fit(raw_data)
raw_data = model.transform(raw_data)
raw_data.show(5)

+-----------+-------+-------------+-------------+---------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|  Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+---------+----+------------------------+---+-------+
|        6.0|  148.0|         72.0|         35.0|155.54822|33.6|                   0.627| 50|      1|
|        1.0|   85.0|         66.0|         29.0|155.54822|26.6|                   0.351| 31|      0|
|        8.0|  183.0|         64.0|     29.15342|155.54822|23.3|                   0.672| 32|      1|
|        1.0|   89.0|         66.0|         23.0|     94.0|28.1|                   0.167| 21|      0|
|        0.0|  137.0|         40.0|         35.0|    168.0|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+---------+----+------------------------+---+-------+
only showing top 5 rows



In [41]:
cols = raw_data.columns
cols.remove('Outcome')

In [42]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=cols, outputCol='features')
raw_data = assembler.transform(raw_data)
raw_data.select('features').show(5, truncate=False)

+-----------------------------------------------------------------------------------------------+
|features                                                                                       |
+-----------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,155.5482177734375,33.599998474121094,0.6269999742507935,50.0]             |
|[1.0,85.0,66.0,29.0,155.5482177734375,26.600000381469727,0.35100001096725464,31.0]             |
|[8.0,183.0,64.0,29.153419494628906,155.5482177734375,23.299999237060547,0.671999990940094,32.0]|
|[1.0,89.0,66.0,23.0,94.0,28.100000381469727,0.16699999570846558,21.0]                          |
|[0.0,137.0,40.0,35.0,168.0,43.099998474121094,2.2880001068115234,33.0]                         |
+-----------------------------------------------------------------------------------------------+
only showing top 5 rows



In [43]:
from pyspark.ml.feature import StandardScaler

standard_scaler = StandardScaler(inputCol='features', outputCol='scaled_features', withStd=True, withMean=False)
raw_data = standard_scaler.fit(raw_data).transform(raw_data)
raw_data.select('features', 'scaled_features').show(5)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[6.0,148.0,72.0,3...|[1.78063837321943...|
|[1.0,85.0,66.0,29...|[0.29677306220323...|
|[8.0,183.0,64.0,2...|[2.37418449762590...|
|[1.0,89.0,66.0,23...|[0.29677306220323...|
|[0.0,137.0,40.0,3...|[0.0,4.5012560836...|
+--------------------+--------------------+
only showing top 5 rows



In [44]:
train, test = raw_data.randomSplit([0.8, 0.2], seed=37)
train.count(), test.count()

(613, 155)

In [45]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='Outcome', featuresCol='scaled_features', maxIter=100)
model = lr.fit(train)
predict_train = model.transform(train)
predict_test = model.transform(test)
predict_test.select('Outcome', 'prediction').show()

+-------+----------+
|Outcome|prediction|
+-------+----------+
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      1|       0.0|
|      0|       0.0|
|      0|       0.0|
|      1|       0.0|
|      0|       0.0|
+-------+----------+
only showing top 20 rows



In [46]:
predict_test.select('Outcome', 'rawPrediction', 'probability', 'prediction').show(5, truncate=False)

+-------+----------------------------------------+----------------------------------------+----------+
|Outcome|rawPrediction                           |probability                             |prediction|
+-------+----------------------------------------+----------------------------------------+----------+
|0      |[2.6606282231648404,-2.6606282231648404]|[0.9346630414096359,0.06533695859036415]|0.0       |
|0      |[1.951762315123215,-1.951762315123215]  |[0.8756386772239592,0.12436132277604084]|0.0       |
|0      |[2.5766438051311003,-2.5766438051311003]|[0.929343203851855,0.07065679614814502] |0.0       |
|0      |[1.204333872708819,-1.204333872708819]  |[0.7692948579791697,0.23070514202083026]|0.0       |
|0      |[2.9971688375226133,-2.9971688375226133]|[0.9524460603511284,0.04755393964887156]|0.0       |
+-------+----------------------------------------+----------------------------------------+----------+
only showing top 5 rows



In [47]:
predict_test.rdd.map(lambda row: 1 if row['Outcome'] == row['prediction'] else 0)\
                .reduce(lambda x, y: x + y)

124

In [49]:
predict_test.rdd.map(lambda row: 1 if row['Outcome'] == row['prediction'] else 0).reduce(lambda x, y: x + y) / predict_test.count()

0.8

In [53]:
sc.stop()