# Import and Explore Data

In [23]:
path = "s3://ms-syntheamass-1m-mldata/syntheamassCSV/covid/10k_synthea_covid19_csv/"
bucket_file = "indexed_data_2.csv"

indexed = spark.read.format("csv").option("inferSchema", True)\
.option("header", True).load(path + bucket_file)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
# cols = indexed.columns
# cols.remove("D_dimer")
# cols.remove("Serum_Ferritin")
# cols.remove("High_Sensitivity_Cardiac_Troponin_I")
# cols.remove("IL_6")
# cols.remove("Lymphocytes")
# cols.remove("Lactate_dehydrogenase")
# # cols.remove("AGE")
# cols.remove("icu_admitted_index")

# indexed = indexed.select(cols)
# indexed.columns
# indexed = indexed.select("Id", "BMI", "AGE", "DECEASED_index")
indexed = indexed.where("AGE <= 50")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
indexed.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

5440

In [26]:
indexed.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---------+-----------+---------+--------------+-----------------------------------+----+-----------+---------------------+-------------------+-------------------+----------+---------------+------------+--------------+------------------+----+
|                  Id|      BMI|Cholesterol|  D_dimer|Serum_Ferritin|High_Sensitivity_Cardiac_Troponin_I|IL_6|Lymphocytes|Lactate_dehydrogenase|HEALTHCARE_EXPENSES|HEALTHCARE_COVERAGE|RACE_index|ETHNICITY_index|GENDER_index|DECEASED_index|icu_admitted_index| AGE|
+--------------------+---------+-----------+---------+--------------+-----------------------------------+----+-----------+---------------------+-------------------+-------------------+----------+---------------+------------+--------------+------------------+----+
|00ebc091-9748-42c...|25.989397|  183.77635|      0.0|         498.0|                                3.0| 6.0|        0.0|                244.0|             6236.4|                0.0|       1.0|            0

# Create Features Column


In [27]:
cols = indexed.columns
cols.remove("DECEASED_index")
cols.remove("Id")

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols, outputCol='features')

indexed = assembler.transform(indexed)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Scale Features Column

In [28]:
from pyspark.ml.feature import StandardScaler
standardscaler = StandardScaler().setInputCol("features").setOutputCol("scaled_features")

scaled = standardscaler.fit(indexed).transform(indexed)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Split Data into Train and Test sets

In [29]:
train, test = scaled.randomSplit([0.8, 0.2], seed=12345)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
# Analyzing distributions

percent_pos_total = float(scaled.where("DECEASED_index == 1").count()) \
/ scaled.count() * 100

percent_pos_train = float(train.where("DECEASED_index == 1").count()) \
/ train.count() * 100

percent_pos_test = float(test.where("DECEASED_index == 1").count()) \
/ test.count() * 100

print("percent deceased for total covid patients:", percent_pos_total)

print("percent deceased for train:", percent_pos_train)

print("percent deceased for test:", percent_pos_test)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

percent deceased for total covid patients: 0.6066176470588236
percent deceased for train: 0.6484483557202408
percent deceased for test: 0.4456327985739751

# Handle the imbalance between alive and deceased patients


In [31]:

num_negs = float(train.where("DECEASED_index == 0").count())
                 
balancing_ratio = num_negs/train.count()

print("Balancing ratio:", balancing_ratio)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Balancing ratio: 0.9935155164427976

In [32]:
from pyspark.sql import functions

train = train.withColumn("class_weights", functions.when(train.DECEASED_index == 1, balancing_ratio)
                        .otherwise(1 - balancing_ratio))
train.select("class_weights").show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+
|       class_weights|
+--------------------+
|0.006484483557202436|
|0.006484483557202436|
|0.006484483557202436|
|  0.9935155164427976|
|0.006484483557202436|
+--------------------+
only showing top 5 rows

# Feature Selection using Chi Square Selector

In [33]:
from pyspark.ml.feature import ChiSqSelector

css = ChiSqSelector(featuresCol='scaled_features', outputCol = 'Aspect', 
                   labelCol='DECEASED_index', fpr=0.05)

train = css.fit(train).transform(train)

test = css.fit(test).transform(test)

test.select("Aspect").show(1, truncate=False)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Aspect                                                                                                                                                                                                                                     |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[5.323836845202088,24.19937577609338,1.819250365755044,7.589399726115801,3.0340805024860744,167.30324573864803,3.3282844435503494,25.15907568218014,3.0594307945041854,0.46035965036473825,0.0,0.0,2.00266380830145,0.0,2.9674272899731813]|
+-----------------------------------------------

# Building classification model using logistic regression

In [34]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="DECEASED_index", 
                       featuresCol="Aspect", weightCol = "class_weights", maxIter = 5)
model = lr.fit(train)
predict_train = model.transform(train)

predict_test = model.transform(test)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Model Evaluation

In [35]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="DECEASED_index")

predict_test.select("DECEASED_index", "rawPrediction", "prediction", "probability").show(5)

print("The area under ROC for train set is:", evaluator.evaluate(predict_train))
print("The area under ROC for test set is:", evaluator.evaluate(predict_test))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------+--------------------+----------+--------------------+
|DECEASED_index|       rawPrediction|prediction|         probability|
+--------------+--------------------+----------+--------------------+
|           0.0|[1.96355521897219...|       0.0|[0.87691719389683...|
|           0.0|[3.15953545439111...|       0.0|[0.95928280547093...|
|           0.0|[3.62826288184007...|       0.0|[0.97412501263454...|
|           0.0|[3.99618387222461...|       0.0|[0.98194626276465...|
|           0.0|[5.24016706432456...|       0.0|[0.99472856387198...|
+--------------+--------------------+----------+--------------------+
only showing top 5 rows

The area under ROC for train set is: 0.9993839493839494
The area under ROC for test set is: 0.9998209489704566

In [36]:
sc.install_pypi_package("matplotlib")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Package already installed for current Spark context!
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 1110, in install_pypi_package
    raise ValueError("Package already installed for current Spark context!")
ValueError: Package already installed for current Spark context!



In [37]:
sc.install_pypi_package("pandas")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Package already installed for current Spark context!
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 1110, in install_pypi_package
    raise ValueError("Package already installed for current Spark context!")
ValueError: Package already installed for current Spark context!



In [38]:
sc.install_pypi_package("handyspark")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Package already installed for current Spark context!
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 1110, in install_pypi_package
    raise ValueError("Package already installed for current Spark context!")
ValueError: Package already installed for current Spark context!



# Thresholds and Metrics

In [39]:
from handyspark import BinaryClassificationMetrics

bcm = BinaryClassificationMetrics(predict_test, scoreCol='probability', labelCol='DECEASED_index')

bcm.getMetricsByThreshold().filter('fpr between 0.19 and 0.21').toPandas()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

    threshold       fpr  recall  precision
0    0.108336  0.190689     1.0   0.022936
1    0.108295  0.191585     1.0   0.022831
2    0.108146  0.192480     1.0   0.022727
3    0.107967  0.193375     1.0   0.022624
4    0.107808  0.194270     1.0   0.022523
5    0.107686  0.195166     1.0   0.022422
6    0.107134  0.196061     1.0   0.022321
7    0.106942  0.196956     1.0   0.022222
8    0.106934  0.197851     1.0   0.022124
9    0.106820  0.198747     1.0   0.022026
10   0.106635  0.199642     1.0   0.021930
11   0.106540  0.200537     1.0   0.021834
12   0.106403  0.201432     1.0   0.021739
13   0.106250  0.202328     1.0   0.021645
14   0.105894  0.203223     1.0   0.021552
15   0.105678  0.204118     1.0   0.021459
16   0.105496  0.205013     1.0   0.021368
17   0.105435  0.205909     1.0   0.021277
18   0.105047  0.206804     1.0   0.021186
19   0.104819  0.207699     1.0   0.021097
20   0.104636  0.208594     1.0   0.021008
21   0.104633  0.209490     1.0   0.020921

# Confusion Matrix

In [40]:
bcm.print_confusion_matrix(.5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

         Predicted     
                 0    1
Actual 0    1116.0  1.0
       1       0.0  5.0

In [41]:
# from pyspark.ml.classification import LogisticRegression
# import matplotlib.pyplot as plt
# import numpy as np

# beta = np.sort(model.coefficients)

# plt.plot(beta)
# plt.ylabel('Beta Coefficients')
# plt.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# trainingSummary = model.summary
# roc = trainingSummary.roc.toPandas()
# plt.plot(roc['FPR'],roc['TPR'])
# plt.ylabel('False Positive Rate')
# plt.xlabel('True Positive Rate')
# plt.title('ROC Curve')
# plt.show()
# print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…