In [1]:
import findspark
findspark.init()

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml import Pipeline
from __future__ import print_function
import seaborn as sns
import imblearn
import pandas as pd
from imblearn.over_sampling import SMOTE
import math 
import pyspark.sql.functions as F
from sklearn.preprocessing import LabelEncoder

In [64]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
sc = SparkContext.getOrCreate();
spark = SparkSession \
    .builder \
    .appName("HealthCarePrediction") \
    .getOrCreate()

In [3]:
rawDF = spark.read.csv('healthcare-dataset-stroke-data.csv', header=True, inferSchema=True)
rawDF.show()

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
|31112|  Male|80.0|           0|            1|         Yes|      Private|         Rural|           105.92|32.5|   never smoked|     1|
|60182|Female|49.0|           0|            0|         Yes|      Private|         Urban|           171.23|34.4|         smokes|     1|
| 1665|Female|79.0|           1|            0|         

In [4]:
rawDF = rawDF.dropna()
rawDF = rawDF.filter(rawDF['bmi'] != "N/A")
rawDF = rawDF.filter(rawDF['gender'] != "Other")
rawDF = rawDF.withColumn("bmi",rawDF.bmi.cast('double'))

In [46]:
from pyspark.ml.feature import OneHotEncoder, StandardScaler, StringIndexer, VectorAssembler
cat_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
stringIndexedDF = rawDF
for features in cat_features:
    # Index Categorical Features
    string_indexer = StringIndexer(inputCol=features, outputCol=features + "_index")
    stringIndexedDF = string_indexer.fit(stringIndexedDF).transform(stringIndexedDF)
for features in cat_features:     
    stringIndexedDF = stringIndexedDF.withColumn(features+"_index",stringIndexedDF[features+"_index"].cast('int'))

stringIndexedDF = stringIndexedDF.drop(*cat_features)

In [49]:
stringIndexedDF.printSchema()

root
 |-- id: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- stroke: integer (nullable = true)
 |-- gender_index: integer (nullable = true)
 |-- ever_married_index: integer (nullable = true)
 |-- work_type_index: integer (nullable = true)
 |-- Residence_type_index: integer (nullable = true)
 |-- smoking_status_index: integer (nullable = true)



In [50]:
X = stringIndexedDF.drop('stroke')
Y = stringIndexedDF.select('stroke')
stk = SMOTE(random_state=42)
X_res,y_res = stk.fit_resample(X.toPandas(),Y.toPandas())
joinDF = pd.concat([X_res, y_res], axis=1, join="inner")
balancedData = spark.createDataFrame(joinDF)

In [51]:
balancedData.printSchema()

root
 |-- id: long (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: long (nullable = true)
 |-- heart_disease: long (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: double (nullable = true)
 |-- gender_index: long (nullable = true)
 |-- ever_married_index: long (nullable = true)
 |-- work_type_index: long (nullable = true)
 |-- Residence_type_index: long (nullable = true)
 |-- smoking_status_index: long (nullable = true)
 |-- stroke: long (nullable = true)



In [52]:
def select_features_to_scale(df=balancedData, lower_skew=-2, upper_skew=2, dtypes='double'):
    
    # Empty Selected Feature List for Output
    selected_features = []
    
    # Select Features to Scale based on Inputs ('in32' type, drop 'ID' columns or others, skew bounds)
    feature_list = list(df.toPandas().select_dtypes(include=[dtypes]).columns)
    
    # Loop through 'feature_list' to select features based on Kurtosis / Skew
    for feature in feature_list:

        if df.toPandas()[feature].kurtosis() < -2 or df.toPandas()[feature].kurtosis() > 2:
            
            selected_features.append(feature)
    
    # Return feature list to scale
    return selected_features

In [53]:
index_features = ['gender_index', 'ever_married_index', 'work_type_index', 'Residence_type_index', 'smoking_status_index']

encoderDF = balancedData

for features in index_features:
    encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()],
                                    outputCols=[features + "_class_vec"])
    encoderDF = encoder.fit(encoderDF).transform(encoderDF)

encoderDF = encoderDF.drop(*index_features)

In [54]:
label = 'stroke'
stages = []
num_features = ['age','hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
label_str_index =  StringIndexer(inputCol=label, outputCol="label_index")

# Scale Feature: Select the Features to Scale using helper 'select_features_to_scale' function above and Standardize 
unscaled_features = select_features_to_scale(df=encoderDF, lower_skew=-2, upper_skew=2, dtypes='double')

unscaled_assembler = VectorAssembler(inputCols=unscaled_features, outputCol="unscaled_features")
scaler = StandardScaler(inputCol="unscaled_features", outputCol="scaled_features")

stages += [unscaled_assembler, scaler]

# Create list of Numeric Features that Are Not Being Scaled
num_unscaled_diff_list = list(set(num_features) - set(unscaled_features))

# Assemble or Concat the Categorical Features and Numeric Features
assembler_inputs = [feature + "_class_vec" for feature in index_features] + num_unscaled_diff_list

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs") 

stages += [label_str_index, assembler]

# Assemble Final Training Data of Scaled, Numeric, and Categorical Engineered Features
assembler_final = VectorAssembler(inputCols=["scaled_features","assembled_inputs"], outputCol="features")

stages += [assembler_final]

In [55]:
#Set Pipeline
pipeline = Pipeline(stages=stages)

# Fit Pipeline to Data
pipeline_model = pipeline.fit(encoderDF)

# Transform Data using Fitted Pipeline
df_transform = pipeline_model.transform(encoderDF)

In [56]:
df_transform_fin = df_transform.select('features','label_index')
# df_transform_fin.show()
# df_transform_fin.count()
print ("So lan xuat hien cua stroke la 0: ",df_transform_fin.filter(df_transform_fin['label_index'] == 0).count())
print ("So lan xuat hien cua stroke la 1: ",df_transform_fin.filter(df_transform_fin['label_index'] == 1).count())

So lan xuat hien cua stroke la 0:  4699
So lan xuat hien cua stroke la 1:  4699


In [61]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label_index", featuresCol="features")
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
train_data, test_data = df_transform_fin.randomSplit([.7, .3],seed=1234)
model = dt.fit(train_data)
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
model.save('model/decision_tree')
print(accuracy)

0.803539183820874


In [70]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label_index", featuresCol="features", numTrees=10)
train_data, test_data = df_transform_fin.randomSplit([.6, .4],seed=1234)
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
rfModel.save('model/random_forest')
print(accuracy)

0.8061579651941098


In [71]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="label_index", featuresCol="features", maxIter=10)
train_data, test_data = df_transform_fin.randomSplit([.7, .3],seed=1534)
gbtModel = gbt.fit(train_data)
gbtPredictions = gbtModel.transform(test_data)
accuracy = evaluator.evaluate(gbtPredictions)
gbtModel.save('model/gbt')
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.17424
