In [25]:
import sys
import pandas as pd
import pyspark.ml
from pyspark.sql import functions 
#from pyspark.sql.functions import *
from pyspark.sql.functions import col, sum, when, mean
from pyspark.sql.functions import count as spark_count
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, MinMaxScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from sklearn.model_selection import GridSearchCV

In [26]:
# Check Spark Version
spark.version

'3.2.4'

In [27]:
# Check Python Version
sys.version_info

sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)

In [86]:
# Initialize a Spark session

#spark = SparkSession.builder.appName("MLP").getOrCreate()

In [28]:
# Import dataset from hadoop file share and print data types for each column

df = spark.read.load('hdfs://localhost:9000/ca2/cirrhosis.csv', format="csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- N_Days: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ascites: string (nullable = true)
 |-- Hepatomegaly: string (nullable = true)
 |-- Spiders: string (nullable = true)
 |-- Edema: string (nullable = true)
 |-- Bilirubin: double (nullable = true)
 |-- Cholesterol: string (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Copper: string (nullable = true)
 |-- Alk_Phos: string (nullable = true)
 |-- SGOT: string (nullable = true)
 |-- Tryglicerides: string (nullable = true)
 |-- Platelets: string (nullable = true)
 |-- Prothrombin: string (nullable = true)
 |-- Stage: string (nullable = true)



In [29]:
# No built in method for getting shape using Pyspark, so combining count and length methods 
num_rows = df.count()
num_columns = len(df.columns)
print(f"rows: {num_rows}, columns: {num_columns}")

rows: 418, columns: 20


In [30]:
# View dataset in pandas format for ease of review
df.limit(40).toPandas()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156,1718.0,137.95,172.0,190.0,12.2,4
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54,7394.8,113.52,88.0,221.0,10.6,3
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210,516.0,96.1,55.0,151.0,12.0,4
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64,6121.8,60.63,92.0,183.0,10.3,4
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143,671.0,113.15,72.0,136.0,10.9,3
5,6,2503,D,Placebo,24201,F,N,Y,N,N,0.8,248.0,3.98,50,944.0,93.0,63.0,,11.0,3
6,7,1832,C,Placebo,20284,F,N,Y,N,N,1.0,322.0,4.09,52,824.0,60.45,213.0,204.0,9.7,3
7,8,2466,D,Placebo,19379,F,N,N,N,N,0.3,280.0,4.0,52,4651.2,28.38,189.0,373.0,11.0,3
8,9,2400,D,D-penicillamine,15526,F,N,N,Y,N,3.2,562.0,3.08,79,2276.0,144.15,88.0,251.0,11.0,2
9,10,51,D,Placebo,25772,F,Y,N,Y,Y,12.6,200.0,2.74,140,918.0,147.25,143.0,302.0,11.5,4


In [31]:
# Iterate through all columns and check for null
for column in df.columns:
    null_count = df.where(df[column].isNull()).count()
    print(f"Column '{column}': {null_count} null values")

Column 'ID': 0 null values
Column 'N_Days': 0 null values
Column 'Status': 0 null values
Column 'Drug': 0 null values
Column 'Age': 0 null values
Column 'Sex': 0 null values
Column 'Ascites': 0 null values
Column 'Hepatomegaly': 0 null values
Column 'Spiders': 0 null values
Column 'Edema': 0 null values
Column 'Bilirubin': 0 null values
Column 'Cholesterol': 0 null values
Column 'Albumin': 0 null values
Column 'Copper': 0 null values
Column 'Alk_Phos': 0 null values
Column 'SGOT': 0 null values
Column 'Tryglicerides': 0 null values
Column 'Platelets': 0 null values
Column 'Prothrombin': 0 null values
Column 'Stage': 0 null values


In [32]:
# Iterate through all columns and check for NA values, information obtained from dataset description
# https://archive.ics.uci.edu/dataset/878/cirrhosis+patient+survival+prediction+dataset-1
for column in df.columns:
    na_count = df.where(df[column] == "NA").count()
    print(f"Column '{column}': {na_count} 'NA' values")

Column 'ID': 0 'NA' values
Column 'N_Days': 0 'NA' values
Column 'Status': 0 'NA' values
Column 'Drug': 106 'NA' values
Column 'Age': 0 'NA' values
Column 'Sex': 0 'NA' values
Column 'Ascites': 106 'NA' values
Column 'Hepatomegaly': 106 'NA' values
Column 'Spiders': 106 'NA' values
Column 'Edema': 0 'NA' values
Column 'Bilirubin': 0 'NA' values
Column 'Cholesterol': 134 'NA' values
Column 'Albumin': 0 'NA' values
Column 'Copper': 108 'NA' values
Column 'Alk_Phos': 106 'NA' values
Column 'SGOT': 106 'NA' values
Column 'Tryglicerides': 136 'NA' values
Column 'Platelets': 11 'NA' values
Column 'Prothrombin': 2 'NA' values
Column 'Stage': 6 'NA' values


In [33]:
# Look at the % of NA values per column, any feature over 50%
# will be removed from dataframe

# Calculate the percentage of "NA" values for each column
missing_percentage = df.select([(spark_count(when(col(c) == 'NA', 1)).alias(c)) for c in df.columns])

total_rows = df.count()

# Calculate percentages
missing_percentage = missing_percentage.select(
    *[(col(c) / total_rows * 100).alias(c) for c in missing_percentage.columns]
)

# Display the result
missing_percentage.toPandas()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,0.0,0.0,0.0,25.358852,0.0,0.0,25.358852,25.358852,25.358852,0.0,0.0,32.057416,0.0,25.837321,25.358852,25.358852,32.535885,2.631579,0.478469,1.435407


In [34]:
# Deal with NA missing values in continous features by imputing mean and replacing missing values

# Specify the list of columns to impute the mean for
columns_to_impute = ['N_Days', 'AGE', 'Cholesterol', 'Copper','Tryglicerides', 'Platelets', 'Stage', 'Albumin', 'SGOT', 'Prothrombin']

# Calculate the mean for each specified column
mean_values = df.select([mean(col(column)).alias(column) for column in columns_to_impute]).collect()[0]

# Extract the mean values store ina a dictionary
mean_values_dict = mean_values.asDict()

# replace NA values with the mean from the previous calculation
for column in columns_to_impute:
    mean_value = mean_values_dict[column]
    df = df.withColumn(column, when(col(column) == 'NA', mean_value).otherwise(col(column).cast('double')))

In [35]:
df.limit(40).toPandas()

Unnamed: 0,ID,N_Days,Status,Drug,AGE,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400.0,D,D-penicillamine,21464.0,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500.0,C,D-penicillamine,20617.0,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012.0,D,D-penicillamine,25594.0,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925.0,D,D-penicillamine,19994.0,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504.0,CL,Placebo,13918.0,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
5,6,2503.0,D,Placebo,24201.0,F,N,Y,N,N,0.8,248.0,3.98,50.0,944.0,93.0,63.0,257.02457,11.0,3.0
6,7,1832.0,C,Placebo,20284.0,F,N,Y,N,N,1.0,322.0,4.09,52.0,824.0,60.45,213.0,204.0,9.7,3.0
7,8,2466.0,D,Placebo,19379.0,F,N,N,N,N,0.3,280.0,4.0,52.0,4651.2,28.38,189.0,373.0,11.0,3.0
8,9,2400.0,D,D-penicillamine,15526.0,F,N,N,Y,N,3.2,562.0,3.08,79.0,2276.0,144.15,88.0,251.0,11.0,2.0
9,10,51.0,D,Placebo,25772.0,F,Y,N,Y,Y,12.6,200.0,2.74,140.0,918.0,147.25,143.0,302.0,11.5,4.0


In [36]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- N_Days: double (nullable = true)
 |-- Status: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- AGE: double (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ascites: string (nullable = true)
 |-- Hepatomegaly: string (nullable = true)
 |-- Spiders: string (nullable = true)
 |-- Edema: string (nullable = true)
 |-- Bilirubin: double (nullable = true)
 |-- Cholesterol: double (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Copper: double (nullable = true)
 |-- Alk_Phos: string (nullable = true)
 |-- SGOT: double (nullable = true)
 |-- Tryglicerides: double (nullable = true)
 |-- Platelets: double (nullable = true)
 |-- Prothrombin: double (nullable = true)
 |-- Stage: double (nullable = true)



In [37]:
# Define a list of columns to convert to integer and float
columns_to_integer = ['N_Days', 'AGE', 'Bilirubin', 'Cholesterol', 'Copper','Tryglicerides', 'Platelets', 'Stage', "Alk_Phos"]
columns_to_float = ['Albumin', 'SGOT', 'Prothrombin']

# Convert the specified columns to integer
for column in columns_to_integer:
    df = df.withColumn(column, col(column).cast("integer"))
    
# Convert the specified columns to float
for column in columns_to_float:
    df = df.withColumn(column, col(column).cast("float"))

In [38]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- N_Days: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ascites: string (nullable = true)
 |-- Hepatomegaly: string (nullable = true)
 |-- Spiders: string (nullable = true)
 |-- Edema: string (nullable = true)
 |-- Bilirubin: integer (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- Albumin: float (nullable = true)
 |-- Copper: integer (nullable = true)
 |-- Alk_Phos: integer (nullable = true)
 |-- SGOT: float (nullable = true)
 |-- Tryglicerides: integer (nullable = true)
 |-- Platelets: integer (nullable = true)
 |-- Prothrombin: float (nullable = true)
 |-- Stage: integer (nullable = true)



In [39]:
for column in df.columns:
    na_count = df.where(df[column] == "NA").count()
    print(f"Column '{column}': {na_count} 'NA' values")

Column 'ID': 0 'NA' values
Column 'N_Days': 0 'NA' values
Column 'Status': 0 'NA' values
Column 'Drug': 106 'NA' values
Column 'AGE': 0 'NA' values
Column 'Sex': 0 'NA' values
Column 'Ascites': 106 'NA' values
Column 'Hepatomegaly': 106 'NA' values
Column 'Spiders': 106 'NA' values
Column 'Edema': 0 'NA' values
Column 'Bilirubin': 0 'NA' values
Column 'Cholesterol': 0 'NA' values
Column 'Albumin': 0 'NA' values
Column 'Copper': 0 'NA' values
Column 'Alk_Phos': 0 'NA' values
Column 'SGOT': 0 'NA' values
Column 'Tryglicerides': 0 'NA' values
Column 'Platelets': 0 'NA' values
Column 'Prothrombin': 0 'NA' values
Column 'Stage': 0 'NA' values


In [17]:
# change 'Status' for observations = 0 representing deceased patients, this is our dependent variable
#df = df.withColumn('Status',functions.when(df['Status']=='D',0).otherwise(1))

In [40]:
df.limit(5).toPandas()

Unnamed: 0,ID,N_Days,Status,Drug,AGE,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14,261,2.6,156,1718,137.949997,172,190,12.2,4
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1,302,4.14,54,7394,113.519997,88,221,10.6,3
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1,176,3.48,210,516,96.099998,55,151,12.0,4
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1,244,2.54,64,6121,60.630001,92,183,10.3,4
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3,279,3.53,143,671,113.150002,72,136,10.9,3


In [41]:
# List of columns to keep
focus_columns = ['ID', 'Cholesterol','Copper','Tryglicerides','Platelets','SGOT','Status']

# Drop all columns except the ones in columns_to_keep
bloodwork_df = df[focus_columns]
bloodwork_df.limit(5).toPandas()

Unnamed: 0,ID,Cholesterol,Copper,Tryglicerides,Platelets,SGOT,Status
0,1,261,156,172,190,137.949997,D
1,2,302,54,88,221,113.519997,C
2,3,176,210,55,151,96.099998,D
3,4,244,64,92,183,60.630001,D
4,5,279,143,72,136,113.150002,CL


In [42]:
# Create a vector of features for the model
vectorAssembler = VectorAssembler(inputCols = ['Cholesterol','Copper','Tryglicerides','Platelets'], outputCol = 'features')
bloodwork_df = vectorAssembler.transform(bloodwork_df)
bloodwork_df.show(5)

+---+-----------+------+-------------+---------+------+------+--------------------+
| ID|Cholesterol|Copper|Tryglicerides|Platelets|  SGOT|Status|            features|
+---+-----------+------+-------------+---------+------+------+--------------------+
|  1|        261|   156|          172|      190|137.95|     D|[261.0,156.0,172....|
|  2|        302|    54|           88|      221|113.52|     C|[302.0,54.0,88.0,...|
|  3|        176|   210|           55|      151|  96.1|     D|[176.0,210.0,55.0...|
|  4|        244|    64|           92|      183| 60.63|     D|[244.0,64.0,92.0,...|
|  5|        279|   143|           72|      136|113.15|    CL|[279.0,143.0,72.0...|
+---+-----------+------+-------------+---------+------+------+--------------------+
only showing top 5 rows



In [43]:
indexer = StringIndexer(inputCol = 'Status', outputCol = 'label')
bloodwork_df = indexer.fit(bloodwork_df).transform(bloodwork_df)
bloodwork_df.show(5)

+---+-----------+------+-------------+---------+------+------+--------------------+-----+
| ID|Cholesterol|Copper|Tryglicerides|Platelets|  SGOT|Status|            features|label|
+---+-----------+------+-------------+---------+------+------+--------------------+-----+
|  1|        261|   156|          172|      190|137.95|     D|[261.0,156.0,172....|  1.0|
|  2|        302|    54|           88|      221|113.52|     C|[302.0,54.0,88.0,...|  0.0|
|  3|        176|   210|           55|      151|  96.1|     D|[176.0,210.0,55.0...|  1.0|
|  4|        244|    64|           92|      183| 60.63|     D|[244.0,64.0,92.0,...|  1.0|
|  5|        279|   143|           72|      136|113.15|    CL|[279.0,143.0,72.0...|  2.0|
+---+-----------+------+-------------+---------+------+------+--------------------+-----+
only showing top 5 rows



In [44]:
# Check all observations present
bloodwork_df.select('Status','label').groupBy('Status','label').count().show()

+------+-----+-----+
|Status|label|count|
+------+-----+-----+
|    CL|  2.0|   25|
|     C|  0.0|  232|
|     D|  1.0|  161|
+------+-----+-----+



In [45]:
splits = bloodwork_df.randomSplit([0.6,0.4],1)
train_df = splits[0]
test_df = splits[1]
train_df.count(), test_df.count(), bloodwork_df.count()

(248, 170, 418)

In [46]:
layers = [4, 5, 5, 3]
mlp = MultilayerPerceptronClassifier(layers = layers, maxIter=100, seed = 1)
mlp_model = mlp.fit(train_df)

In [47]:
pred_df = mlp_model.transform(test_df)
pred_df.select('Id','features','label','rawPrediction','probability','prediction').show(5)

+---+--------------------+-----+--------------------+--------------------+----------+
| Id|            features|label|       rawPrediction|         probability|prediction|
+---+--------------------+-----+--------------------+--------------------+----------+
|  1|[261.0,156.0,172....|  1.0|[0.09464074457615...|[0.29792791703442...|       1.0|
|  5|[279.0,143.0,72.0...|  2.0|[0.46588443013636...|[0.50212553395372...|       0.0|
|  6|[248.0,50.0,63.0,...|  1.0|[0.46588480708670...|[0.50212571542912...|       0.0|
| 10|[200.0,140.0,143....|  1.0|[0.09464074473421...|[0.29792791710969...|       1.0|
| 14|[369.0,43.0,124.0...|  1.0|[0.46588443013636...|[0.50212553395372...|       0.0|
+---+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [48]:
evaluator = MulticlassClassificationEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'accuracy')
mlpacc = evaluator.evaluate(pred_df)
mlpacc

0.6352941176470588