In [24]:
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import pandas as pd
from pyspark.sql import functions 

In [9]:
# Display the version of the running Spark session.
# NOTE(review): `spark` is not created in any visible cell — presumably it is
# provided by the kernel / PySpark shell; confirm before running on a plain kernel.
spark.version

'3.2.4'

In [10]:
# Display the Python interpreter version (major/minor/micro) used by this kernel
sys.version_info

sys.version_info(major=3, minor=10, micro=12, releaselevel='final', serial=0)

In [11]:
# Read the cirrhosis CSV from the Hadoop file share, treating the first row as
# the header and letting Spark infer each column's type, then print the schema.
# NOTE(review): columns holding the literal text 'NA' will presumably be inferred
# as string rather than numeric; a CSV `nullValue` option may be worth considering
# — confirm first, because later cells count the 'NA' strings explicitly.
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load('hdfs://localhost:9000/ca2/cirrhosis.csv')
)
df.printSchema()

                                                                                

root
 |-- ID: integer (nullable = true)
 |-- N_Days: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Ascites: string (nullable = true)
 |-- Hepatomegaly: string (nullable = true)
 |-- Spiders: string (nullable = true)
 |-- Edema: string (nullable = true)
 |-- Bilirubin: double (nullable = true)
 |-- Cholesterol: string (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Copper: string (nullable = true)
 |-- Alk_Phos: string (nullable = true)
 |-- SGOT: string (nullable = true)
 |-- Tryglicerides: string (nullable = true)
 |-- Platelets: string (nullable = true)
 |-- Prothrombin: string (nullable = true)
 |-- Stage: string (nullable = true)



In [28]:
# View dataset in pandas format for ease of review.
# Only the first 5 rows are taken (limit) before converting, so just that
# small sample is turned into a pandas DataFrame for display.
df.limit(5).toPandas()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261,2.6,156,1718.0,137.95,172,190,12.2,4
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302,4.14,54,7394.8,113.52,88,221,10.6,3
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176,3.48,210,516.0,96.1,55,151,12.0,4
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244,2.54,64,6121.8,60.63,92,183,10.3,4
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279,3.53,143,671.0,113.15,72,136,10.9,3


In [13]:
# Summary statistics (count / mean / stddev / min / max) for the dependent
# variable Status
status_only = df.select('Status')
status_only.describe().show()

+-------+------+
|summary|Status|
+-------+------+
|  count|   418|
|   mean|  null|
| stddev|  null|
|    min|     C|
|    max|     D|
+-------+------+



In [13]:
# Encode the dependent variable Status as a binary label:
#   0 = deceased patients ('D') — this is our target class
#   1 = every other status
# The cell overwrites `df`, so it is guarded on the column type to stay safe to
# re-run: once Status is numeric, the comparison with 'D' can never match and a
# second execution would silently relabel every row as 1.
if dict(df.dtypes)['Status'] == 'string':
    df = df.withColumn('Status', functions.when(df['Status'] == 'D', 0).otherwise(1))

In [31]:
# Count null values in every column.
# All counts are computed in one aggregation (a single Spark job) instead of
# running a separate filter-and-count job per column. `when` yields a value
# only for null cells and `count` ignores the rest, so a column without nulls
# reports 0.
null_counts = df.select(
    [functions.count(functions.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first().asDict()

for column in df.columns:
    print(f"Column '{column}': {null_counts[column]} null values")

Column 'ID': 0 null values
Column 'N_Days': 0 null values
Column 'Status': 0 null values
Column 'Drug': 0 null values
Column 'Age': 0 null values
Column 'Sex': 0 null values
Column 'Ascites': 0 null values
Column 'Hepatomegaly': 0 null values
Column 'Spiders': 0 null values
Column 'Edema': 0 null values
Column 'Bilirubin': 0 null values
Column 'Cholesterol': 0 null values
Column 'Albumin': 0 null values
Column 'Copper': 0 null values
Column 'Alk_Phos': 0 null values
Column 'SGOT': 0 null values
Column 'Tryglicerides': 0 null values
Column 'Platelets': 0 null values
Column 'Prothrombin': 0 null values
Column 'Stage': 0 null values


In [32]:
# Count cells holding the literal string "NA" in every column (the raw file
# uses "NA" text as its missing-value placeholder, which the null check does
# not detect).
# All counts are computed in one aggregation (a single Spark job) instead of
# running a separate filter-and-count job per column. Rows where the comparison
# is false or null are ignored by `count`, so such columns report 0.
na_counts = df.select(
    [functions.count(functions.when(df[name] == "NA", 1)).alias(name) for name in df.columns]
).first().asDict()

for column in df.columns:
    print(f"Column '{column}': {na_counts[column]} 'NA' values")

Column 'ID': 0 'NA' values
Column 'N_Days': 0 'NA' values
Column 'Status': 0 'NA' values
Column 'Drug': 106 'NA' values
Column 'Age': 0 'NA' values
Column 'Sex': 0 'NA' values
Column 'Ascites': 106 'NA' values
Column 'Hepatomegaly': 106 'NA' values
Column 'Spiders': 106 'NA' values
Column 'Edema': 0 'NA' values
Column 'Bilirubin': 0 'NA' values
Column 'Cholesterol': 134 'NA' values
Column 'Albumin': 0 'NA' values
Column 'Copper': 108 'NA' values
Column 'Alk_Phos': 106 'NA' values
Column 'SGOT': 106 'NA' values
Column 'Tryglicerides': 136 'NA' values
Column 'Platelets': 11 'NA' values
Column 'Prothrombin': 2 'NA' values
Column 'Stage': 6 'NA' values


In [None]:
# TODO: impute missing values for numeric features that contain 'NA' placeholders? (open question)

In [None]:
# TODO: drop rows with 'NA' values in categorical features? (open question)

In [None]:
# TODO: determine each feature's correlation with the dependent variable and keep a subset of features

In [None]:
# TODO: transform variables into a single feature vector using VectorAssembler? (open question)