In [1]:
# Location and format of the uploaded dataset
file_location = "/FileStore/tables/indian_liver_patient__1_-a7752.csv"
file_type = "csv"

# Reader settings for CSV input
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options only apply to CSV files; other file types ignore them.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

display(df)

Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1
46,Male,1.8,0.7,208,19,14,7.6,4.4,1.3,1
26,Female,0.9,0.2,154,16,12,7.0,3.5,1.0,1
29,Female,0.9,0.3,202,14,11,6.7,3.6,1.1,1
17,Male,0.9,0.3,202,22,19,7.4,4.1,1.2,2
55,Male,0.7,0.2,290,53,58,6.8,3.4,1.0,1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the full dataset into a pandas DataFrame (it is split into train/test sets later)
liver_df = pd.read_csv(
    filepath_or_buffer='/dbfs/FileStore/tables/indian_liver_patient__1_-a7752.csv',
)

In [4]:
# Preview the first rows to sanity-check the parsed columns
liver_df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
# Summarise column dtypes and non-null counts
liver_df.info()

In [6]:
# describe(include='all') summarises every column, so the categorical Gender column
# is reported through the unique/top/freq rows alongside the numeric statistics.
liver_df.describe(include='all')
# Albumin_and_Globulin_Ratio has only 579 valid entries out of 583, i.e. 4 missing values.
# Gender has only 2 values - Male/Female

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
count,583.0,583,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
unique,,2,,,,,,,,,
top,,Male,,,,,,,,,
freq,,441,,,,,,,,,
mean,44.746141,,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0


In [7]:
# List the feature (column) names available in the dataset
liver_df.columns

In [8]:
# Count the missing (null) values in each column
liver_df.isnull().sum()

In [9]:
# Data visualization: class balance of the target column
# ('Dataset' is 1 for liver disease, 2 for no liver disease).
sns.countplot(data=liver_df, x='Dataset', label='Count')

# Look the counts up by class label. Unpacking value_counts() positionally only
# works while class 1 happens to be the more frequent one (results are sorted by
# frequency) and while there are exactly two classes.
dataset_counts = liver_df['Dataset'].value_counts()
LD = dataset_counts.get(1, 0)
NLD = dataset_counts.get(2, 0)
print('Percentage of patients diagnosed with liver disease: ',LD / (LD+NLD) * 100)

In [10]:
# NOTE(review): presumably renders the count plot from the previous cell in this Databricks notebook — confirm
display()

In [11]:
#Subjects in this study are more likely to have liver disease than the general public

#Out of the 583 subjects, 71.35% (416) had liver disease and 28.65% (167) did not. In the general population, we do not expect to see more than two out of three people with liver disease. This points to a bias in the data collection: subjects who were more likely to have liver problems were more likely to be selected. In a larger study with 633,323 subjects, approximately 0.27% were found to have liver cirrhosis [1].

#For example, a predictor that always predicts liver disease would be correct about 71% of the time on this dataset, but in the general population it would be correct only about 0.27% of the time.

#However, this group of subjects might better reflect the distribution of people who visited a Hepatologist (liver specialist).

In [12]:
# Gender balance of the cohort and the liver-disease rate within each gender.
sns.countplot(data=liver_df, x='Gender', label='Count')

# Look the counts up by label. Unpacking value_counts() positionally would swap
# M and F whenever females outnumber males, because results are sorted by frequency.
gender_counts = liver_df['Gender'].value_counts()
M = gender_counts.get('Male', 0)
F = gender_counts.get('Female', 0)

# Dataset == 1 marks patients with liver disease
has_liver_disease = liver_df['Dataset'] == 1
malesWithLiverDisease = liver_df[(liver_df['Gender'] == 'Male') & has_liver_disease]['Age'].count()
femalesWithLiverDisease = liver_df[(liver_df['Gender'] == 'Female') & has_liver_disease]['Age'].count()
patientsWithLiverDisease = liver_df[has_liver_disease]['Age'].count()
totalPatients = liver_df['Age'].count()
print('Percent of patients that have liver disease: ',patientsWithLiverDisease /totalPatients * 100)
print('Percent of male patients that have liver disease: ',malesWithLiverDisease /M * 100)
print('Percent of female patients that have liver disease: ',femalesWithLiverDisease /F * 100)

print('Number of patients that are male: ',M)
print('Number of patients that are female: ',F)

In [13]:
# NOTE(review): presumably renders the gender count plot from the previous cell — confirm
display()

In [14]:
#Women are less likely to have liver disease. Is this real or biased data?

#The fraction of women with liver disease (64.78%) was lower than the fraction of men with liver disease (73.46%). This is contrary to studies reporting that women are more susceptible to liver disease [2]. The higher prevalence of liver disease among men in this dataset could be due to high rates of alcohol consumption among men in India (where women drinking alcohol was a social taboo). This might not be true in western countries.

#If our aim is to predict the presence of liver disease outside of India, to avoid bias (as our model is likely to guess that being a woman makes having liver disease less likely instead of more), we should disregard the gender in this study. On the other hand, including gender will help improve results for a predictive task in India.

In [15]:
# Mean age per class: patients with liver disease (Dataset == 1) are older on average
liver_df.groupby('Dataset')[['Age']].mean()

Unnamed: 0_level_0,Age
Dataset,Unnamed: 1_level_1
1,46.153846
2,41.239521


In [16]:
fig = plt.figure(figsize=(20, 24), dpi=80, facecolor='w', edgecolor='k')

# Reshape to long format (one row per patient and measurement) so that every
# quantity gets a pair of boxes, one per Dataset class.
long_df = (
    liver_df
    .drop(['Gender'], axis='columns')
    .set_index('Dataset', append=True)
    .stack()
    .to_frame()
    .reset_index()
    .rename(columns={'level_2': 'quantity', 0: 'value'})
)
ax = sns.boxplot(data=long_df, x='quantity', y='value', hue='Dataset')
# Cap the y-axis at 500 so the largest values do not flatten the boxes
ax.set(ylim=(0, 500))

In [17]:
# NOTE(review): presumably renders the box plot from the previous cell — confirm
display()

In [18]:
# Correlation matrix of the numeric columns.
# Gender holds strings, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently dropped them), so the numeric columns
# are selected explicitly; this works on old and new pandas alike.
liver_corr = liver_df.select_dtypes(include='number').corr()
liver_corr

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
Age,1.0,0.011763,0.007529,0.080425,-0.086883,-0.01991,-0.187461,-0.265924,-0.216408,-0.137351
Total_Bilirubin,0.011763,1.0,0.874618,0.206669,0.214065,0.237831,-0.008099,-0.22225,-0.206267,-0.220208
Direct_Bilirubin,0.007529,0.874618,1.0,0.234939,0.233894,0.257544,-0.000139,-0.228531,-0.200125,-0.246046
Alkaline_Phosphotase,0.080425,0.206669,0.234939,1.0,0.12568,0.167196,-0.028514,-0.165453,-0.234166,-0.184866
Alamine_Aminotransferase,-0.086883,0.214065,0.233894,0.12568,1.0,0.791966,-0.042518,-0.029742,-0.002375,-0.163416
Aspartate_Aminotransferase,-0.01991,0.237831,0.257544,0.167196,0.791966,1.0,-0.025645,-0.08529,-0.07004,-0.151934
Total_Protiens,-0.187461,-0.008099,-0.000139,-0.028514,-0.042518,-0.025645,1.0,0.784053,0.234887,0.035008
Albumin,-0.265924,-0.22225,-0.228531,-0.165453,-0.029742,-0.08529,0.784053,1.0,0.689632,0.161388
Albumin_and_Globulin_Ratio,-0.216408,-0.206267,-0.200125,-0.234166,-0.002375,-0.07004,0.234887,0.689632,1.0,0.163131
Dataset,-0.137351,-0.220208,-0.246046,-0.184866,-0.163416,-0.151934,0.035008,0.161388,0.163131,1.0


In [19]:
# Annotated heatmap of the pairwise correlations computed in liver_corr
plt.figure(figsize=(15, 15))
sns.heatmap(
    liver_corr,
    cbar=True,
    square=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    cmap='coolwarm',
)
plt.title('Correlation between features');

In [20]:
# NOTE(review): presumably renders the correlation heatmap from the previous cell — confirm
display()

In [21]:
#Observation:

#The "Dataset" column, which indicates the presence or absence of liver disease, does not strongly correlate with any of the measured metrics. There are strong positive correlations between pairs of measured quantities (Total_Protiens & Albumin, Albumin & Albumin_and_Globulin_Ratio, Alamine_Aminotransferase & Aspartate_Aminotransferase, Direct_Bilirubin & Total_Bilirubin). Without more insight into the actual meaning of these quantities, it is possible these measurements are similar in nature and we might be able to reduce the dimensionality of this problem using Principal Component Analysis (PCA).

In [22]:
#Clean up data

#The dataset is pretty clean, except for a few NAs in Albumin_and_Globulin_Ratio. There is a column for Albumin, but not one for Globulin, so we do not have any information from which to reconstruct the missing values. We can either drop the four rows that are missing this value, or replace the missing values with the mean or median. Because of the noisy data, I chose to use the median value to fill in the missing values.

# Prepare the input data frame: impute the missing ratios with the column median ...
ratio_median = liver_df['Albumin_and_Globulin_Ratio'].median()
liver_df['Albumin_and_Globulin_Ratio'] = liver_df['Albumin_and_Globulin_Ratio'].fillna(ratio_median)

# ... then keep only the numeric measurements as features (Gender and the target are dropped)
X = liver_df.drop(columns=['Gender', 'Dataset'])
X.head()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,65,0.7,0.1,187,16,18,6.8,3.3,0.9
1,62,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,1.0,0.4,182,14,20,6.8,3.4,1.0
4,72,3.9,2.0,195,27,59,7.3,2.4,0.4


In [23]:
# Summary statistics of the feature matrix after imputation (every column now has 583 values)
X.describe()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.946947
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.318495
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8


In [24]:
# Target labels for the classifiers
y = liver_df['Dataset'] # 1 for liver disease; 2 for no liver disease

In [25]:
#Machine Learning
#I tried Logistic Regression and Nearest Neighbour algorithms. A biased estimator that always predicts that the patient has liver disease would have a 71.35% success rate. So for our estimator to be of any use, it must do substantially better than that.

In [26]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from pyspark.sql import SparkSession

def processResults(clk, str="", X_test = X_test, X_train = X_train, y_test = y_test, y_train = y_train):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    clk : fitted estimator exposing ``predict`` and ``score``.
    str : prefix prepended to the training/test score labels.
    X_test, X_train, y_test, y_train : evaluation data; the defaults are the
        module-level splits as they exist when this function is defined.

    Returns
    -------
    tuple
        ``(training score, test score)``; each is ``clk.score(...) * 100``
        rounded to two decimals.
    """
    test_predictions = clk.predict(X_test)
    train_pct = round(clk.score(X_train, y_train) * 100, 2)
    test_pct = round(clk.score(X_test, y_test) * 100, 2)

    print(str + 'Training score: \n', train_pct)
    print(str + 'Test Score: \n', test_pct)
    print('Accuracy: \n', accuracy_score(y_test, test_predictions))

    # The same confusion matrix is printed as text and drawn as a heatmap
    test_confusion = confusion_matrix(y_test, test_predictions)
    print(test_confusion)
    print(classification_report(y_test, test_predictions))
    sns.heatmap(test_confusion, annot=True, fmt="d")
    return train_pct, test_pct
    

In [27]:
#1) Logistic Regression
# Baseline for comparison: an estimator that ignores every measurement and always
# predicts liver disease would be 71.35% accurate on the complete dataset.

from sklearn.linear_model import LogisticRegression

# Build the model with default settings and fit it on the training split
logreg = LogisticRegression().fit(X_train, y_train)

# Report the metrics and keep both scores for the model comparison table
logreg_score, logreg_score_test = processResults(logreg, "Logistic Regression ")

print('Coefficient: \n', logreg.coef_)
print('Intercept: \n', logreg.intercept_)

In [28]:
#Logistic Regression observation
#The model has an accuracy of 71.43%. The training and testing scores are very close to each other at 71.24% and 76.07% respectively.


#The fact that the training and testing accuracies are so close to each other suggests that the model has high bias and is underfitting the data. To get a better fit, we should increase the number of parameters that we fit to the data or use a different model.

In [29]:
# NOTE(review): presumably renders the confusion-matrix heatmap drawn by processResults — confirm
display()

In [30]:
#2) K-Nearest Neighbour
# k-NN is distance based, so the features are standardised first; otherwise the
# measurements with the largest numeric ranges would dominate the distances.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeknn = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])
pipeknn.fit(X_train, y_train)

# Pass a label so the printed scores are attributed to this model, matching the
# labelled logistic regression report.
knnTrainScore, knnTestScore = processResults(pipeknn, "K-Nearest Neighbour ")

In [31]:
# NOTE(review): presumably renders the confusion-matrix heatmap drawn by processResults — confirm
display()

In [32]:
# Preview the first rows of liver_df again
liver_df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [33]:
##Model evaluation
# Collect the training and test scores of every model in one table and rank
# the models by test score, best first.
models = pd.DataFrame(
    [
        ('Logistic Regression', logreg_score, logreg_score_test),
        ('Nearest Neighbour', knnTrainScore, knnTestScore),
    ],
    columns=['Model', 'Score', 'Test Score'],
)
models.sort_values(by='Test Score', ascending=False)

Unnamed: 0,Model,Score,Test Score
0,Logistic Regression,71.46,73.5
1,Nearest Neighbour,79.61,70.09
