# Cirrhosis Patient Survival Prediction
## Group 38
##### EG/2020/4111 - Perera G.A.L.S.
##### EG/2020/4330 - Sheshan K.H.N.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score

: 

### Import Data

In [None]:
# Load the cirrhosis dataset from the CSV in the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='cirrhosis.csv')
df.head(5)

: 

### Data Analysis

In [None]:
# Show the column labels of the freshly loaded dataset.
df.columns

: 

In [None]:
# Remove the 'ID' column before analysis. Reassigning (instead of inplace=True)
# together with errors='ignore' makes the cell idempotent: re-running it after
# 'ID' is already gone no longer raises a KeyError.
df = df.drop(columns=['ID'], errors='ignore')

# Summary statistics for every column, numeric and categorical alike.
df.describe(include='all')

: 

Visualization of the categorical values

In [None]:
# Row count per Drug category.
ax = sns.countplot(data=df, x='Drug', palette='viridis')
ax.set_title('Countplot of Drug')

: 

In [None]:
# Row count per Sex category.
ax = sns.countplot(data=df, x='Sex', palette='viridis')
ax.set_title('Countplot of Sex')

: 

In [None]:
# Row count per Ascites category.
ax = sns.countplot(data=df, x='Ascites', palette='viridis')
ax.set_title('Countplot of Ascites')

: 

In [None]:
# Row count per Hepatomegaly category.
ax = sns.countplot(data=df, x='Hepatomegaly', palette='viridis')
ax.set_title('Countplot of Hepatomegaly')

: 

In [None]:
# Row count per Spiders category.
ax = sns.countplot(data=df, x='Spiders', palette='viridis')
ax.set_title('Countplot of Spiders')

: 

In [None]:
# Row count per Edema category.
ax = sns.countplot(data=df, x='Edema', palette='viridis')
ax.set_title('Countplot of Edema')

: 

In [None]:
# Row count per Stage value.
ax = sns.countplot(data=df, x='Stage', palette='viridis')
ax.set_title('Countplot of Stage')

: 

In [None]:
# Row count per Status value (the prediction target).
ax = sns.countplot(data=df, x='Status', palette='viridis')
ax.set_title('Countplot of Status')

: 

In [None]:
# Row count per Status value, with one bar per Sex inside each Status group.
ax = sns.countplot(data=df, x='Status', hue='Sex', palette='viridis')
ax.set_title('Distribution of Status with sex')

: 

In [None]:
# How many rows carry each Status value
df['Status'].value_counts()

: 

Visualization of the numerical values

In [None]:
# Histogram of the Age column. `palette` only takes effect together with `hue`,
# so it is dropped here; a title is added so the figure stands on its own.
ax = sns.histplot(data=df, x='Age')
ax.set_title('Distribution of Age')

: 

### Data Preprocessing

In [None]:
# Number of missing values in each column.
df.isna().sum()

: 

In [None]:
# Heatmap of the missing-value mask: one row per record, one column per feature.
sns.heatmap(df.isna())

: 

Handling missing numerical values

In [None]:
num_cols = ['Cholesterol', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides']

# Fill the gaps in each listed column with that column's own mean.
# The result is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`: the latter is chained assignment on a
# column selection, which pandas warns about and which leaves `df` unchanged
# once Copy-on-Write is active.
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

: 

In [None]:
# Re-draw the missing-value mask to see which gaps remain in df at this point.
sns.heatmap(df.isna())

: 

Dropping rows with missing values in Platelets, Prothrombin, or Stage

In [None]:
remove_row_cols = ['Platelets', 'Prothrombin', 'Stage']

# Keep only the rows that have a value in every one of the columns above.
df = df.dropna(subset=remove_row_cols)

: 

In [None]:
# Re-draw the missing-value mask to see which gaps remain in df at this point.
sns.heatmap(df.isna())

: 

Handling missing categorical values

In [None]:
missing_categorical_columns = ['Drug', 'Ascites', 'Hepatomegaly', 'Spiders', 'Stage']

# Replace missing entries in the listed columns with the explicit label 'Missing'.
df = df.fillna({column: 'Missing' for column in missing_categorical_columns})

: 

In [None]:
# Final check of the missing-value mask after the categorical fill.
sns.heatmap(df.isna())

: 

Encode categorical data: one-hot encoding for the categorical features, label encoding for the Status target

In [None]:
# One-hot encode Drug, dropping the first level as the baseline.
# The 'Drug_' prefix keeps these indicators identifiable once they sit next to
# the other encoded features (without it a level such as 'Missing' would become
# a column simply called 'Missing'), matching the prefixed names used for the
# Ascites/Hepatomegaly/Spiders/Edema indicators.
drug = pd.get_dummies(df['Drug'], prefix='Drug', drop_first=True)
drug.head()

: 

In [None]:
# Turn the Status target into integer class ids and print the class -> id mapping.
label_encoder = LabelEncoder()
label_encoder.fit(df['Status'])
df['Status'] = label_encoder.transform(df['Status'])

status_classes = label_encoder.classes_
status_codes = label_encoder.transform(status_classes)
print("Encoded values:", dict(zip(status_classes, status_codes)))

: 

In [None]:
# One-hot encode Sex, dropping the first level as the baseline.
# The 'Sex_' prefix gives the indicator a self-describing name instead of a bare
# level label, consistent with the prefixed names of the other encoded features.
sex = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True)
sex.head()

: 

In [None]:
# One-hot encode Ascites (first level dropped) and give the N/Y indicators prefixed names.
ascites = (
    pd.get_dummies(df['Ascites'], drop_first=True)
    .rename(columns={'N': 'Ascites_N', 'Y': 'Ascites_Y'})
)
ascites.head()

: 

In [None]:
# One-hot encode Hepatomegaly (first level dropped) and give the N/Y indicators prefixed names.
hepatomegaly = (
    pd.get_dummies(df['Hepatomegaly'], drop_first=True)
    .rename(columns={'N': 'Hepatomegaly_N', 'Y': 'Hepatomegaly_Y'})
)
hepatomegaly.head()

: 

In [None]:
# One-hot encode Spiders (first level dropped) and give the N/Y indicators prefixed names.
spiders = (
    pd.get_dummies(df['Spiders'], drop_first=True)
    .rename(columns={'N': 'Spiders_N', 'Y': 'Spiders_Y'})
)
spiders.head()

: 

In [None]:
# One-hot encode Edema (first level dropped) and give the S/Y indicators prefixed names.
edema = (
    pd.get_dummies(df['Edema'], drop_first=True)
    .rename(columns={'S': 'Edema_S', 'Y': 'Edema_Y'})
)
edema.head()

: 

Update dataset with the One-Hot-Encoded columns

In [None]:
# Append the indicator columns to the main frame, side by side.
encoded_frames = [drug, sex, ascites, hepatomegaly, spiders, edema]
df = pd.concat([df, *encoded_frames], axis='columns')

: 

In [None]:
# The raw categorical columns are now represented by their indicator columns, so remove them.
raw_categorical_columns = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
df = df.drop(columns=raw_categorical_columns)

: 

In [None]:
# Show the column labels after encoding and dropping the raw categorical columns.
df.columns

: 

In [None]:
# Feature matrix: every column except the encoded target.
X = df.drop(columns=['Status'])

# Target as a 1-D Series. A single-column DataFrame (df[['Status']]) has shape
# (n_samples, 1), which makes scikit-learn classifiers such as SVC emit a
# DataConversionWarning on fit.
y = df['Status']

: 

In [None]:
numerical_col = ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']

# Drop rows lying on or outside the 1.5 * IQR fences, one column at a time.
# Each column's quartiles are computed on the rows that survived the previous
# columns, because `df` is narrowed inside the loop.
for col in numerical_col:
    q1 = df[col].quantile(0.25).round(4)
    q3 = df[col].quantile(0.75).round(4)
    iqr = q3 - q1
    upper_lim = round(q3 + 1.5 * iqr, 4)
    lower_lim = round(q1 - 1.5 * iqr, 4)
    df = df[(df[col] < upper_lim) & (df[col] > lower_lim)]

# X and y were extracted from df before this filter ran, so without this step
# the outlier removal would never reach feature selection or the models.
# Restrict both to the rows that survived.
X = X.loc[df.index]
y = y.loc[df.index]

df.shape

: 

Feature Selection

In [None]:
# Score every feature against the target with the chi-squared statistic and
# mark the 9 highest-scoring ones for selection.
# NOTE(review): the selector is fitted on all rows, before the train/test split
# further down, so the test rows influence which features are kept.
fs = SelectKBest(chi2, k=9)
fs.fit(X, y)

: 

In [None]:
# Chi-squared score of each feature, in the column order of X.
fs.scores_

: 

In [None]:
# Label each score with its feature name (despite the variable name, these are
# the chi-squared scores produced by `fs`).
mi_score = pd.Series(data=fs.scores_, index=X.columns)
mi_score

: 

In [None]:
# Bar chart of the feature scores, highest first.
mi_score.sort_values(ascending=False).plot(kind='bar', figsize=(6, 4))

: 

In [None]:
# Reduce X to the selected features. The selector returns a bare array, so the
# frame is rebuilt with the names of the kept columns and with X's row index;
# otherwise the columns would be labelled 0..k-1 and the rows renumbered,
# losing track of which features were chosen and no longer matching y's index.
X_selected = pd.DataFrame(
    fs.fit_transform(X, y),
    columns=X.columns[fs.get_support()],
    index=X.index,
)

: 

In [None]:
# Preview the first rows of the selected-feature frame.
X_selected.head()

: 

In [None]:
# (rows, columns) of the selected-feature frame.
X_selected.shape

: 

Splitting the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

: 

Scaling the data

In [None]:
# Standardise the features. The scaler learns its mean and variance from the
# training rows only; the test rows are then transformed with those same
# statistics. Re-fitting on the test set (fit_transform) would scale the two
# sets differently and let test data shape the preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

: 

## Decision Tree

In [None]:
# Baseline decision tree trained on the scaled training set.
dtree_model = DecisionTreeClassifier(random_state=4)
dtree_model.fit(X=X_train_scaled, y=y_train)

: 

In [None]:
# Confusion matrix of the decision tree on the test set.
ConfusionMatrixDisplay.from_estimator(
    estimator=dtree_model,
    X=X_test_scaled,
    y=y_test,
    display_labels=["C", "CL", "D"],
)
plt.show()

: 

In [None]:
# Decision-tree predictions for the test set.
y_pred_dtree = dtree_model.predict(X=X_test_scaled)

: 

In [None]:
# Test-set accuracy of the decision tree, as a percentage.
dtree_accuracy_pct = accuracy_score(y_test, y_pred_dtree) * 100
print(dtree_accuracy_pct, "%")

: 

In [None]:
# Per-class precision, recall and F1 for the decision tree.
dtree_report = classification_report(y_true=y_test, y_pred=y_pred_dtree)
print(dtree_report)

: 

## SVM

In [None]:
# Baseline support-vector classifier with default hyperparameters.
svm_model = SVC(random_state=42)
svm_model.fit(X=X_train_scaled, y=y_train)

: 

In [None]:
# Confusion matrix of the SVM on the test set.
ConfusionMatrixDisplay.from_estimator(
    estimator=svm_model,
    X=X_test_scaled,
    y=y_test,
    display_labels=["C", "CL", "D"],
)
plt.show()

: 

In [None]:
# SVM predictions for the test set.
y_pred_svm = svm_model.predict(X=X_test_scaled)

: 

In [None]:
# Test-set accuracy of the SVM, as a percentage.
svm_accuracy_pct = accuracy_score(y_test, y_pred_svm) * 100
print(svm_accuracy_pct, "%")

: 

In [None]:
# Per-class precision, recall and F1 for the SVM.
svm_report = classification_report(y_true=y_test, y_pred=y_pred_svm)
print(svm_report)

: 

# Post Processing

### Boosting For Decision Tree

For Decision Trees

In [None]:
# AdaBoost ensemble that uses the decision tree as its weak learner.
# The estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn and the old name was later
# removed, while the first positional slot is the weak learner under both names.
Boost_dtree = AdaBoostClassifier(dtree_model, random_state=11)
boostmodel_dtree = Boost_dtree.fit(X_train_scaled, y_train)

: 

In [None]:
# Test-set accuracy of the boosted decision tree, as a percentage.
y_pred_boost_dtree = boostmodel_dtree.predict(X=X_test_scaled)
boost_accuracy_pct = accuracy_score(y_test, y_pred_boost_dtree) * 100
print(boost_accuracy_pct, "%")

: 

## Grid Search

For Decision Trees

In [None]:
# Hyperparameter grid for the boosted decision tree.
# NOTE(review): 'SAMME.R' may be rejected by newer scikit-learn releases - confirm
# against the installed version.
param_grid_tree = dict(
    n_estimators=[20, 25, 28, 30],
    learning_rate=[0.01, 0.1, 0.15],
    algorithm=['SAMME', 'SAMME.R'],
)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores.
grid_search_tree = GridSearchCV(
    estimator=boostmodel_dtree,
    param_grid=param_grid_tree,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

: 

In [None]:
# Run the grid search on the scaled training set.
grid_search_tree.fit(X=X_train_scaled, y=y_train)

: 

In [None]:
# Hyperparameter combination with the best cross-validated score
best_params_tree = grid_search_tree.best_params_
print("Best Hyperparameters:", best_params_tree)

: 

In [None]:
# Mean cross-validated score achieved by that best combination
best_cv_score_tree = grid_search_tree.best_score_
print("Mean Cross-validated Score of the Best Estimator:", best_cv_score_tree)

: 

In [None]:
# Confusion matrix of the tuned boosted tree on the test set.
ConfusionMatrixDisplay.from_estimator(
    estimator=grid_search_tree,
    X=X_test_scaled,
    y=y_test,
    display_labels=["C", "CL", "D"],
)
plt.show()

: 

In [None]:
# Score the best estimator found by the search on the held-out test set
best_tree_estimator = grid_search_tree.best_estimator_
test_accuracy = best_tree_estimator.score(X_test_scaled, y_test)

# Report it as a percentage
print("Test Set Accuracy: ", test_accuracy*100,"%")

: 

In [None]:
# Test-set predictions from the tuned boosted tree.
y_pred_grid_tree = grid_search_tree.predict(X=X_test_scaled)

: 

In [None]:
# Per-class precision, recall and F1 for the tuned boosted tree.
grid_tree_report = classification_report(y_true=y_test, y_pred=y_pred_grid_tree)
print(grid_tree_report)

: 

For SVM

In [None]:
# Untrained SVC that serves as the base estimator for the SVM grid search.
svm_classifier = SVC(random_state=11)

: 

In [None]:
# Hyperparameter grid for the SVM; every combination of the values below is tried.
param_grid_svm = dict(
    C=[0.825, 0.85, 0.875, 1.0],
    kernel=['linear', 'poly', 'rbf'],
    degree=[2, 3, 4],
    coef0=[0.17, 0.18, 0.20],
    gamma=['scale', 'auto', 1.0, 0.1],
    decision_function_shape=['ovo', 'ovr'],
)

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores.
grid_search_svm = GridSearchCV(
    estimator=svm_classifier,
    param_grid=param_grid_svm,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

: 

In [None]:
# Run the SVM grid search on the scaled training set.
grid_search_svm.fit(X=X_train_scaled, y=y_train)

: 

In [None]:
# Hyperparameter combination with the best cross-validated score
best_params_svm = grid_search_svm.best_params_
print("Best Hyperparameters:", best_params_svm)

: 

In [None]:
# Mean cross-validated score achieved by that best combination
best_cv_score_svm = grid_search_svm.best_score_
print("Mean Cross-validated Score of the Best Estimator:", best_cv_score_svm)

: 

In [None]:
# Confusion matrix of the tuned SVM on the test set.
ConfusionMatrixDisplay.from_estimator(
    estimator=grid_search_svm,
    X=X_test_scaled,
    y=y_test,
    display_labels=["C", "CL", "D"],
)
plt.show()

: 

In [None]:
# Score the best estimator found by the search on the held-out test set
best_svm_estimator = grid_search_svm.best_estimator_
test_accuracy = best_svm_estimator.score(X_test_scaled, y_test)

# Report it as a percentage
print("Test Set Accuracy: ", test_accuracy*100,"%")

: 

In [None]:
# Test-set predictions from the tuned SVM.
y_pred_grid_svm = grid_search_svm.predict(X=X_test_scaled)

: 

In [None]:
# Per-class precision, recall and F1 for the tuned SVM.
grid_svm_report = classification_report(y_true=y_test, y_pred=y_pred_grid_svm)
print(grid_svm_report)

: 