Liver Disease Explainable AI System

Load Data

In [None]:
import pandas as pd
# Read the training and testing splits from the local data/ directory.
train, test = map(
    pd.read_csv,
    (
        'data/Training_indian_liver_disease_dataset.csv',
        'data/Testing_indian_liver_disease_dataset.csv',
    ),
)

Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Patient_ID is removed from both splits before encoding and modelling.
train = train.drop(columns='Patient_ID')
test = test.drop(columns='Patient_ID')

# Every object-dtype column in the training frame is label-encoded.
# NOTE(review): if the target column is stored as strings it is encoded here
# too, with integer codes assigned in sorted label order - confirm that this
# matches what later cells expect.
cat_cols = train.select_dtypes(include='object').columns

# Phase 1: fit one encoder per column on the training values and validate the
# test values BEFORE touching either frame. Encoding column by column would
# leave the frames half-encoded if a later column failed.
encoders = {}
for col in cat_cols:
    le = LabelEncoder().fit(train[col].astype(str))
    unseen = sorted(set(test[col].astype(str)) - set(le.classes_))
    if unseen:
        # LabelEncoder.transform would also raise ValueError here, but without
        # saying which column is affected.
        raise ValueError(
            f"Column {col!r} in the test set contains categories that were "
            f"not present in the training set: {unseen}"
        )
    encoders[col] = le

# Phase 2: all columns validated, so apply the fitted encoders to both splits.
# Values are cast to str first, so missing values are encoded as their string
# representation rather than raising.
for col, le in encoders.items():
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

Feature Engineering

In [None]:
# Add the same three derived columns to both splits, in place.
# The 1e-6 added to each denominator avoids division by zero.
for frame in (train, test):
    frame['AST_ALT_ratio'] = frame['AST'] / (frame['ALT'] + 1e-6)
    frame['BMI_Alcohol'] = frame['BMI'] * frame['Alcohol_Consumption']
    frame['Bilirubin_Albumin'] = frame['Bilirubin'] / (frame['Albumin'] + 1e-6)

Train Binary Model

In [None]:
from xgboost import XGBClassifier

target = 'Liver_Disease_Type'

# Split each frame into the feature matrix and the target column.
X_train, y_train = train.drop(columns=target), train[target]
X_test, y_test = test.drop(columns=target), test[target]

# Collapse the target to binary: 0 stays 0, every other value becomes 1.
# NOTE(review): presumably code 0 is the "no disease" class - confirm against
# the dataset / label encoding.
y_train_binary = y_train.ne(0).astype(int)
y_test_binary = y_test.ne(0).astype(int)

model_bin = XGBClassifier(
    n_estimators=800,
    max_depth=10,
    learning_rate=0.03,
)
model_bin.fit(X_train, y_train_binary)

Evaluate

In [None]:
from sklearn.metrics import accuracy_score
# Predict the binary label for the test features and print the accuracy
# against the binarised test target.
preds_bin = model_bin.predict(X_test)
print(accuracy_score(y_true=y_test_binary, y_pred=preds_bin))

Patient Input

In [None]:
# Example patient record used by the prediction and explanation cells.
# NOTE(review): categorical fields are given as integer codes; presumably they
# must match the codes produced by the label encoders - confirm.
patient = dict([
    # Demographics and lifestyle
    ('Age', 45),
    ('Gender', 1),
    ('Occupation', 2),
    ('BMI', 28),
    ('Obesity_Class', 1),
    ('Diet_Quality', 2),
    ('Physical_Activity', 1),
    ('Sleep_Hours', 6),
    ('Smoking_Status', 1),
    ('Alcohol_Consumption', 3),
    # Symptoms
    ('Sym_Fatigue', 1),
    ('Sym_Jaundice', 0),
    ('Sym_Abdominal_Pain', 1),
    ('Sym_Itching', 0),
    ('Sym_Ascites', 0),
    ('Sym_Dark_Urine', 1),
    ('Sym_Weight_Loss', 0),
    # Comorbidities
    ('Comorb_Diabetes', 1),
    ('Comorb_Hypertension', 0),
    ('Comorb_Genetic_History', 0),
    # Lab values
    ('ALT', 65),
    ('AST', 70),
    ('Bilirubin', 1.8),
    ('Albumin', 3.2),
    ('Platelets', 210),
    ('Alk_Phosphatase', 180),
])

Prediction Pipeline

In [None]:
import numpy as np
# Build a one-row frame from the patient dict, add the same derived features
# the model was trained with, then select the columns in training order.
patient_df = (
    pd.DataFrame([patient])
    .assign(
        AST_ALT_ratio=lambda d: d['AST'] / (d['ALT'] + 1e-6),
        BMI_Alcohol=lambda d: d['BMI'] * d['Alcohol_Consumption'],
        Bilirubin_Albumin=lambda d: d['Bilirubin'] / (d['Albumin'] + 1e-6),
    )
)[X_train.columns]

# First (only) row, second column of predict_proba: probability of class 1.
patient_proba = model_bin.predict_proba(patient_df)
prob = patient_proba[0][1]
pred = model_bin.predict(patient_df)[0]

Risk Score

In [None]:
def risk_category(prob, high=0.85, moderate=0.6):
    """Map a predicted positive-class probability to a coarse risk label.

    Parameters
    ----------
    prob : float
        Predicted probability of the positive class; must lie in [0, 1].
    high : float, default 0.85
        Probabilities strictly greater than this are labelled 'High Risk'.
    moderate : float, default 0.6
        Probabilities strictly greater than this (but not above ``high``)
        are labelled 'Moderate Risk'.

    Returns
    -------
    str
        'High Risk', 'Moderate Risk' or 'Low Risk'.

    Raises
    ------
    ValueError
        If ``prob`` is NaN or outside [0, 1], or if ``moderate`` exceeds
        ``high``.
    """
    # NaN fails every comparison, so without this check it would fall through
    # both thresholds and be reported as 'Low Risk'.
    if not 0.0 <= prob <= 1.0:
        raise ValueError(f"prob must be a probability in [0, 1], got {prob!r}")
    if moderate > high:
        raise ValueError(
            f"moderate threshold ({moderate!r}) must not exceed high threshold ({high!r})"
        )

    if prob > high:
        return 'High Risk'
    if prob > moderate:
        return 'Moderate Risk'
    return 'Low Risk'
# Bucket the patient's predicted probability, then show the predicted label,
# the probability and the risk bucket together.
risk = risk_category(prob)
print(pred, prob, risk)

Explainability

In [None]:
import shap
# Explain the binary model's output for the patient row with SHAP.
explainer = shap.TreeExplainer(model_bin)
shap_values = explainer.shap_values(patient_df)

# Index 0 selects the first entry of shap_values.
# NOTE(review): assumes shap_values is indexed by sample, so [0] is the single
# patient row - confirm for the installed shap version.
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[0],
    features=patient_df,
    matplotlib=True,
)