In [1]:
# Import required libraries for data handling, model training, and evaluation
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the liver dataset and clean missing values + encode gender and target
df = pd.read_csv("../data/indian_liver_patient.csv")

# Fill missing values in 'Albumin_and_Globulin_Ratio'
df['Albumin_and_Globulin_Ratio'].fillna(df['Albumin_and_Globulin_Ratio'].mean(), inplace=True)

# Encode gender
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

# Re-encode target: 1 = Liver Disease, 2 = No Disease → 1 and 0
df['Dataset'] = df['Dataset'].apply(lambda x: 1 if x == 1 else 0)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Albumin_and_Globulin_Ratio'].fillna(df['Albumin_and_Globulin_Ratio'].mean(), inplace=True)


In [5]:
# Select top 10 features based on previous importance/ranking
df['AST_ALT_Ratio'] = df['Aspartate_Aminotransferase'] / (df['Alamine_Aminotransferase'] + 1)

selected_features = [
    'Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
    'Alamine_Aminotransferase', 'Aspartate_Aminotransferase',
    'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio',
    'AST_ALT_Ratio'
]

X = df[selected_features]
y = df['Dataset']


In [4]:
print(df.columns)


Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset', 'AST_ALT_Ratio'],
      dtype='object')


In [6]:
# Split the data into train and test sets (80/20) with stratified target classes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=43
)


In [7]:
# Normalize features using StandardScaler to improve model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [8]:
# Train the XGBoost classifier using the scaled data
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=43
)
model.fit(X_train_scaled, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
# Evaluate the model using accuracy and classification report
y_pred = model.predict(X_test_scaled)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


✅ Accuracy: 0.7435897435897436
              precision    recall  f1-score   support

           0     0.5833    0.4118    0.4828        34
           1     0.7849    0.8795    0.8295        83

    accuracy                         0.7436       117
   macro avg     0.6841    0.6456    0.6562       117
weighted avg     0.7264    0.7436    0.7288       117



In [12]:
# Save the model and scaler for deployment in your Streamlit app
pickle.dump(model, open("../models/liver_model.pkl", "wb"))
pickle.dump(scaler, open("../models/liver_scaler.pkl", "wb"))
