In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# -----------------------------
# Load dataset
df = pd.read_csv("liver_cirrhosis.csv")

# Drop the 'Status' column so it is not used as a feature.
# Reassigning (rather than inplace=True) keeps this step safe to re-run.
df = df.drop(columns=['Status'])

# Encode categorical features as integer codes.
category_codes = {
    'Sex': {'F': 0, 'M': 1},
    # NOTE(review): the previous mapping only recognised the label 'D'. The raw
    # file presumably spells the drug out as 'D-penicillamine' -- confirm against
    # the CSV. Both spellings are accepted so neither is turned into NaN.
    'Drug': {'Placebo': 0, 'D': 1, 'D-penicillamine': 1},
    'Ascites': {'N': 0, 'Y': 1},
    'Hepatomegaly': {'N': 0, 'Y': 1},
    'Spiders': {'N': 0, 'Y': 1},
    'Edema': {'N': 0, 'S': 1, 'Y': 2},
}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    # Series.map() yields NaN for every label that is missing from `codes`, and
    # the dropna() below would then silently discard those rows. Fail loudly
    # instead so an incomplete mapping cannot shrink the dataset unnoticed.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains labels with no integer code: "
            f"{sorted(map(str, unmapped))}"
        )
    df[column] = encoded

# Drop missing values first, then cast the target to int: casting a column that
# still contains NaN to an integer dtype raises, so the order matters.
df = df.dropna().astype({'Stage': int})

# Features and target
X = df.drop(columns=['Stage'])
y = df['Stage']

# Check for any NaNs left
assert X.isnull().sum().sum() == 0, "features still contain missing values"
assert y.isnull().sum() == 0, "target still contains missing values"

# Train/test split: hold out 20% of the rows, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale features: the scaler is fitted on the training rows only and the same
# fitted transform is then applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# =============================
# 1. Logistic Regression
# =============================
print("\n" + "="*30)
print("LOGISTIC REGRESSION")
print("="*30)
# One-vs-rest logistic regression. The `multi_class='ovr'` argument of
# LogisticRegression is deprecated (scikit-learn >= 1.5); wrapping the
# estimator in OneVsRestClassifier is the documented way to keep the OvR scheme.
logreg = OneVsRestClassifier(LogisticRegression(solver='lbfgs', max_iter=1000))
# Parameters of the wrapped estimator are addressed with the `estimator__`
# prefix, so the reported best-params key is 'estimator__C' rather than 'C'.
param_grid_lr = {'estimator__C': [0.01, 0.1, 1, 10]}
grid_lr = GridSearchCV(logreg, param_grid_lr, cv=5, n_jobs=-1, verbose=1)
grid_lr.fit(X_train, y_train)
y_pred_lr = grid_lr.predict(X_test)

# Report hold-out metrics for the refitted best candidate.
print("Best Params:", grid_lr.best_params_)
acc_lr = accuracy_score(y_test, y_pred_lr)
print("Accuracy:", acc_lr)
print("MSE:", mean_squared_error(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# =============================
# 2. Decision Tree
# =============================
print("\n" + "="*30, "DECISION TREE", "="*30, sep="\n")

# Grid-search tree depth and minimum split size with 5-fold cross-validation.
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = dict(max_depth=[3, 5, 7], min_samples_split=[2, 4, 6])
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Score the selected tree on the held-out rows.
y_pred_dt = grid_dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

print("Best Params:", grid_dt.best_params_)
print("Accuracy:", acc_dt)
print("MSE:", mean_squared_error(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

# =============================
# 3. Random Forest
# =============================
for banner_line in ("\n" + "="*30, "RANDOM FOREST", "="*30):
    print(banner_line)

# Grid-search forest size and tree depth with 5-fold cross-validation.
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5, 7],
}
grid_rf = GridSearchCV(rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)

# Score the selected forest on the held-out rows.
y_pred_rf = grid_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

print("Best Params:", grid_rf.best_params_)
print("Accuracy:", acc_rf)
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# =============================
# 📊 Accuracy Comparison
# =============================
# Side-by-side hold-out accuracy of the three tuned models, to 4 decimals.
print("\n" + "="*30, "MODEL ACCURACY COMPARISON", "="*30, sep="\n")
print(
    f"Logistic Regression Accuracy: {acc_lr:.4f}",
    f"Decision Tree Accuracy     : {acc_dt:.4f}",
    f"Random Forest Accuracy     : {acc_rf:.4f}",
    sep="\n",
)

# =============================
# 💾 Save the best model
# =============================
# Candidates are listed in tie-break order (random forest, decision tree,
# logistic regression): max() returns the first item with the highest accuracy,
# which matches the precedence of the previous nested conditional.
# NOTE(review): the winner is picked on hold-out accuracy, so that accuracy is
# an optimistic estimate for the saved model; consider comparing the
# cross-validated `best_score_` of each search instead.
candidates = [
    (acc_rf, grid_rf.best_estimator_),
    (acc_dt, grid_dt.best_estimator_),
    (acc_lr, grid_lr.best_estimator_),
]
best_model = max(candidates, key=lambda candidate: candidate[0])[1]

joblib.dump(best_model, "best_liver_stage_model.pkl")
print("\n✅ Best model saved to: best_liver_stage_model.pkl")

# Every estimator was fitted on standardized features, so the fitted scaler is
# required to prepare new inputs for the saved model. It is written to its own
# file so the existing model artifact keeps its format.
joblib.dump(scaler, "liver_stage_scaler.pkl")
print("✅ Feature scaler saved to: liver_stage_scaler.pkl")



LOGISTIC REGRESSION
Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best Params: {'C': 1}
Accuracy: 0.5293746051800379
MSE: 0.8051168667087808
Classification Report:
               precision    recall  f1-score   support

           1       0.48      0.41      0.44       961
           2       0.52      0.53      0.52      1179
           3       0.58      0.64      0.61      1026

    accuracy                           0.53      3166
   macro avg       0.53      0.53      0.52      3166
weighted avg       0.53      0.53      0.53      3166

Confusion Matrix:
 [[392 366 203]
 [272 627 280]
 [150 219 657]]

DECISION TREE
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Params: {'max_depth': 7, 'min_samples_split': 2}
Accuracy: 0.7763739734680986
MSE: 0.3401768793430196
Classification Report:
               precision    recall  f1-score   support

           1       0.84      0.68      0.75       961
           2       0.72      0.83      0.77      1179
           3       0.81      0.80      0.80      1026

    accuracy                   