In [1]:
# STEP 1: IMPORT LIBRARIES
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# STEP 2: LOAD AND INSPECT THE LIVER DATA
# Location of the raw CSV, relative to the notebook's working directory.
file_path = '../data/liver_disease.csv'

# Replacement headers, applied positionally: the list must contain exactly
# one name per column of the CSV, in file order.
clean_columns = [
    'Age',
    'Gender',
    'Total_Bilirubin',
    'Direct_Bilirubin',
    'Alkaline_Phosphotase',
    'Alamine_Aminotransferase',
    'Aspartate_Aminotransferase',
    'Total_Protiens',
    'Albumin',
    'Albumin_and_Globulin_Ratio',
    'Dataset',
]

# The file is decoded as ISO-8859-1, then its headers are overwritten.
df = pd.read_csv(file_path, encoding='ISO-8859-1')
df.columns = clean_columns

# Quick look at the first rows.
print("\n--- First 5 rows of the liver dataset with clean columns ---")
print(df.head(5))

# Per-column count of missing entries, before any cleaning is applied.
print("\n--- Missing Values Before Cleaning ---")
print(df.isna().sum())


--- First 5 rows of the liver dataset with clean columns ---
    Age  Gender  Total_Bilirubin  Direct_Bilirubin  Alkaline_Phosphotase  \
0  65.0  Female              0.7               0.1                 187.0   
1  62.0    Male             10.9               5.5                 699.0   
2  62.0    Male              7.3               4.1                 490.0   
3  58.0    Male              1.0               0.4                 182.0   
4  72.0    Male              3.9               2.0                 195.0   

   Alamine_Aminotransferase  Aspartate_Aminotransferase  Total_Protiens  \
0                      16.0                        18.0             6.8   
1                      64.0                       100.0             7.5   
2                      60.0                        68.0             7.0   
3                      14.0                        20.0             6.8   
4                      27.0                        59.0             7.3   

   Albumin  Albumin_and_Globul

In [5]:
# STEP 3: DATA CLEANING AND PREPROCESSING
# NOTE: this cell overwrites `df`, so it cannot be re-run on its own (the
# 'Gender' column no longer exists after the first run). Re-run STEP 2 first.

# One-hot encode Gender. drop_first=True drops the first category's indicator,
# leaving one column fewer than the number of categories.
df = pd.get_dummies(df, columns=['Gender'], drop_first=True, prefix='Gender')

# Recode the target: 1 stays 1, 2 becomes 0. Any value not in the mapping
# becomes NaN.
df['Dataset'] = df['Dataset'].map({1: 1, 2: 0})

# Fill missing ratios with the column median. The result is assigned back to
# the column instead of calling fillna(inplace=True) on `df[...]`: that chained
# in-place form operates on an intermediate object, emits a FutureWarning in
# pandas 2.x, and leaves `df` unchanged once Copy-on-Write is enabled.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split in STEP 5, so test rows influence the imputed value.
ratio_median = df['Albumin_and_Globulin_Ratio'].median()
df['Albumin_and_Globulin_Ratio'] = df['Albumin_and_Globulin_Ratio'].fillna(ratio_median)

print("\n--- Data after cleaning and preprocessing ---")
print(df.head())
print(f"\nTotal missing values after cleaning: {df.isnull().sum().sum()}")



--- Data after cleaning and preprocessing ---
    Age  Total_Bilirubin  Direct_Bilirubin  Alkaline_Phosphotase  \
0  65.0              0.7               0.1                 187.0   
1  62.0             10.9               5.5                 699.0   
2  62.0              7.3               4.1                 490.0   
3  58.0              1.0               0.4                 182.0   
4  72.0              3.9               2.0                 195.0   

   Alamine_Aminotransferase  Aspartate_Aminotransferase  Total_Protiens  \
0                      16.0                        18.0             6.8   
1                      64.0                       100.0             7.5   
2                      60.0                        68.0             7.0   
3                      14.0                        20.0             6.8   
4                      27.0                        59.0             7.3   

   Albumin  Albumin_and_Globulin_Ratio  Dataset  Gender_Male  
0      3.3                    

In [6]:
# STEP 4: SEPARATE FEATURES (X) AND TARGET (y)
# The 'Dataset' column is the label; every other column is used as a feature.
y = df['Dataset']
X = df.drop(columns='Dataset')

In [7]:
# STEP 5: SPLIT DATA INTO TRAINING AND TESTING SETS
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [8]:
# STEP 6: FEATURE SCALING
# Fit the scaler on the training features only, then apply the same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# STEP 7: TRAIN THE MODEL
print("\n--- Training the liver model... ---")

# Random forest with 100 trees and a fixed seed, fitted on the scaled
# training features. fit() returns the estimator itself.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

print("Model training complete!")


--- Training the liver model... ---
Model training complete!


In [10]:
# STEP 8: EVALUATE THE MODEL
# Predict on the held-out (scaled) test features and score against y_test.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("\n--- Liver Model Performance ---")
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print the heading and the report with separate calls. Passing both to a
# single print() puts the separator space right after the "\n", which shifts
# the report's header line one column out of alignment with its rows.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# NOTE(review): the recorded run reports 99.79% accuracy. Worth checking for
# duplicated rows shared between the train and test splits. TODO confirm.


--- Liver Model Performance ---
Accuracy: 99.79%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      1755
           1       1.00      1.00      1.00      4384

    accuracy                           1.00      6139
   macro avg       1.00      1.00      1.00      6139
weighted avg       1.00      1.00      1.00      6139



In [11]:
# STEP 9: SAVE THE NEW MODEL AND SCALER
# Both artifacts are persisted: the scaler was fitted on the training features
# and the model was trained on its output.
model_path = '../saved_models/liver_model.joblib'
scaler_path = '../saved_models/liver_scaler.joblib'

# Create the target directories first. Writing into a directory that does not
# exist yet (e.g. on a fresh checkout) would otherwise raise FileNotFoundError.
for artifact_path in (model_path, scaler_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)

print(f"\nLiver model saved to: {model_path}")
print(f"Liver scaler saved to: {scaler_path}")


Liver model saved to: ../saved_models/liver_model.joblib
Liver scaler saved to: ../saved_models/liver_scaler.joblib
