In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform

In [2]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
data_path = "../data/processed/appendicitis_cleaned_data_v1s.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Age,BMI,Sex,Height,Weight,Diagnosis,Appendix_on_US,Migratory_Pain,Lower_Right_Abd_Pain,Contralateral_Rebound_Tenderness,...,CRP,Dysuria,Stool,Peritonitis,Psoas_Sign,US_Performed,US_Number,Free_Fluids,Age_Group,BMI_Category
0,12.68,16.9,female,148.0,37.0,appendicitis,yes,no,yes,yes,...,0.0,no,normal,no,yes,yes,882.0,no,Preteen,Underweight
1,14.1,31.9,male,147.0,69.5,no appendicitis,no,yes,yes,yes,...,3.0,yes,normal,no,yes,yes,883.0,no,Teenager,Obese
2,14.14,23.3,female,163.0,62.0,no appendicitis,no,no,yes,yes,...,3.0,no,constipation,no,yes,yes,884.0,no,Teenager,Normal
3,16.37,20.6,female,165.0,56.0,no appendicitis,no,yes,yes,no,...,0.0,yes,normal,no,yes,yes,886.0,no,Teenager,Normal
4,11.08,16.9,female,163.0,45.0,appendicitis,yes,no,yes,yes,...,0.0,no,constipation,no,yes,yes,887.0,no,Preteen,Underweight


In [4]:
df.duplicated().sum()

np.int64(0)

In [5]:
df.shape

(589, 30)

In [6]:
df.isnull().sum().sum()

np.int64(0)

## Before any preprocessing, lets divide the data

In [7]:
X = df.drop(columns=["Diagnosis", "BMI", "Age"], axis=1)
y_raw = df["Diagnosis"]

In [8]:
le = LabelEncoder()
y = le.fit_transform(y_raw)
joblib.dump(le, "../artifacts/label_encoder.joblib")

['../artifacts/label_encoder.joblib']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

In [10]:
print("Classes (original):", list(le.classes_))
print("Train size:", X_train.shape, "Test size:", X_test.shape)

Classes (original): ['appendicitis', 'no appendicitis']
Train size: (471, 27) Test size: (118, 27)


## Preprocessing pipeline

In [11]:
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

print("Numerical features:", num_features)
print("Categorical features:", cat_features)

Numerical features: ['Height', 'Weight', 'Body_Temperature', 'WBC_Count', 'RBC_Count', 'Hemoglobin', 'RDW', 'Thrombocyte_Count', 'CRP', 'US_Number']
Categorical features: ['Sex', 'Appendix_on_US', 'Migratory_Pain', 'Lower_Right_Abd_Pain', 'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Nausea', 'Loss_of_Appetite', 'Neutrophilia', 'Dysuria', 'Stool', 'Peritonitis', 'Psoas_Sign', 'US_Performed', 'Free_Fluids', 'Age_Group', 'BMI_Category']


In [12]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ]
)



In [13]:
joblib.dump(preprocessor, "../mlflow/artifacts/prepocessor.joblib")

['../mlflow/artifacts/prepocessor.joblib']

In [14]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

print("Preprocessed train shape:", X_train_preprocessed.shape)

Preprocessed train shape: (471, 49)


## Model pipelines

#### Logistic Regression 

In [19]:
lr_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(
        penalty="l1", solver="saga", max_iter=5000, random_state=RANDOM_STATE
    ))
])

In [15]:
rf_pipeline = Pipeline(steps=[
("preprocessor", preprocessor),
("model", RandomForestClassifier(random_state=RANDOM_STATE))
])

In [17]:
xgb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(
        eval_metric="logloss", use_label_encoder=False, random_state=RANDOM_STATE
    ))
])

In [20]:
# quick sanity check
for name, pipe in [
    ("Random Forest", rf_pipeline),
    ("Logistic Regression", lr_pipeline),
    ("XGBoost", xgb_pipeline)
]:
    pipe.fit(X_train, y_train)
    acc = pipe.score(X_test, y_test)
    print(f"{name} test accuracy: {acc:.3f}")

Random Forest test accuracy: 0.805
Logistic Regression test accuracy: 0.822


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost test accuracy: 0.839


In [22]:
joblib.dump(rf_pipeline, "../mlflow/artifacts/rf_pipeline_base.joblib")
joblib.dump(lr_pipeline, "../mlflow/artifacts/lr_pipeline_base.joblib")
joblib.dump(xgb_pipeline, "../mlflow/artifacts/xgb_pipeline_base.joblib")

['../mlflow/artifacts/xgb_pipeline_base.joblib']

### Cross-validation and hyperparameter search