In [1]:
# Setup imports and path for local modules
import sys
sys.path.append("../src")

from data_preprocessing import load_and_preprocess
from hpo_optuna import run_optimization
from train_model import train
from batch_inference import batch_predict
# from drift_detection import detect_drift  # Optional bonus

import joblib
import mlflow
import pandas as pd

# Quick look at the raw dataset before any preprocessing, followed by Step 1.
from pathlib import Path

# Single source of truth for the paths used in this cell, so the preview and
# the preprocessing step cannot drift apart.
DATA_PATH = "../data/diabetes.csv"
SCALER_PATH = "../models/scaler.pkl"

df = pd.read_csv(DATA_PATH)
print(df.columns)
# Only the final expression of a cell is rendered automatically, so the
# preview has to be displayed explicitly when it is not the last line.
display(df.head())


# Step 1: Preprocess Data
X_train, X_test, y_train, y_test, scaler = load_and_preprocess(DATA_PATH)

# Save scaler for inference. Create the target folder first so a fresh
# checkout without a models/ directory does not fail here.
Path(SCALER_PATH).parent.mkdir(parents=True, exist_ok=True)
# The trailing ';' keeps the value returned by joblib.dump out of the cell output.
joblib.dump(scaler, SCALER_PATH);



  from .autonotebook import tqdm as notebook_tqdm


Index(['PatientID', 'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
       'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
       'Diabetic'],
      dtype='object')


['../models/scaler.pkl']

In [2]:
# Step 2: Hyperparameter Tuning with Optuna
# run_optimization() comes from the local hpo_optuna module; its return value
# is kept as `best_params` and fed to the training step later on.
best_params = run_optimization()
print("Best Parameters:", best_params)


[I 2025-07-18 02:31:28,903] A new study created in memory with name: no-name-f5c7314d-2168-4121-84d9-9a354533d5ec
[I 2025-07-18 02:31:33,685] Trial 0 finished with value: 0.985138168499522 and parameters: {'n_estimators': 122, 'max_depth': 10, 'learning_rate': 0.17275640563159309}. Best is trial 0 with value: 0.985138168499522.
[I 2025-07-18 02:31:36,346] Trial 1 finished with value: 0.9855388409826753 and parameters: {'n_estimators': 93, 'max_depth': 8, 'learning_rate': 0.08820908809726784}. Best is trial 1 with value: 0.9855388409826753.
[I 2025-07-18 02:31:37,227] Trial 2 finished with value: 0.9867801174074297 and parameters: {'n_estimators': 62, 'max_depth': 4, 'learning_rate': 0.11266015705482257}. Best is trial 2 with value: 0.9867801174074297.
[I 2025-07-18 02:31:37,732] Trial 3 finished with value: 0.9873874768487513 and parameters: {'n_estimators': 69, 'max_depth': 2, 'learning_rate': 0.19888160239314795}. Best is trial 3 with value: 0.9873874768487513.
[I 2025-07-18 02:31:38

Best trial: {'n_estimators': 153, 'max_depth': 4, 'learning_rate': 0.13706227574622626}
Best Parameters: {'n_estimators': 153, 'max_depth': 4, 'learning_rate': 0.13706227574622626}


In [3]:
# Step 3: Train final model using best parameters
# train() is given the train/test split from Step 1 plus the tuned parameters
# from Step 2; it returns the model together with the two metrics printed below.
best_model, acc, auc = train(
    X_train,
    X_test,
    y_train,
    y_test,
    best_params,
)
print(f"Final Accuracy: {acc:.4f}, AUC: {auc:.4f}")




Model saved to ../models/model.pkl
Final Accuracy: 0.9520, AUC: 0.9915


In [4]:
# Step 4: Model Registration
import mlflow
from model_registration import register_model  # local script under ../src

# End the currently active MLflow run, if there is one, before registering.
mlflow.end_run()

# Hand the final model, the held-out test split and the tuned parameters to
# the registration helper; it returns the accuracy and AUC printed below.
acc, auc = register_model(
    best_model,
    X_test,
    y_test,
    best_params,
)

print("Final Accuracy:", acc)
print("Final AUC:", auc)




Final model saved at: models/final_model.pkl




Model registered to MLflow
Final Accuracy: 0.952
Final AUC: 0.9914611728734337


In [9]:
# Step 5: Run Batch Inference on New Data (diabetes2.csv)
#
# The cell re-imports what it needs and reloads the scaler from disk, so it
# can be run on its own once the model and scaler artifacts exist.
import sys
from pathlib import Path

# Register the local source folder only once; an unconditional append adds a
# duplicate sys.path entry every time the cell is re-run.
if "../src" not in sys.path:
    sys.path.append("../src")

from batch_inference import batch_predict  # import after adding path

import joblib
import pandas as pd

# Paths
model_path = "../models/model.pkl"
scaler_path = "../models/scaler.pkl"
input_csv_path = "../data/diabetes2.csv"
output_csv_path = "../output/predictions.csv"

# Fail fast with one actionable message instead of an error raised from deep
# inside a helper. Relative paths are resolved against the notebook's working
# directory, which is included in the message.
# NOTE(review): a previously recorded run of this cell failed with
# FileNotFoundError for '../models/final_model.pkl', a path this cell never
# passes - confirm that batch_predict loads the model from `model_path`.
_missing_inputs = [
    path
    for path in (model_path, scaler_path, input_csv_path)
    if not Path(path).is_file()
]
if _missing_inputs:
    raise FileNotFoundError(
        "Required input file(s) not found (relative to "
        f"{Path.cwd()}): {_missing_inputs}"
    )

# Make sure the destination folder exists; the predictions CSV is read back
# from this location right after the batch run.
Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)

# Load scaler
scaler = joblib.load(scaler_path)

# Run batch inference
batch_predict(model_path, scaler, input_csv_path, output_csv_path)

# Preview some predictions (last expression, so the notebook renders it as a table)
predictions = pd.read_csv(output_csv_path)
predictions.head()



FileNotFoundError: [Errno 2] No such file or directory: '../models/final_model.pkl'