## Version 2 - Refactored Implementation
We need to refactor this solution to be more maintainable and production-ready. We'll:
1. Move code into proper modules
2. Add MLflow tracking
3. Implement proper code quality checks

Enable the `autoreload` extension: the code now lives in separate project modules, and edits to those modules should be picked up without restarting the kernel.

In [14]:
# Fetch the configured Kaggle dataset and unpack it into the raw-data folder.
from pathlib import Path

from kaggle.api.kaggle_api_extended import KaggleApi
from loguru import logger

from DSML.config import DATASET, PROCESSED_DATA_DIR, RAW_DATA_DIR

# Destination directory for the downloaded files.
download_folder = Path(RAW_DATA_DIR)

# Authenticate first, then download and unzip the dataset.
api = KaggleApi()
api.authenticate()
api.dataset_download_files(
    DATASET,
    path=str(download_folder),
    unzip=True,
)

# Log the directory the dataset was written to.
logger.info(f"RAW_DATA_DIR is: {RAW_DATA_DIR}")

2025-09-26 20:17:33.562 | INFO     | DSML.config:<module>:14 - PROJ_ROOT path is: D:\sdev\wwsi\arisa\MLOps-on-new-dataset


Dataset URL: https://www.kaggle.com/datasets/gabrielluizone/high-school-alcoholism-and-academic-performance


2025-09-26 20:17:34.816 | INFO     | __main__:<module>:10 - RAW_DATA_DIR is: D:\sdev\wwsi\arisa\MLOps-on-new-dataset\dataset\raw


In [9]:
# Load IPython's autoreload extension so that edits to the imported project
# modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
import os
import sys

# Make the parent of the current working directory importable so the DSML
# package can be resolved from this notebook. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate sys.path entry.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from DSML.preproc import get_raw_data
from DSML.config import RAW_DATA_DIR, target, categorical

In [11]:
# Show which Python interpreter this kernel is running on.
import sys

interpreter_path = sys.executable
print(interpreter_path)

d:\sdev\wwsi\arisa\MLOps-on-new-dataset\.venv\Scripts\python.exe


In [13]:
import mlflow

from DSML.train import get_or_create_experiment, run_hyperopt

# Select (or create) the MLflow experiment that the tuning runs are logged to.
experiment_id = get_or_create_experiment("alcohol_hyperparam_tuning")
mlflow.set_experiment(experiment_id=experiment_id)

# Load the raw data, then split off the label column. pop() removes the column
# in place, so X and df_train refer to the same (label-free) object afterwards.
# NOTE(review): `target` is imported from DSML.config in the imports cell but
# the label name is hard-coded here -- confirm the two agree.
df_train = get_raw_data()
y = df_train.pop('Alcoholic')
X = df_train

# Positional indices of the configured categorical columns present in X.
categorical_indices = [
    X.columns.get_loc(col)
    for col in categorical
    if col in X.columns
]

# Run the hyperparameter search; it returns the path of the saved best parameters.
best_params_path = run_hyperopt(X, y, categorical_indices, overwrite=True)

KeyboardInterrupt: 

In [None]:
import joblib

from DSML.train import train_cv

# Number of cross-validation folds passed to train_cv.
n_folds = 5

# Load the parameters saved by the hyperparameter search cell.
params = joblib.load(best_params_path)
print("Best parameters:", params)

# Cross-validate with those parameters; the result is a path that the
# plotting cell reads back as CSV.
cv_output_path = train_cv(X, y, categorical_indices, params, n=n_folds)

In [None]:
import pandas as pd

from DSML.train import plot_error_scatter

# Read back the cross-validation results written by train_cv.
cv_results = pd.read_csv(cv_output_path)

# Plot F1 score. x / y / err are not passed, so plot_error_scatter's own
# defaults select the columns.
# NOTE(review): presumably the defaults point at the F1 columns -- confirm.
# The fold count in the title comes from n_folds (set in the CV cell) so the
# title cannot drift from the value actually used.
plot_error_scatter(
    df_plot=cv_results,
    name="Mean F1 Score",
    title=f"Cross-Validation (N={n_folds}) Mean F1 score with Error Bands",
    xtitle="Training Steps",
    ytitle="Performance Score",
    yaxis_range=[0.5, 1]
)

# Plot logloss, with the mean and std columns named explicitly.
plot_error_scatter(
    df_plot=cv_results,
    x="iterations",
    y="test-Logloss-mean",
    err="test-Logloss-std",
    name="Mean logloss",
    title=f"Cross-Validation (N={n_folds}) Mean Logloss with Error Bands",
    xtitle="Training Steps",
    ytitle="Logloss"
)

In [None]:
from DSML.predict import explain_predictions, predict_and_save
from DSML.train import train

# Train the final model with the tuned parameters.
model_path, model_params_path = train(X, y, categorical_indices, params)

# Generate SHAP plots for the trained model.
explain_predictions(model_path, X)

# Make predictions and save them.
# NOTE(review): `df_ids` is not defined anywhere in this section of the
# notebook -- confirm an earlier cell creates it; otherwise this line raises
# NameError on a fresh Restart & Run All.
predictions_path = predict_and_save(model_path, X, df_ids)
print(f"Predictions saved to: {predictions_path}")