In [1]:
import pandas as pd
import os
import sys

In [2]:

# Put the parent of the current working directory on the import path so the
# `src` package imported by later cells can be resolved.
path = os.path.abspath("../")
if path not in sys.path:
    sys.path.append(path)

Load Processed Data

In [3]:
import pandas as pd

# Read the train/test feature and target CSVs from ../data/processed.
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")

# squeeze("columns") turns a one-column target frame into a Series but, unlike
# a bare squeeze(), never collapses a one-row file all the way down to a scalar.
# Assumes each y_*.csv holds a single target column -- TODO confirm.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze("columns")
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze("columns")

# Fail fast if features and targets are out of step with each other.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

Load Preprocessing Pipeline (for consistency)

In [4]:
# Load the preprocessing pipeline through the project helper.
# NOTE(review): `preprocessing_pipeline` is not referenced by any later cell
# visible in this notebook; the models below are fed X_train / X_test exactly
# as read from data/processed. Presumably those CSVs are already transformed --
# confirm, or drop this cell if the pipeline is not needed here.
from src.models.model_training import load_preprocessing_pipeline

preprocessing_pipeline = load_preprocessing_pipeline()


Model Evaluation & Selection

In [5]:
from src.models.model_training import (
    get_candidate_models,
    evaluate_models,
    select_best_model
)

# Candidate estimators, keyed by name (the result is indexed by model name later on).
models = get_candidate_models()

# Score every candidate; the result is a dict of per-model metric dicts.
# NOTE(review): the test split is passed in here and the returned metrics are
# later used to pick the best model. Confirm the selection metric is not
# computed on the test split, otherwise the final test score is optimistic.
evaluation_results = evaluate_models(
    models=models, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
)

evaluation_results


{'LogisticRegression': {'cv_mean_accuracy': np.float64(0.826905759975159),
  'cv_std_accuracy': np.float64(0.041857683774783765),
  'test_accuracy': 0.7676056338028169,
  'precision': 0.75,
  'recall': 0.8028169014084507,
  'f2_score': 0.7916666666666666,
  'roc_auc': np.float64(0.8698670898631224),
  'confusion_matrix': [[52, 19], [14, 57]]},
 'RandomForest': {'cv_mean_accuracy': np.float64(0.8870672255860891),
  'cv_std_accuracy': np.float64(0.04099149368422021),
  'test_accuracy': 0.8732394366197183,
  'precision': 0.8117647058823529,
  'recall': 0.971830985915493,
  'f2_score': 0.9349593495934959,
  'roc_auc': np.float64(0.9775838127355684),
  'confusion_matrix': [[55, 16], [2, 69]]},
 'SVM': {'cv_mean_accuracy': np.float64(0.8163173420276355),
  'cv_std_accuracy': np.float64(0.027131181256450292),
  'test_accuracy': 0.7816901408450704,
  'precision': 0.7380952380952381,
  'recall': 0.8732394366197183,
  'f2_score': 0.842391304347826,
  'roc_auc': np.float64(0.8887125570323349),
  

Select and Train Best Model

In [6]:
from src.models.model_training import train_final_model

# Ask the project helper which candidate wins on the "f2_score" metric.
best_model_name = select_best_model(evaluation_results, metric="f2_score")

# Hand the chosen candidate and the training split to train_final_model and
# keep whatever it returns as the final model.
best_model = train_final_model(models[best_model_name], X_train, y_train)

best_model_name


'RandomForest'

Final Test Evaluation

In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Score the final model on the test split.
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# classification_report returns a pre-formatted multi-line string: printing it
# renders the table, whereas echoing it inside a tuple shows an escaped
# one-line repr full of "\n" sequences.
print(f"Test accuracy: {test_accuracy:.4f}")
print(report)


(0.8732394366197183,
 '              precision    recall  f1-score   support\n\n        Fail       0.96      0.77      0.86        71\n        Pass       0.81      0.97      0.88        71\n\n    accuracy                           0.87       142\n   macro avg       0.89      0.87      0.87       142\nweighted avg       0.89      0.87      0.87       142\n')

Save Model and Metadata

In [8]:
from src.models.model_training import save_model_and_metadata

# Descriptive fields and evaluation numbers to store next to the model.
# NOTE(review): "cv_results" carries the whole evaluation_results dict, which
# (per the output displayed above) also contains test-set metrics, not only
# cross-validation scores.
metadata = dict(
    model_name=best_model_name,
    problem_type="classification",
    target="Pass_Fail",
    cv_results=evaluation_results,
    test_accuracy=test_accuracy,
)

save_model_and_metadata(best_model, metadata)
