In [1]:
import warnings

import pandas as pd

from config import KAGGLE_DATA_PATH, KAGGLE_TARGET
from data_preprocessing import load_datasets
from oversampling import OversamplingTechniques

from eval import evaluate_models
from plotting import plot_results
from utils import load_from_pickle, save_to_pickle

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

# Keep the source location under a local name alongside the loaded data.
data_path = KAGGLE_DATA_PATH

# The empty list is passed straight through as the second positional
# argument of load_datasets; its meaning is defined in data_preprocessing
# (NOTE(review): confirm what an empty list selects).
datasets = load_datasets(
    data_path, [], KAGGLE_TARGET, oversampling=OversamplingTechniques.SMOTE
)

In [2]:
# Switch between reusing pickled results and recomputing them:
#   True  -> read the previously saved results from the pickle file
#   False -> run evaluate_models on the loaded datasets and save the outcome
use_overall_performance_cache = False

# Single definition of the pickle location so the load and save paths
# cannot drift apart.
overall_performance_cache_path = "results/overall_performance_kaggle.pkl"

if use_overall_performance_cache:
    overall_performance = load_from_pickle(overall_performance_cache_path)
else:
    overall_performance = evaluate_models(datasets, KAGGLE_TARGET)
    # Persist only freshly computed results; writing back what was just
    # loaded from the same file would be a redundant overwrite.
    save_to_pickle(overall_performance, overall_performance_cache_path)

# Bare last expression so the notebook displays the results.
overall_performance

Processing data for dataset...
  Training RandomForest...
{'bootstrap': False, 'max_depth': 20, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [None]:
# Hand the evaluation results to the project's plot_results helper (imported
# from the plotting module); what gets drawn is defined in that module.
plot_results(overall_performance)

In [None]:
# For each top-level entry of overall_performance: print a table with one row
# per classifier and write the same table to a CSV file under results/.
for course_name, course_performance in overall_performance.items():
    print(f"Performance Metrics for {course_name}")

    records = []
    for clf_name, clf_result in course_performance.items():
        scores = clf_result["performance"]
        # Identification columns go first so they lead the resulting table;
        # best_params is optional and stored as its string representation.
        record = {
            "Classifier": clf_name,
            "Best Params": str(clf_result.get("best_params", {})),
        }
        # Merge the metric entries in as additional columns
        # (e.g. 'AUC', 'F1', 'Accuracy').
        record.update(scores)
        records.append(record)

    df_metrics = pd.DataFrame(records)
    print(df_metrics)
    separator = "=" * 50
    print(f"\n{separator}\n")
    df_metrics.to_csv(
        f"results/{course_name}_performance_metrics.csv",
        index=False,
    )