In [5]:
import warnings

import pandas as pd

from config import MOODLE_DATA_PATH
from data_preprocessing import load_datasets
from oversampling import OversamplingTechniques

from eval import evaluate_models
from plotting import plot_results
from utils import load_from_pickle, save_to_pickle

# Silence every warning for the rest of the session (no category filter is given).
warnings.simplefilter(action="ignore")

# Load the datasets from the configured Moodle data path, passing the SMOTE
# oversampling option (how it is applied is defined in data_preprocessing).
data_path = MOODLE_DATA_PATH
datasets = load_datasets(data_path, OversamplingTechniques.SMOTE)

In [6]:
# Toggle: True reuses the pickled results of a previous run; False re-runs
# evaluate_models on `datasets` and refreshes the pickle.
use_overall_performance_cache = True
overall_performance_cache_path = "results/overall_performance.pkl"

if use_overall_performance_cache:
    overall_performance = load_from_pickle(overall_performance_cache_path)
else:
    overall_performance = evaluate_models(datasets)
    # Persist only freshly computed results: on a cache hit the file already
    # holds exactly this object, so rewriting it would be wasted I/O and would
    # needlessly touch a known-good cache file.
    save_to_pickle(overall_performance, overall_performance_cache_path)

overall_performance

{'110': {'RandomForest': {'best_params': {'bootstrap': False,
    'max_depth': 5,
    'max_features': None,
    'min_samples_leaf': 2,
    'min_samples_split': 10,
    'n_estimators': 100},
   'performance': {'accuracy': 0.9666666666666667,
    'f1_score': 0.9665544332210999,
    'roc_auc': 0.9642857142857143},
   'learning_curve': {'train_sizes': array([ 5, 17, 29, 41, 54]),
    'train_scores': array([[1.        , 0.6       , 0.6       , 0.6       , 0.6       ],
           [1.        , 0.82352941, 0.82352941, 0.82352941, 0.82352941],
           [0.96551724, 0.93103448, 0.96551724, 0.96551724, 0.96551724],
           [0.87804878, 0.87804878, 0.90243902, 0.92682927, 0.97560976],
           [0.90740741, 0.90740741, 0.92592593, 0.92592593, 0.88888889]]),
    'test_scores': array([[0.5       , 0.5       , 0.5       , 0.46153846, 0.46153846],
           [0.71428571, 0.85714286, 0.92857143, 0.69230769, 0.92307692],
           [0.78571429, 0.78571429, 0.85714286, 0.76923077, 0.69230769],
    

In [7]:
# Hand the per-course evaluation results to the project's plotting helper
# (plot_results, imported from the local `plotting` module).
plot_results(overall_performance)

In [8]:
for course_name, course_performance in overall_performance.items():
    print(f"Performance Metrics for {course_name}")

    # One record per classifier: its name, the stringified best parameters,
    # then every entry of its "performance" dict (accuracy, f1_score, roc_auc).
    data = []
    for clf_name, clf_performance in course_performance.items():
        metrics = clf_performance["performance"]
        data.append(
            {
                "Classifier": clf_name,
                # Classifiers without a "best_params" entry are shown as "{}".
                "Best Params": str(clf_performance.get("best_params", {})),
                **metrics,
            }
        )

    # Show the table for this course, then export it to a per-course CSV file.
    df_metrics = pd.DataFrame(data)
    print(df_metrics)
    print("\n" + "=" * 50 + "\n")
    df_metrics.to_csv(f"results/{course_name}_performance_metrics.csv", index=False)

Performance Metrics for 110
                  Classifier  \
0               RandomForest   
1         K-nearest-neighbor   
2  Artificial Neural Network   
3              Decision Tree   
4        Logistic Regression   
5     Support Vector Machine   
6                Naive Bayes   
7                   XG-boost   

                                         Best Params  accuracy  f1_score  \
0  {'bootstrap': False, 'max_depth': 5, 'max_feat...  0.966667  0.966554   
1  {'leaf_size': 30, 'n_neighbors': 5, 'weights':...  0.933333  0.932730   
2  {'activation': 'relu', 'hidden_layer_sizes': (...  0.966667  0.966554   
3          {'criterion': 'gini', 'splitter': 'best'}  0.866667  0.862963   
4  {'C': 0.5, 'penalty': 'l1', 'solver': 'libline...  0.866667  0.866667   
5     {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}  0.966667  0.966554   
6                                                 {}  0.800000  0.794570   
7  {'learning_rate': 0.01, 'n_estimators': 50, 'o...  0.966667  0.966554   