In [34]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

TARGET = 'mark'


def load_datasets(input_folder):
    datasets = {}
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            course_name = filename[:-4]
            df = pd.read_csv(f"{input_folder}/{filename}")
            datasets[course_name] = clean_dataset(df)

    return datasets

def clean_dataset(df):
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    df[TARGET] = le.fit_transform(df[TARGET])
    return df


def split_datasets(datasets):
    split_datasets = {}

    for course_name, df in datasets.items():
        X = df.drop(TARGET, axis=1)
        y = df[TARGET]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

        split_datasets[course_name] = {
            'x_train': X_train,
            'y_train': y_train,
            'x_test': X_test,
            'y_test': y_test,
        }
    return split_datasets


def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_pred)
    
    metrics = {
        'accuracy': accuracy,
        'f1_score': f1,
        'roc_auc': auc
    }
    
    return metrics

In [35]:
DATA_PATH = 'data'
datasets = load_datasets(DATA_PATH)

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

classifiers = {
    "RandomForest": {'model': RandomForestClassifier(), 'params': {'n_estimators': [100, 200], 'min_samples_leaf': [1, 2]}},
    "K-nearest-neighbor": {'model': KNeighborsClassifier(), 'params': {'n_neighbors': [2, 5], 'weights': ['uniform', 'distance'], 'leaf_size': [30, 50]}},
    "Artificial Neural Network": {'model': MLPClassifier(), 'params': {'hidden_layer_sizes': [(100,), (50, 50)], 'activation': ['tanh', 'relu'], 'max_iter': [200, 300]}},
    "Decision Tree": {'model': DecisionTreeClassifier(), 'params': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}},
    "Logistic Regression": {'model': LogisticRegression(), 'params': {'C': [0.5, 1, 1.5], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']}},
    "Support Vector Machine": {'model': SVC(), 'params': {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}},
    "Naive Bayes": {'model': GaussianNB(), 'params': {}},
    "XG-boost": {'model': XGBClassifier(), 'params': {'n_estimators': [50, 100], 'objective': ['binary:logistic'], 'learning_rate': [0.01, 0.1, 1.0]}},
}

overall_performance = {}

for course_name, data in split_datasets.items():
    print(f"Processing data for {course_name}...")
    
    X_train = data['x_train']
    y_train = data['y_train']
    X_test = data['x_test']
    y_test = data['y_test']
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    course_performance = {}
    
    for clf_name, clf_spec in classifiers.items():
        print(f"  Training {clf_name}...")
        
        grid_search = GridSearchCV(estimator=clf_spec['model'], param_grid=clf_spec['params'], cv=5, scoring='accuracy')
        grid_search.fit(X_train_scaled, y_train)
        
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        
        y_pred = best_model.predict(X_test_scaled)
        
        metrics = evaluate_model(y_test, y_pred)
        print(f"  Metrics for {clf_name}: {metrics}")
        
        course_performance[clf_name] = {'best_params': best_params, 'performance': metrics}
    
    overall_performance[course_name] = course_performance

print("\nOverall Classifier Performance:")
for course_name, performance in overall_performance.items():
    print(f"\n{course_name}")
    for clf_name, clf_performance in performance.items():
        print(f"{clf_name} with {clf_performance['best_params']}: {clf_performance['performance']}")


Processing data for 110...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.7894736842105263, 'f1_score': 0.7609649122807017, 'roc_auc': 0.5916666666666667}
  Training K-nearest-neighbor...
  Metrics for K-nearest-neighbor: {'accuracy': 0.631578947368421, 'f1_score': 0.6456946965113934, 'roc_auc': 0.49166666666666664}
  Training Artificial Neural Network...


  if is_sparse(data):


  Metrics for Artificial Neural Network: {'accuracy': 0.631578947368421, 'f1_score': 0.661350345560872, 'roc_auc': 0.5833333333333333}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.6842105263157895, 'f1_score': 0.7125506072874493, 'roc_auc': 0.7083333333333334}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.6842105263157895, 'f1_score': 0.7045112781954886, 'roc_auc': 0.6166666666666667}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.7368421052631579, 'f1_score': 0.758107389686337, 'roc_auc': 0.7416666666666667}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.631578947368421, 'f1_score': 0.661350345560872, 'roc_auc': 0.5833333333333333}
  Training XG-boost...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for XG-boost: {'accuracy': 0.8947368421052632, 'f1_score': 0.8804824561403509, 'roc_auc': 0.75}
Processing data for 111...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.5, 'f1_score': 0.3333333333333333, 'roc_auc': 0.5}
  Training K-nearest-neighbor...




  Metrics for K-nearest-neighbor: {'accuracy': 0.5, 'f1_score': 0.3333333333333333, 'roc_auc': 0.5}
  Training Artificial Neural Network...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for Artificial Neural Network: {'accuracy': 0.25, 'f1_score': 0.2, 'roc_auc': 0.25}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.5, 'f1_score': 0.5, 'roc_auc': 0.5}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.25, 'f1_score': 0.2, 'roc_auc': 0.25}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.5, 'f1_score': 0.3333333333333333, 'roc_auc': 0.5}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.5, 'f1_score': 0.5, 'roc_auc': 0.5}
  Training XG-boost...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for XG-boost: {'accuracy': 0.5, 'f1_score': 0.3333333333333333, 'roc_auc': 0.5}
Processing data for 218...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.7916666666666666, 'f1_score': 0.7944339025932954, 'roc_auc': 0.78125}
  Training K-nearest-neighbor...
  Metrics for K-nearest-neighbor: {'accuracy': 0.7916666666666666, 'f1_score': 0.7765567765567766, 'roc_auc': 0.71875}
  Training Artificial Neural Network...




  Metrics for Artificial Neural Network: {'accuracy': 0.6666666666666666, 'f1_score': 0.6666666666666666, 'roc_auc': 0.625}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.7083333333333334, 'f1_score': 0.7030303030303031, 'roc_auc': 0.65625}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.7083333333333334, 'f1_score': 0.7122074636306136, 'roc_auc': 0.6875}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.75, 'f1_score': 0.7394957983193278, 'roc_auc': 0.6875}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.75, 'f1_score': 0.7394957983193278, 'roc_auc': 0.6875}
  Training XG-boost...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for XG-boost: {'accuracy': 0.7083333333333334, 'f1_score': 0.7030303030303031, 'roc_auc': 0.65625}
Processing data for 29...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.8292682926829268, 'f1_score': 0.7739837398373984, 'roc_auc': 0.4857142857142857}
  Training K-nearest-neighbor...
  Metrics for K-nearest-neighbor: {'accuracy': 0.8048780487804879, 'f1_score': 0.7613711272247858, 'roc_auc': 0.4714285714285714}
  Training Artificial Neural Network...




  Metrics for Artificial Neural Network: {'accuracy': 0.8536585365853658, 'f1_score': 0.7862644415917843, 'roc_auc': 0.5}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.8780487804878049, 'f1_score': 0.873364354642266, 'roc_auc': 0.7214285714285713}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.8536585365853658, 'f1_score': 0.7862644415917843, 'roc_auc': 0.5}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.8536585365853658, 'f1_score': 0.7862644415917843, 'roc_auc': 0.5}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.8536585365853658, 'f1_score': 0.7862644415917843, 'roc_auc': 0.5}
  Training XG-boost...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for XG-boost: {'accuracy': 0.8780487804878049, 'f1_score': 0.8602294242120502, 'roc_auc': 0.6523809523809523}
Processing data for 46...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.6666666666666666, 'f1_score': 0.5333333333333333, 'roc_auc': 0.5}
  Training K-nearest-neighbor...
  Metrics for K-nearest-neighbor: {'accuracy': 0.3333333333333333, 'f1_score': 0.3333333333333333, 'roc_auc': 0.25}
  Training Artificial Neural Network...


Traceback (most recent call last):
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\isabe\Documents\TFM-J

  Metrics for Artificial Neural Network: {'accuracy': 0.6666666666666666, 'f1_score': 0.5333333333333333, 'roc_auc': 0.5}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.6666666666666666, 'f1_score': 0.5333333333333333, 'roc_auc': 0.5}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.3333333333333333, 'f1_score': 0.16666666666666666, 'roc_auc': 0.5}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.6666666666666666, 'f1_score': 0.5333333333333333, 'roc_auc': 0.5}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.6666666666666666, 'f1_score': 0.5333333333333333, 'roc_auc': 0.5}
  Training XG-boost...


6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\isabe\Documents\TFM-Jorge\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1227, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
          

  Metrics for XG-boost: {'accuracy': 0.6666666666666666, 'f1_score': 0.5333333333333333, 'roc_auc': 0.5}
Processing data for 88...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.6363636363636364, 'f1_score': 0.6363636363636364, 'roc_auc': 0.6239316239316239}
  Training K-nearest-neighbor...
  Metrics for K-nearest-neighbor: {'accuracy': 0.5909090909090909, 'f1_score': 0.5452289645838033, 'roc_auc': 0.5341880341880343}
  Training Artificial Neural Network...




  Metrics for Artificial Neural Network: {'accuracy': 0.5909090909090909, 'f1_score': 0.5934928229665072, 'roc_auc': 0.5854700854700855}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.4090909090909091, 'f1_score': 0.4128229665071771, 'roc_auc': 0.39743589743589747}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.5454545454545454, 'f1_score': 0.5332792207792209, 'roc_auc': 0.5128205128205128}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.6818181818181818, 'f1_score': 0.6664576802507836, 'roc_auc': 0.6452991452991452}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.4090909090909091, 'f1_score': 0.3044932079414839, 'roc_auc': 0.48290598290598286}
  Training XG-boost...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for XG-boost: {'accuracy': 0.5, 'f1_score': 0.503157894736842, 'roc_auc': 0.4914529914529914}
Processing data for 94...
  Training RandomForest...
  Metrics for RandomForest: {'accuracy': 0.65, 'f1_score': 0.636, 'roc_auc': 0.6313131313131313}
  Training K-nearest-neighbor...
  Metrics for K-nearest-neighbor: {'accuracy': 0.55, 'f1_score': 0.5051282051282051, 'roc_auc': 0.5202020202020202}
  Training Artificial Neural Network...




  Metrics for Artificial Neural Network: {'accuracy': 0.65, 'f1_score': 0.636, 'roc_auc': 0.6313131313131313}
  Training Decision Tree...
  Metrics for Decision Tree: {'accuracy': 0.65, 'f1_score': 0.636, 'roc_auc': 0.6313131313131313}
  Training Logistic Regression...
  Metrics for Logistic Regression: {'accuracy': 0.65, 'f1_score': 0.6150997150997151, 'roc_auc': 0.6212121212121212}
  Training Support Vector Machine...
  Metrics for Support Vector Machine: {'accuracy': 0.55, 'f1_score': 0.5051282051282051, 'roc_auc': 0.5202020202020202}
  Training Naive Bayes...
  Metrics for Naive Bayes: {'accuracy': 0.5, 'f1_score': 0.48989898989898994, 'roc_auc': 0.5151515151515152}
  Training XG-boost...


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


  Metrics for XG-boost: {'accuracy': 0.55, 'f1_score': 0.532, 'roc_auc': 0.5303030303030303}

Overall Classifier Performance:

110
RandomForest with {'min_samples_leaf': 1, 'n_estimators': 100}: {'accuracy': 0.7894736842105263, 'f1_score': 0.7609649122807017, 'roc_auc': 0.5916666666666667}
K-nearest-neighbor with {'leaf_size': 30, 'n_neighbors': 5, 'weights': 'uniform'}: {'accuracy': 0.631578947368421, 'f1_score': 0.6456946965113934, 'roc_auc': 0.49166666666666664}
Artificial Neural Network with {'activation': 'relu', 'hidden_layer_sizes': (100,), 'max_iter': 300}: {'accuracy': 0.631578947368421, 'f1_score': 0.661350345560872, 'roc_auc': 0.5833333333333333}
Decision Tree with {'criterion': 'gini', 'splitter': 'random'}: {'accuracy': 0.6842105263157895, 'f1_score': 0.7125506072874493, 'roc_auc': 0.7083333333333334}
Logistic Regression with {'C': 0.5, 'penalty': 'l2', 'solver': 'liblinear'}: {'accuracy': 0.6842105263157895, 'f1_score': 0.7045112781954886, 'roc_auc': 0.6166666666666667}
S

  if is_sparse(data):
  if is_sparse(data):


In [41]:
import matplotlib.pyplot as plt

overall_performance


{'110': {'RandomForest': {'best_params': {'min_samples_leaf': 1,
    'n_estimators': 100},
   'performance': {'accuracy': 0.7894736842105263,
    'f1_score': 0.7609649122807017,
    'roc_auc': 0.5916666666666667}},
  'K-nearest-neighbor': {'best_params': {'leaf_size': 30,
    'n_neighbors': 5,
    'weights': 'uniform'},
   'performance': {'accuracy': 0.631578947368421,
    'f1_score': 0.6456946965113934,
    'roc_auc': 0.49166666666666664}},
  'Artificial Neural Network': {'best_params': {'activation': 'relu',
    'hidden_layer_sizes': (100,),
    'max_iter': 300},
   'performance': {'accuracy': 0.631578947368421,
    'f1_score': 0.661350345560872,
    'roc_auc': 0.5833333333333333}},
  'Decision Tree': {'best_params': {'criterion': 'gini', 'splitter': 'random'},
   'performance': {'accuracy': 0.6842105263157895,
    'f1_score': 0.7125506072874493,
    'roc_auc': 0.7083333333333334}},
  'Logistic Regression': {'best_params': {'C': 0.5,
    'penalty': 'l2',
    'solver': 'liblinear'},
 