In [1]:
#!conda create -n ag python=3.10
#!conda activate ag
#!conda install -c conda-forge mamba
#!mamba install -c conda-forge -c pytorch -c nvidia autogluon "pytorch=*=*cuda*"
#!mamba install -c conda-forge "ray-tune >=2.6.3,<2.7" "ray-default >=2.6.3,<2.7"  # install ray for faster training

In [2]:
# %pip install autogluon
#!pip install --upgrade numpy pandas scipy
#!pip install numpy==1.26.4
#!pip install pyJoules
#!pip install mxnet-cu110
#!pip install jedi
#!pip install setuptools
#!pip install scikit-learn==1.3.0
#!pip install pandas==2.0.0
#!pip install fsspec==2023.1.0
#!pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#cls
# !pip install cudatoolkit


In [3]:
import torch
print(torch.cuda.is_available())  # Should be True
print(torch.cuda.device_count())  # Should be > 0

True
1


In [4]:
import pandas as pd
import numpy as np
import logging
import os
import shutil
import time
import json
from autogluon.tabular import TabularPredictor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Deep Learning Part

In [5]:
FILENAME = "XAPI"
DATA_PATH = "xapi.csv"
TARGET = "Class"
KFOLD = 10  # Number of folds for cross-validation

# Load dataset
df = pd.read_csv(DATA_PATH)
# Remove spaces in column names (if any)
df.columns = df.columns.str.replace(' ', '')

# Separate features and target variable
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Create a DataFrame including features and target
df_selected = X.copy()
df_selected[TARGET] = y

# Check if CUDA (GPU) is available
gpu_available = 1 if torch.cuda.is_available() else 0

validation_type = 'kfold'
    
start_time = time.time()

# Define output directory
path = f"GPU_{gpu_available}_{FILENAME}_DL_VALIDATION_{validation_type}"
# If the directory already exists, delete it before creating a new one
if os.path.exists(path):
    shutil.rmtree(path)  # Remove previous results
#os.makedirs(path, exist_ok=True)

# Create AutoGluon predictor
predictor = TabularPredictor(
    label=TARGET, 
    path=path, 
    problem_type="multiclass",
)

# Fit models using AutoGluon with 10-fold bagging
if gpu_available:
    predictor.fit(
        df_selected,
        num_bag_folds=KFOLD,
        verbosity=2,
        num_gpus=1,
        excluded_model_types=['RF', 'KNN', 'GBM', 'XGB', 'CAT', 'XT', 'LR'],
        presets="best_quality"
    )
else:
    predictor.fit(
        df_selected,
        num_bag_folds=KFOLD,
        verbosity=2,
        excluded_model_types=['RF', 'KNN', 'GBM', 'XGB', 'CAT', 'XT', 'LR'],
        presets="best_quality"
    )


# Summary of cross-validation metrics
cv_scores = predictor.leaderboard(extra_info=True)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       45.11 GB / 63.94 GB (70.5%)
Disk Space Avail:   140.64 GB / 464.91 GB (30.3%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=10, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3600s of remaining time (25%).
	Run

In [6]:
# Get names of level 1 bagging models (corresponding to folds)
bagged_models = [m for m in predictor.get_model_names() if '_BAG_L1' in m]

# Sort them to match the 10 folds
bagged_models = sorted(bagged_models, key=lambda x: int(x.split('_F')[-1]) if '_F' in x else 0)

# List to store metrics per fold
metrics_by_fold = []

# Iterate over bagging models corresponding to the 10 folds
for i, model in enumerate(bagged_models[:10]):  # Only 10 folds
    y_pred = predictor.predict(df_selected, model=model)  # Class predictions
    y_prob = predictor.predict_proba(df_selected, model=model)  # Probability predictions
    
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, average='weighted')
    rec = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    
    # If the problem is multiclass, calculate ROC-AUC using one-vs-rest
    roc_auc = roc_auc_score(pd.get_dummies(y), y_prob, average='weighted', multi_class='ovr')

    metrics_by_fold.append({
        'fold': i+1, 'model': model, 'accuracy': acc, 'precision': prec, 
        'recall': rec, 'f1': f1, 'roc_auc': roc_auc
    })

# Convert to DataFrame
df_metrics_by_fold = pd.DataFrame(metrics_by_fold)
df_metrics_by_fold


  bagged_models = [m for m in predictor.get_model_names() if '_BAG_L1' in m]


Unnamed: 0,fold,model,accuracy,precision,recall,f1,roc_auc
0,1,NeuralNetFastAI_BAG_L1,0.822917,0.84273,0.822917,0.82059,0.945417
1,2,NeuralNetTorch_BAG_L1,0.94375,0.943772,0.94375,0.943755,0.987213
2,3,NeuralNetTorch_r79_BAG_L1,0.9625,0.96266,0.9625,0.962473,0.993416
3,4,NeuralNetFastAI_r191_BAG_L1,0.872917,0.875617,0.872917,0.872293,0.96205
4,5,NeuralNetTorch_r22_BAG_L1,0.897917,0.897979,0.897917,0.897926,0.975229
5,6,NeuralNetFastAI_r102_BAG_L1,0.935417,0.936691,0.935417,0.93556,0.988206
6,7,NeuralNetFastAI_r145_BAG_L1,0.908333,0.908658,0.908333,0.908449,0.981759
7,8,NeuralNetTorch_r30_BAG_L1,0.935417,0.935494,0.935417,0.935448,0.989438
8,9,NeuralNetTorch_r86_BAG_L1,0.970833,0.970913,0.970833,0.970799,0.995538
9,10,NeuralNetFastAI_r11_BAG_L1,0.916667,0.917488,0.916667,0.916386,0.979921


In [7]:

# Evaluate model on the full training dataset (not a true test set)
y_pred_final = predictor.predict(df_selected)
y_prob_final = predictor.predict_proba(df_selected)

accuracy_final = accuracy_score(df_selected[TARGET], y_pred_final)
precision_final = precision_score(df_selected[TARGET], y_pred_final, average='weighted')
recall_final = recall_score(df_selected[TARGET], y_pred_final, average='weighted')
f1_final = f1_score(df_selected[TARGET], y_pred_final, average='weighted')

# For multi-class ROC-AUC:
roc_auc_final = roc_auc_score(pd.get_dummies(df_selected[TARGET]), 
                                y_prob_final, 
                                average='weighted', 
                                multi_class='ovr')

# Store final evaluation metrics in a DataFrame
df_final_metrics = pd.DataFrame({
    'Metric': ['Accuracy (Final)', 'Precision (Final)', 'Recall (Final)', 'F1 Score (Final)', 'ROC AUC (Final)'],
    'Score': [accuracy_final, precision_final, recall_final, f1_final, roc_auc_final]
})

# Save results to CSV files
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60

filename_final = f"GPU_{gpu_available}_{FILENAME}_FINAL_RESULTS_{validation_type}_{execution_time_minutes:.2f}.csv"

df_final_metrics.to_csv(filename_final, index=False)

print("Final Metrics saved to:", filename_final)
print("CV Scores summary:", cv_scores)
print("Execution time (min):", execution_time_minutes)

Final Metrics saved to: GPU_1_XAPI_FINAL_RESULTS_kfold_60.70.csv
CV Scores summary:                           model  score_val eval_metric  pred_time_val  \
0           WeightedEnsemble_L3   0.885417    accuracy       3.332850   
1   NeuralNetFastAI_r134_BAG_L2   0.881250    accuracy       3.164241   
2   NeuralNetFastAI_r187_BAG_L2   0.879167    accuracy       3.162000   
3    NeuralNetFastAI_r65_BAG_L2   0.877083    accuracy       3.156638   
4   NeuralNetFastAI_r100_BAG_L2   0.877083    accuracy       3.169394   
..                          ...        ...         ...            ...   
85  NeuralNetFastAI_r191_BAG_L1   0.789583    accuracy       0.080106   
86  NeuralNetFastAI_r111_BAG_L1   0.785417    accuracy       0.100281   
87  NeuralNetFastAI_r160_BAG_L1   0.783333    accuracy       0.075609   
88  NeuralNetFastAI_r100_BAG_L1   0.779167    accuracy       0.095187   
89  NeuralNetFastAI_r194_BAG_L1   0.775000    accuracy       0.078150   

       fit_time  pred_time_val_marginal

In [8]:
# ==========================
# MODEL INSPECTION PART
# ==========================

# This script retrieves information about trained AutoGluon models, including:
# Listing all trained models.
# Extracting hyperparameters from a specific model.
# Getting detailed training information.
# Saving hyperparameters to a JSON file.
# Displaying a full model summary.

# ==========================
# List all trained models
# ==========================
# AutoGluon trains multiple models (e.g., stacked, bagged, ensembles).
# This command lists all trained models in the predictor.
model_names = predictor.get_model_names()
print("Available Models:", model_names)
# Example output: 
# ['WeightedEnsemble_L2', 
# 'LightGBMXT_BAG_L1', 
# 'LightGBM_BAG_L1', 
# 'NeuralNetMXNet_BAG_L1']

Available Models: ['NeuralNetFastAI_BAG_L1', 'NeuralNetTorch_BAG_L1', 'NeuralNetTorch_r79_BAG_L1', 'NeuralNetFastAI_r191_BAG_L1', 'NeuralNetTorch_r22_BAG_L1', 'NeuralNetFastAI_r102_BAG_L1', 'NeuralNetFastAI_r145_BAG_L1', 'NeuralNetTorch_r30_BAG_L1', 'NeuralNetTorch_r86_BAG_L1', 'NeuralNetFastAI_r11_BAG_L1', 'NeuralNetFastAI_r103_BAG_L1', 'NeuralNetTorch_r14_BAG_L1', 'NeuralNetFastAI_r143_BAG_L1', 'NeuralNetFastAI_r156_BAG_L1', 'NeuralNetFastAI_r95_BAG_L1', 'NeuralNetTorch_r41_BAG_L1', 'NeuralNetTorch_r158_BAG_L1', 'NeuralNetFastAI_r37_BAG_L1', 'NeuralNetTorch_r197_BAG_L1', 'NeuralNetFastAI_r134_BAG_L1', 'NeuralNetTorch_r143_BAG_L1', 'NeuralNetFastAI_r111_BAG_L1', 'NeuralNetTorch_r31_BAG_L1', 'NeuralNetFastAI_r65_BAG_L1', 'NeuralNetFastAI_r88_BAG_L1', 'NeuralNetTorch_r87_BAG_L1', 'NeuralNetTorch_r71_BAG_L1', 'NeuralNetTorch_r185_BAG_L1', 'NeuralNetFastAI_r160_BAG_L1', 'NeuralNetFastAI_r69_BAG_L1', 'NeuralNetFastAI_r138_BAG_L1', 'NeuralNetFastAI_r172_BAG_L1', 'NeuralNetTorch_r76_BAG_L1',

  model_names = predictor.get_model_names()


In [9]:
# ==========================
# Get hyperparameters of a specific model
# ==========================
best_model_name = predictor.get_model_best()  # Get the name of the best-performing model
print(f"Best model: {best_model_name}")

# Retrieve hyperparameters of the best-performing model
model_info = predictor.info()
best_model_hyperparameters = model_info['model_info'][best_model_name]['hyperparameters']
print(f"Hyperparameters of the best model: {best_model_hyperparameters}")
#Example output: Hyperparameters of the best model:
# {'use_orig_features': False, 
# 'max_base_models': 25, 
# 'max_base_models_per_type': 5, 
# 'save_bag_folds': True}

Best model: WeightedEnsemble_L3


  best_model_name = predictor.get_model_best()  # Get the name of the best-performing model


Hyperparameters of the best model: {'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}


In [10]:
# ==========================
# Get model-specific training details
# ==========================
# Retrieve additional information about the model (training time, memory usage, etc.)
model_name = best_model_name  # or specify any other model name
model_info = predictor.info()
training_details = model_info['model_info'][model_name]
print(f"Training details of {model_name}:")
print(training_details)

# Example output:
# {'name': 'WeightedEnsemble_L3', 'model_type': 'WeightedEnsembleModel', 'problem_type': 'multiclass', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 0.07663822174072266, 'num_classes': 3, 'quantile_levels': None, 'predict_time': 0.0010030269622802734, 'val_score': 0.8854166666666666, 'hyperparameters': {'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}, 'hyperparameters_fit': {}, 'hyperparameters_nondefault': ['save_bag_folds'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}, 'num_features': 6, 'features': ['NeuralNetTorch_r30_BAG_L2_1', 'NeuralNetFastAI_r134_BAG_L2_2', 'NeuralNetTorch_r30_BAG_L2_2', 'NeuralNetFastAI_r134_BAG_L2_1', 'NeuralNetTorch_r30_BAG_L2_0', 'NeuralNetFastAI_r134_BAG_L2_0'], 'feature_metadata': <autogluon.common.features.feature_metadata.FeatureMetadata object at 0x00000199481A5A10>, 'memory_size': 26017, 'compile_time': None, 'is_initialized': True, 'is_fit': True, 'is_valid': True, 'can_infer': True, 'bagged_info': {'child_model_type': 'GreedyWeightedEnsembleModel', 'num_child_models': 1, 'child_model_names': ['S1F1'], '_n_repeats': 1, '_k_per_n_repeat': [1], '_random_state': 3, 'low_memory': False, 'bagged_mode': False, 'max_memory_size': 26017, 'min_memory_size': 26017, 'child_hyperparameters': {'ensemble_size': 25, 'subsample_size': 1000000}, 'child_hyperparameters_fit': {'ensemble_size': 4}, 'child_ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}}, 'stacker_info': {'num_base_models': 2, 'base_model_names': ['NeuralNetTorch_r30_BAG_L2', 'NeuralNetFastAI_r134_BAG_L2']}, 'children_info': {'S1F1': {'name': 'S1F1', 'model_type': 'GreedyWeightedEnsembleModel', 'problem_type': 'multiclass', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 0.07663822174072266, 'num_classes': 3, 'quantile_levels': None, 'predict_time': None, 'val_score': None, 'hyperparameters': {'ensemble_size': 25, 'subsample_size': 1000000}, 'hyperparameters_fit': {'ensemble_size': 4}, 'hyperparameters_nondefault': [], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}, 'num_features': 6, 'features': ['NeuralNetTorch_r30_BAG_L2_0', 'NeuralNetTorch_r30_BAG_L2_1', 'NeuralNetTorch_r30_BAG_L2_2', 'NeuralNetFastAI_r134_BAG_L2_0', 'NeuralNetFastAI_r134_BAG_L2_1', 'NeuralNetFastAI_r134_BAG_L2_2'], 'feature_metadata': <autogluon.common.features.feature_metadata.FeatureMetadata object at 0x00000199481A7690>, 'memory_size': 7343, 'compile_time': None, 'is_initialized': True, 'is_fit': True, 'is_valid': True, 'can_infer': True, 'model_weights': {'NeuralNetTorch_r30_BAG_L2': 0.25, 'NeuralNetFastAI_r134_BAG_L2': 0.75}}}}

Training details of WeightedEnsemble_L3:
{'name': 'WeightedEnsemble_L3', 'model_type': 'WeightedEnsembleModel', 'problem_type': 'multiclass', 'eval_metric': 'accuracy', 'stopping_metric': 'accuracy', 'fit_time': 0.06194448471069336, 'num_classes': 3, 'quantile_levels': None, 'predict_time': 0.0010030269622802734, 'val_score': 0.8854166666666666, 'hyperparameters': {'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}, 'hyperparameters_fit': {}, 'hyperparameters_nondefault': ['save_bag_folds'], 'ag_args_fit': {'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}, 'num_features': 6, 'features': ['NeuralNetTorch_r30_BAG_L2_1

In [11]:
# ==========================
# Retrieve full model summary
# ==========================
# Display a leaderboard with all trained models, including:
# - Train time
# - Validation accuracy
# - Performance metrics
summary = predictor.leaderboard(extra_info=True)
print("Full model summary:")
print(summary)

Full model summary:
                          model  score_val eval_metric  pred_time_val  \
0           WeightedEnsemble_L3   0.885417    accuracy       3.332850   
1   NeuralNetFastAI_r134_BAG_L2   0.881250    accuracy       3.164241   
2   NeuralNetFastAI_r187_BAG_L2   0.879167    accuracy       3.162000   
3    NeuralNetFastAI_r65_BAG_L2   0.877083    accuracy       3.156638   
4   NeuralNetFastAI_r100_BAG_L2   0.877083    accuracy       3.169394   
..                          ...        ...         ...            ...   
85  NeuralNetFastAI_r191_BAG_L1   0.789583    accuracy       0.080106   
86  NeuralNetFastAI_r111_BAG_L1   0.785417    accuracy       0.100281   
87  NeuralNetFastAI_r160_BAG_L1   0.783333    accuracy       0.075609   
88  NeuralNetFastAI_r100_BAG_L1   0.779167    accuracy       0.095187   
89  NeuralNetFastAI_r194_BAG_L1   0.775000    accuracy       0.078150   

       fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  \
0   1503.659126             

In [12]:
predictor.feature_importance(df_selected)


Computing feature importance via permutation shuffling for 16 features using 480 rows with 5 shuffle sets...
	661.76s	= Expected runtime (132.35s per shuffle set)
	399.49s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
StudentAbsenceDays,0.22,0.015632,3e-06,5,0.252186,0.187814
Topic,0.082917,0.010765,3.3e-05,5,0.105082,0.060751
Relation,0.082917,0.006815,5e-06,5,0.096948,0.068885
raisedhands,0.071667,0.01651,0.000315,5,0.10566,0.037673
VisITedResources,0.063333,0.011748,0.000136,5,0.087523,0.039144
gender,0.060833,0.009247,6.2e-05,5,0.079873,0.041794
PlaceofBirth,0.044583,0.006002,3.8e-05,5,0.056942,0.032225
Discussion,0.044167,0.005781,3.4e-05,5,0.05607,0.032263
ParentAnsweringSurvey,0.04375,0.007065,7.9e-05,5,0.058297,0.029203
GradeID,0.04375,0.004886,1.8e-05,5,0.05381,0.03369


In [13]:
# ==========================
# WILCOXON SIGNED-RANK TEST
# ==========================

from scipy.stats import wilcoxon
import pandas as pd

# Reference values from Yu et al.
yu_et_al_metrics = {
    'accuracy': 0.7646,
    'precision': 0.6165,
    'recall': 0.6277,
    'f1': 0.6216,
    'roc_auc': None  # Not reported
}

# Dictionary to store Wilcoxon test results
wilcoxon_results = []

# Perform Wilcoxon Signed-Rank Test for each metric
for metric in ['accuracy', 'precision', 'recall', 'f1']:
    if yu_et_al_metrics[metric] is not None:
        # Compute paired differences
        differences = df_metrics_by_fold[metric] - yu_et_al_metrics[metric]
        # Compute Wilcoxon test
        stat, p_value = wilcoxon(differences)
                
        # Compute descriptive statistics
        median_diff = np.median(differences)
        mean_diff = np.mean(differences)
        std_diff = np.std(differences)

        # Store results
        wilcoxon_results.append({
            'metric': metric,
            'statistic': stat,
            'p-value': p_value,
            'median_difference': median_diff,
            'mean_difference': mean_diff,
            'std_difference': std_diff
        })

# Convert results to DataFrame
df_wilcoxon = pd.DataFrame(wilcoxon_results)
df_wilcoxon

Unnamed: 0,metric,statistic,p-value,median_difference,mean_difference,std_difference
0,accuracy,0.0,0.001953,0.161442,0.152067,0.041947
1,precision,0.0,0.001953,0.309991,0.3027,0.037483
2,recall,0.0,0.001953,0.298342,0.288967,0.041947
3,f1,0.0,0.001953,0.304317,0.294768,0.042532
