In [None]:
#!conda create -n ag python=3.10
#!conda activate ag
#!conda install -c conda-forge mamba
#!mamba install -c conda-forge -c pytorch -c nvidia autogluon "pytorch=*=*cuda*"
#!mamba install -c conda-forge "ray-tune >=2.6.3,<2.7" "ray-default >=2.6.3,<2.7"  # install ray for faster training

In [None]:
#!pip install autogluon==1.0.0
#!pip install --upgrade numpy pandas scipy
#!pip install numpy==1.26.4
#!pip install pyJoules
#!pip install mxnet-cu110
#!pip install jedi
#!pip install setuptools
#!pip install scikit-learn==1.3.0
#!pip install pandas==2.0.0
#!pip install fsspec==2023.1.0
#!pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#cls
# !pip install cudatoolkit


In [2]:
import torch

# Sanity-check the CUDA setup before any GPU experiment is launched.
cuda_ready = torch.cuda.is_available()
visible_gpus = torch.cuda.device_count()
print(cuda_ready)  # Should be True
print(visible_gpus)  # Should be > 0

True
1


In [None]:
import pandas as pd
import numpy as np
import logging
import os
import time
from sklearn.feature_selection import mutual_info_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score



# Deep Learning Part

In [None]:
# Experiment configuration.
FILENAME = "SPD"        # tag used in output directory and CSV names
DATA_PATH = "spd.csv"   # input file, resolved relative to the working directory
TARGET = "G3"           # column that is binned into the class label
KFOLD = 10              # number of bagging folds handed to AutoGluon

# Load dataset and strip spaces from the column names.
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.replace(' ', '')

# Bin the numeric grade into three classes. With right=False every interval is
# half-open, [lo, hi), so the last edge must lie ABOVE the top grade: an upper
# edge of 20 would turn a grade of exactly 20 into NaN instead of 'excellent'.
# NOTE(review): assumes grades lie on a 0-20 scale, as the original upper edge
# of 20 suggests -- confirm against the dataset description.
bins = [0, 10, 15, 21]
labels = ['failing', 'passing', 'excellent']
df[TARGET] = pd.cut(df[TARGET], bins=bins, labels=labels, right=False)

# Select the most relevant features and the label
features = ['Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2']
data = df[features + [TARGET]]

# Separate features and target
X = data.drop(columns=[TARGET])
y = data[TARGET]

# Feature Selection: Variance Filtering
# The threshold evaluates to 0.16; only columns whose variance exceeds it are kept.
selector_variance = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_variance = selector_variance.fit_transform(X)
variance_columns = X.columns[selector_variance.get_support()]

# Feature Selection: Mutual Information Filtering
# The MI estimator is stochastic, so the seed is fixed to keep the set of
# selected columns identical from one run to the next.
MI_SEED = 42
mi_scores = mutual_info_classif(X_variance, y, discrete_features='auto', random_state=MI_SEED)
mi_threshold = 0.01
mi_mask = mi_scores > mi_threshold
X_mi = X_variance[:, mi_mask]
mi_columns = variance_columns[mi_mask]

# Create a DataFrame with the selected features. The original row index is
# carried over so the label assignment below (which aligns on the index)
# pairs every row with its own label.
X_selected = pd.DataFrame(X_mi, columns=mi_columns, index=X.index)

# Create a new DataFrame with the selected features and the class label
df_selected = X_selected.copy()
df_selected[TARGET] = y

# Seed for the stratified 75/25 split, so the same rows are held out on every run.
SPLIT_SEED = 42
# Model families left out of every fit.
# NOTE(review): presumably this restricts the search to AutoGluon's
# neural-network learners -- confirm against the AutoGluon model keys.
EXCLUDED_MODEL_TYPES = ['RF', 'KNN', 'GBM', 'XGB', 'CAT', 'XT', 'LR']

# Rows without a target label can neither be stratified nor scored.
labelled_data = df_selected.dropna(subset=[TARGET])

# One run per (device, validation scheme) combination. Each run trains a
# predictor in its own directory and writes its metrics to a CSV whose name
# also records the wall-clock time of the run.
for gpu_available in [0, 1]:
    for validation_type in ['7525', 'kfold']:
        start_time = time.time()

        if validation_type == '7525':
            # Hold out 25% of the rows; they are never shown to fit().
            train_data, test_data = train_test_split(
                labelled_data,
                test_size=0.25,
                stratify=labelled_data[TARGET],
                random_state=SPLIT_SEED,
            )
        else:
            # k-fold: train on every row and score each row with the
            # out-of-fold prediction of the bagged models.
            train_data = labelled_data
            test_data = labelled_data

        path = f"GPU_{gpu_available}_{FILENAME}_DL_VALIDATION_{validation_type}"

        # Create the dir if not exist
        os.makedirs(path, exist_ok=True)
        predictor = TabularPredictor(label=TARGET, path=path, problem_type="multiclass")

        # num_gpus is passed explicitly in both cases so the "GPU_0" runs are
        # pinned to the CPU instead of relying on AutoGluon's default.
        predictor.fit(
            train_data,
            num_bag_folds=KFOLD,
            verbosity=2,
            num_gpus=gpu_available,
            excluded_model_types=EXCLUDED_MODEL_TYPES,
            presets="best_quality",
        )

        # Evaluation on rows the scoring models were not trained on
        if validation_type == '7525':
            y_pred = predictor.predict(test_data)
            y_prob = predictor.predict_proba(test_data)
        else:
            y_pred = predictor.predict_oof()
            y_prob = predictor.predict_proba_oof()
        # Align labels and probabilities on the rows that were actually predicted.
        y_true = test_data.loc[y_pred.index, TARGET]
        y_prob = y_prob.loc[y_pred.index]

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted')
        recall = recall_score(y_true, y_pred, average='weighted')
        f1 = f1_score(y_true, y_pred, average='weighted')

        # Build the one-hot truth matrix in the column order of y_prob, so each
        # probability column is compared with the indicator of the same class.
        class_order = list(y_prob.columns)
        y_true_onehot = np.column_stack(
            [(y_true == class_name).to_numpy() for class_name in class_order]
        ).astype(int)
        roc_auc = roc_auc_score(y_true_onehot, y_prob.to_numpy(), average='weighted', multi_class='ovr')

        df1 = pd.DataFrame({
            'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'],
            'Score': [accuracy, precision, recall, f1, roc_auc]
        })

        # NOTE(review): this cell used to call tracker_filtering.stop() here,
        # but no tracker is ever created in the notebook, so the call raised
        # NameError. Start an emissions tracker before fit() if emissions
        # figures are required again.
        end_time = time.time()
        execution_time_minutes = (end_time - start_time) / 60

        filename = f"GPU_{gpu_available}_{FILENAME}_DL_VALIDATION_{validation_type}_TIME_{execution_time_minutes:.2f}.csv"

        print(filename)
        df1.to_csv(filename, index=False)


[codecarbon INFO @ 17:43:57] [setup] RAM Tracking...
[codecarbon INFO @ 17:43:57] [setup] GPU Tracking...
[codecarbon INFO @ 17:43:57] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:43:57] [setup] CPU Tracking...
[codecarbon INFO @ 17:43:59] CPU Model on constant consumption mode: AMD Ryzen 9 5950X 16-Core Processor
[codecarbon INFO @ 17:43:59] >>> Tracker's metadata:
[codecarbon INFO @ 17:43:59]   Platform system: Windows-10-10.0.22631-SP0
[codecarbon INFO @ 17:43:59]   Python version: 3.11.4
[codecarbon INFO @ 17:43:59]   CodeCarbon version: 2.5.0
[codecarbon INFO @ 17:43:59]   Available RAM : 63.944 GB
[codecarbon INFO @ 17:43:59]   CPU count: 32
[codecarbon INFO @ 17:43:59]   CPU model: AMD Ryzen 9 5950X 16-Core Processor
[codecarbon INFO @ 17:43:59]   GPU count: 1
[codecarbon INFO @ 17:43:59]   GPU model: 1 x NVIDIA GeForce RTX 3090
[codecarbon INFO @ 17:43:59] Saving emissions data to file c:\Users\JAL\Documents\GitHub\Art1\100\SPD\emissions.csv
Verbosity: 2 (Standard Loggi

GPU_0_SPD_DL_VALIDATION_7525_TIME_13.52_EMISSION_0.005178.csv


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     NeuralNetTorch_r87_BAG_L1       0.876712   0.921875    accuracy        0.092806       0.043404    3.323743                 0.092806                0.043404           3.323743            1       True         26
1      NeuralNetTorch_r1_BAG_L1       0.876712   0.914931    accuracy        0.122042       0.050417    4.590214                 0.122042                0.050417           4.590214            1       True         43
2   NeuralNetFastAI_r100_BAG_L1       0.876712   0.913194    accuracy        0.192047       0.058938    9.168788                 0.192047                0.058938           9.168788            1       True         40
3   NeuralNetFastAI_r127_BAG_L1       0.876712   0.916667    accuracy        0.206367       0.045

GPU_0_SPD_DL_VALIDATION_kfold_TIME_14.55_EMISSION_0.010580.csv


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     NeuralNetTorch_r76_BAG_L1       0.907407   0.900463    accuracy        0.088116       0.043593    2.307365                 0.088116                0.043593           2.307365            1       True         33
1     NeuralNetTorch_r71_BAG_L1       0.907407   0.900463    accuracy        0.088327       0.042219    2.194861                 0.088327                0.042219           2.194861            1       True         27
2     NeuralNetTorch_r14_BAG_L1       0.907407   0.886574    accuracy        0.093893       0.031546    2.515980                 0.093893                0.031546           2.515980            1       True         12
3     NeuralNetTorch_r41_BAG_L1       0.907407   0.912037    accuracy        0.096930       0.045

GPU_1_SPD_DL_VALIDATION_7525_TIME_31.01_EMISSION_0.022043.csv


Leaderboard on holdout data (DyStack):
                          model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     NeuralNetTorch_r87_BAG_L1       0.876712   0.921875    accuracy        0.103109       0.049355    3.249698                 0.103109                0.049355           3.249698            1       True         26
1      NeuralNetTorch_r1_BAG_L1       0.876712   0.914931    accuracy        0.121565       0.054304    4.495353                 0.121565                0.054304           4.495353            1       True         43
2   NeuralNetFastAI_r127_BAG_L1       0.876712   0.916667    accuracy        0.200095       0.050686    4.579178                 0.200095                0.050686           4.579178            1       True         35
3   NeuralNetFastAI_r100_BAG_L1       0.876712   0.913194    accuracy        0.204402       0.064

GPU_1_SPD_DL_VALIDATION_kfold_TIME_33.67_EMISSION_0.034498.csv


In [7]:
# Count how many rows of the selected-feature frame fall into each grade class.
class_distribution_post = df_selected['G3'].value_counts()
print("Class Dist:", class_distribution_post, sep="\n")

NameError: name 'df_selected' is not defined