# Imports and configs

In [1]:
!pip install -U autogluon.tabular
!pip install ray==2.10.0

Collecting autogluon.tabular
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting scipy<1.13,>=1.5.4 (from autogluon.tabular)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn<1.4.1,>=1.3.0 (from autogluon.tabular)
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting autogluon.core==1.1.1 (from autogluon.tabular)
  Downloading autogluon.core-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.features==1.1.1 (from autogluon.tabular)
  Downloading autogluon.features-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.common==1.1.1 (from autogluon.core==1.1.1->autogluon.tabular)
  Downloading autogluon.common-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting botocor

In [2]:
# !pip uninstall -y scikit-learn
# !pip install -U scikit-learn
# import sklearn
# sklearn.__version__

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from autogluon.tabular import TabularPredictor
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import pickle
import shutil
import os

warnings.filterwarnings('ignore')

In [4]:
class CFG:
    """Notebook-wide settings: input file locations and training parameters."""

    # --- modelling parameters ---
    target = 'Depression'      # name of the label column
    seed = 42                  # random seed for the fold split
    n_folds = 5                # number of stratified folds
    time_limit = 5 * 60 * 60   # training budget: five hours, in seconds

    # --- input files ---
    train_path = '/kaggle/input/playground-series-s4e11/train.csv'
    test_path = '/kaggle/input/playground-series-s4e11/test.csv'
    sample_sub_path = '/kaggle/input/playground-series-s4e11/sample_submission.csv'
    original_data_path = '/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv'

# Loading data and predefining folds

In [5]:
%%time
# train = pd.read_csv(CFG.train_path, index_col='id')
# test = pd.read_csv(CFG.test_path, index_col='id')
# original = pd.read_csv(CFG.original_data_path)

train = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')
sample = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
original = pd.read_csv('/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv')

original['Depression'] = original['Depression'].map({
    'No': 0,
    'Yes': 1
})

train = train.drop('id',axis=1)
test = test.drop('id',axis=1)

train = pd.concat(objs=[train, original], ignore_index=True)

CPU times: user 646 ms, sys: 97 ms, total: 743 ms
Wall time: 980 ms


In [6]:
def clean_columns(df, column, valid_categories):
    """Replace every value of ``df[column]`` that is not whitelisted with 'Noise'.

    Args:
        df: DataFrame to clean; the column is overwritten on this object.
        column: name of the column to clean.
        valid_categories: container of values that are kept unchanged.

    Returns:
        The same ``df`` object that was passed in.
    """
    def _keep_or_noise(value):
        # Anything outside the whitelist (missing values included) is noise.
        if value in valid_categories:
            return value
        return 'Noise'

    cleaned = df[column].apply(_keep_or_noise)
    df[column] = cleaned
    return df

# Whitelists of accepted values; anything else in these columns becomes 'Noise'.
valid_sleep_duration = ["Less than 5 hours", "5-6 hours", "6-7 hours", "7-8 hours", "More than 8 hours"]
valid_dietary_habits = ['Healthy', 'Moderate', 'Unhealthy']

# Apply the same whitelist to the train and test frames, one column at a time.
for col_name, allowed_values in (
    ('Sleep Duration', valid_sleep_duration),
    ('Dietary Habits', valid_dietary_habits),
):
    train = clean_columns(train, col_name, allowed_values)
    test = clean_columns(test, col_name, allowed_values)

In [7]:
def removieNoise(df, columns, threshold=100, reference=None):
    """Collapse rare categories of the given columns into the label 'Other'.

    Args:
        df: DataFrame to clean; the listed columns are overwritten on this object.
        columns: names of the columns to process.
        threshold: a category is kept only if it occurs at least this many
            times in the frame used for counting.
        reference: optional DataFrame whose value counts decide which
            categories are kept. When omitted, ``df`` itself is counted.
            Passing the training frame here makes another frame use exactly
            the categories that survived in training.

    Returns:
        The same ``df`` object that was passed in.
    """
    counts_source = df if reference is None else reference

    for column in columns:
        value_counts = counts_source[column].value_counts()
        frequent_categories = set(value_counts[value_counts >= threshold].index)
        # value_counts() skips missing values, so they are never classified as
        # rare; leave them untouched here as well.
        df[column] = df[column].apply(
            lambda x: x if (x in frequent_categories or pd.isna(x)) else 'Other'
        )

    return df

rare_category_columns = ['Name', 'City', 'Profession', 'Degree']
train = removieNoise(train, rare_category_columns)
# Decide test's categories from train's counts, not test's own: otherwise a
# value can be kept in one frame and turned into 'Other' in the other.
test = removieNoise(test, rare_category_columns, reference=train)

In [8]:
# Every column except the label is treated as a string feature; missing values
# are spelled out as the literal 'None' before casting.
cat_c = train.columns[train.columns != 'Depression'].tolist()

train_features = train[cat_c].fillna('None')
train[cat_c] = train_features.astype('str')

test_filled = test.fillna('None')
test = test_filled.astype('str')

In [9]:
# Pre-assign every row to one of CFG.n_folds validation folds, stratified on the label.
skf = StratifiedKFold(n_splits=CFG.n_folds, random_state=CFG.seed, shuffle=True)
split = skf.split(train, train[CFG.target])

# The splitter yields row positions, so collect the fold ids in a positional
# array instead of going through label-based .loc (which is only equivalent
# while the index is exactly 0..n-1). This also keeps the fold ids integral.
fold_ids = np.full(len(train), -1, dtype=int)
for i, (_, val_index) in enumerate(split):
    fold_ids[val_index] = i
train['fold'] = fold_ids

# Fitting the predictor

In [10]:
# Settings for the AutoGluon predictor: binary task scored by ROC AUC, with the
# 'fold' column passed as `groups` and logging mirrored to logs.txt.
predictor_kwargs = dict(
    label=CFG.target,
    problem_type='binary',
    eval_metric='roc_auc',
    groups='fold',
    verbosity=2,
    log_to_file=True,
    log_file_path='logs.txt',
)
predictor = TabularPredictor(**predictor_kwargs)

No path specified. Models will be saved in: "AutogluonModels/ag-20241109_141222"


In [11]:
# predictor.fit_pseudolabel(
#     train_data=train,
#     pseudo_data=original,
#     time_limit=CFG.time_limit,
#     presets='best_quality',
#     excluded_model_types=['KNN', 'NN', 'XT', 'FASTAI', 'NN_TORCH'],
#     ag_args_fit={
#         'num_gpus': 1, 
#         'num_cpus': 4
#     }
# )

# Model families left out of the search.
excluded_models = ['KNN', 'NN', 'XT', 'FASTAI', 'NN_TORCH']

# Train with the 'best_quality' preset inside the time budget from CFG.
predictor.fit(
    train_data=train,
    presets='best_quality',
    excluded_model_types=excluded_models,
    time_limit=CFG.time_limit,
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 27 20:43:36 UTC 2024
CPU Count:          4
Memory Avail:       30.15 GB / 31.36 GB (96.2%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 4500s of the 18000s of rem

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fea5ee68a30>

In [12]:
# Show the model leaderboard with the validation score column colour-graded.
leaderboard = predictor.leaderboard(silent=True)
leaderboard.style.background_gradient(cmap='RdYlGn', subset=['score_val'])

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.976377,roc_auc,155.031338,10000.395332,0.027348,11.981486,3,True,27
1,WeightedEnsemble_L2,0.976343,roc_auc,99.581954,8320.629317,0.028736,7.794327,2,True,12
2,CatBoost_r177_BAG_L2,0.976185,roc_auc,152.277642,9148.584737,0.476778,236.445732,2,True,20
3,CatBoost_r9_BAG_L2,0.976135,roc_auc,152.643374,9070.548631,0.84251,158.409625,2,True,22
4,CatBoost_r137_BAG_L2,0.976111,roc_auc,152.595981,9187.201287,0.795117,275.062281,2,True,25
5,CatBoost_BAG_L2,0.976011,roc_auc,152.552102,9414.971274,0.751238,502.832268,2,True,17
6,XGBoost_BAG_L2,0.975933,roc_auc,153.684702,9593.558488,1.883838,681.419483,2,True,18
7,CatBoost_r177_BAG_L1,0.975719,roc_auc,1.580825,832.20729,1.580825,832.20729,1,True,8
8,LightGBM_r96_BAG_L2,0.975677,roc_auc,163.353412,8998.850323,11.552548,86.711318,2,True,23
9,CatBoost_BAG_L1,0.975615,roc_auc,3.021527,4102.265335,3.021527,4102.265335,1,True,5


# Visualizing the ensemble weights

In [13]:
# def get_ensemble_weights():
#     ensemble_weights = {}
#     with open('logs.txt', 'rb') as f:
#         prev_line = b''
#         for line in f.readlines():
#             if b'Ensemble Weights:' in line:
#                 try:
#                     name = prev_line.split(b'Fitting model: ')[1]
#                     name = name.split(b'...')[0]
#                     name = name.decode('utf-8')
#                     temp_ensemble_weights = line.split(b'Ensemble Weights: ')[1]
#                     temp_ensemble_weights = temp_ensemble_weights.split(b'\n')[0]
#                     temp_ensemble_weights = eval(temp_ensemble_weights)
#                     ensemble_weights[name] = temp_ensemble_weights
#                 except:
#                     continue
#             prev_line = line
#     return ensemble_weights

# get_ensemble_weights()

In [14]:
# ensemble_weights = get_ensemble_weights()

# for key, value in ensemble_weights.items():
#     plt.figure(figsize=(6, 6))
#     plt.pie(value.values(), labels=value.keys(), autopct='%1.1f%%', colors=sns.color_palette('Set2', len(value)))
#     plt.title(key)
#     plt.tight_layout()
#     plt.show()

# Collecting and saving OOF files

In [15]:
def save_pred_probs(pred_probs, cv_score, name, type, is_ensemble):
    """Pickle ``pred_probs`` to a file named after the model, kind and CV score.

    Args:
        pred_probs: object to serialise.
        cv_score: score embedded in the file name with six decimals.
        name: model name used as the file name prefix.
        type: 'oof' selects the ``oof_pred_probs`` directory; any other value
            selects ``test_pred_probs``. It is also embedded in the file name.
        is_ensemble: when true the file is written to the current directory
            instead of the per-kind directory.
    """
    if is_ensemble:
        target_dir = '.'
    elif type == 'oof':
        target_dir = 'oof_pred_probs'
    else:
        target_dir = 'test_pred_probs'

    file_path = f'{target_dir}/{name}_{type}_pred_probs_{cv_score:.6f}.pkl'
    with open(file_path, 'wb') as handle:
        pickle.dump(pred_probs, handle)

def save_submission(test_pred_probs, score):
    """Write ``submission.csv`` with hard labels derived from ``test_pred_probs``.

    The sample submission is read as a template and its target column is
    replaced by 1 where the probability exceeds 0.5 and 0 otherwise.

    Args:
        test_pred_probs: array-like of predicted probabilities.
        score: accepted for the caller's convenience; not used here.
    """
    submission = pd.read_csv(CFG.sample_sub_path)
    predicted_labels = np.where(test_pred_probs > 0.5, 1, 0)
    submission[CFG.target] = predicted_labels
    submission.to_csv('submission.csv', index=False)
    
# Make sure both output directories exist before any predictions are saved.
for output_dir in ('oof_pred_probs', 'test_pred_probs'):
    os.makedirs(output_dir, exist_ok=True)

In [16]:
# Per-model containers: model name -> out-of-fold / test predictions.
oof_pred_probs, test_pred_probs = {}, {}

In [17]:
best_model = predictor.model_best

# Keep the raw per-model prediction frames under their own name, so the
# `test_pred_probs` dict is never rebound and is not overwritten while it is
# still being read from inside the loop.
raw_test_pred_probs = predictor.predict_proba_multi(test)

for model in predictor.model_names():
    # Column 1 of each probability frame is taken as the score for this model.
    model_oof_pred_probs = predictor.predict_proba_oof(model).values[:, 1]
    model_test_pred_probs = raw_test_pred_probs[model].values[:, 1]

    cv_score = roc_auc_score(train[CFG.target], model_oof_pred_probs)

    # The best model's files go to the working directory (is_ensemble=True)
    # and it is the only one turned into a submission.
    is_best = model == best_model
    save_pred_probs(model_oof_pred_probs, cv_score, model, 'oof', is_best)
    save_pred_probs(model_test_pred_probs, cv_score, model, 'test', is_best)
    if is_best:
        save_submission(model_test_pred_probs, cv_score)

    oof_pred_probs[model] = model_oof_pred_probs
    test_pred_probs[model] = model_test_pred_probs

# Visualizing the results

In [18]:
# scores = {}
# split = StratifiedKFold(n_splits=CFG.n_folds, random_state=CFG.seed, shuffle=True).split(train, train[CFG.target])
# for fold_idx, (train_index, val_index) in enumerate(split):
#     for model in predictor.model_names():
#         fold_score = roc_auc_score(train.loc[val_index, CFG.target], oof_pred_probs[model][val_index])
# #         fold_score = accuracy_score(train.loc[val_index, CFG.target], np.where(oof_pred_probs[model][val_index]>0.5, 1, 0))
#         if model not in scores:
#             scores[model] = []
#         scores[model].append(fold_score)

In [19]:
# scores = pd.DataFrame(scores)
# mean_scores = scores.mean().sort_values(ascending=False)
# order = scores.mean().sort_values(ascending=False).index.tolist()

# min_score = mean_scores.min()
# max_score = mean_scores.max()
# padding = (max_score - min_score) * 0.5
# lower_limit = min_score - padding
# upper_limit = max_score + padding

# fig, axs = plt.subplots(1, 2, figsize=(15, scores.shape[1] * 0.4))

# sns.boxplot(data=scores, order=order, ax=axs[0], orient='h', palette='RdYlGn_r')
# axs[0].set_title('Fold AUC')
# axs[0].set_xlabel('')
# axs[0].set_ylabel('')

# barplot = sns.barplot(x=mean_scores.values, y=mean_scores.index, ax=axs[1], palette='RdYlGn_r')
# axs[1].set_title('Average Accuracy')
# axs[1].set_xlabel('')
# axs[1].set_xlim(left=lower_limit, right=upper_limit)
# axs[1].set_ylabel('')

# for i, score in enumerate(mean_scores.values):
#     barplot.text(score, i, round(score, 6), va='center')

# plt.tight_layout()
# plt.show()

In [20]:
# Remove the "AutogluonModels" directory tree and everything inside it.
# NOTE(review): presumably done to keep the notebook's output folder small;
# the fitted predictor cannot be reloaded from disk after this — confirm intent.
shutil.rmtree("AutogluonModels")