In [1]:
import pandas as pd
import numpy as np, os
import matplotlib.pyplot as plt
import seaborn as sn
import scipy
import torch
from sklearn.preprocessing import LabelEncoder,StandardScaler,OrdinalEncoder
from sklearn.model_selection import (
    train_test_split, GridSearchCV, StratifiedShuffleSplit,
    StratifiedKFold, train_test_split, cross_val_score, cross_validate
)
from sklearn.metrics import (
    roc_auc_score, roc_curve, auc, classification_report,
    confusion_matrix, accuracy_score, matthews_corrcoef
)
from autogluon.core.metrics import make_scorer
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings('ignore')

# Custom metric function
def custom_mcc_metric(y_true, y_pred):
    mcc = matthews_corrcoef(y_true, y_pred)
    return mcc

custom_mcc_scorer = make_scorer(name='mcc',
                                score_func=custom_mcc_metric,
                                greater_is_better=True)

print(torch.cuda.is_available())

True


  from .autonotebook import tqdm as notebook_tqdm


# Loading data

In [2]:
cd /workspace/data/

/workspace/data


In [3]:
train_data = pd.read_csv(os.path.join(os.getcwd(), "kaggle/playground-series-s4e8/train.csv"))
test_data = pd.read_csv(os.path.join(os.getcwd(), "kaggle/playground-series-s4e8/test.csv"))
sample_submission_data = pd.read_csv(os.path.join(os.getcwd(), "kaggle/playground-series-s4e8/sample_submission.csv"))

print("train_data :", train_data.shape)
print("test_data :", test_data.shape)
print("sample_submission_data :", sample_submission_data.shape)

train_data : (3116945, 22)
test_data : (2077964, 21)
sample_submission_data : (2077964, 2)


## <p style="background-color:#d8ecff; color: #009dff;margin:0; display:inline-block;padding:.4rem;border-radius:.25rem;border:1px solid #009dff">Handling NaN Values And Less Frequent Categories</p> 

In [4]:
# Index(['id', 'class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
#        'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
#        'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
#        'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
#        'habitat', 'season', 'veil-info'],
#       dtype='object')

# 결측치와 특성 존재 여부를 동시에 다룰 수 있음: 앞에서 0과 1로 veil-type의 존재 여부를 표현하고, 뒤에서 'unknown'을 통해 veil-color의 결측치를 처리함으로써 두 정보를 효과적으로 결합할 수 있습니다.
# 모델 학습에 유용한 정보 제공: 결측치나 특성의 존재 여부를 단순히 무시하지 않고, 이들을 결합하여 모델이 더 많은 패턴을 학습할 수 있게 도와줍니다.
# Index(['w', 'y', 'n', 'u', 'k', 'e', 'g', 'p', 'r', 'o', 's', 'a', 't', 'd',
#        'i', 'h', 'c', 'f', 'l', 'b', 'z', '8.25', '2.49', '3.32'],
#       dtype='object', name='veil-color')
# Index(['u', 'w', 'a', 'f', 'e', 'b', 'c', 'y', 'k', 'g', 'n', 's', 'r', 'd',
#        'p', 'h', 'i', 'l', 'is None', 't', '21.11', '5.94'],
#       dtype='object', name='veil-type')

# Index(['f10', 'tnone', 't1', 't7', 't10', 't4', 't2', 't5', 't3', 't9',
#        'fnone', 'f1', 'f2', 'f7', 'f4', 't8', 'f5', 'r2', 't6', 'f3', 'l4',
#        't0', 'p5', 'z7', 'c10', 'x10', 'f9', 'f6', 's10', 'm9', 'hnone', 's1',
#        'g3', 'g5', 'h7', 'e1', 'f0', 'r4', 'dnone', 's5', 'cnone', 'h1', 'p1',
#        'h5', 'h10', 'w3', 'y2', 'a10', 'ynone', 'e10', 'p7', '10.310', 's2',
#        'o2', 'g10', 'h2', 'g1', 's3', 'p3', 'knone', 'inone', 'nnone', 'rnone',
#        'l5', 'c1', 'n10', 'c3', 'o10', 'e4', 'd10', 'f has-ring10', 'lnone',
#        'c7', 'e3', 'y1', 'k10'],
#       dtype='object', name='has-ring-type')

train_data['veil-info'] = train_data['veil-type'].notna().astype(int).astype(str) + train_data['veil-color'].fillna('unknown')
cap_shape_mapping = {'b': 0, 'c': 1, 'x': 2, 'f': 3, 's': 4, 'p': 5, 'o': 6}
cap_color_mapping = {'n': 0, 'b': 1, 'g': 2, 'r': 3, 'p': 4, 'u': 5, 'e': 6, 'w': 7, 'y': 8, 'l': 9, 'o': 10, 'k': 11}
ring_type_mapping = {'c': 0, 'e': 1, 'r': 2, 'g': 3, 'l': 4, 'p': 5, 's': 6, 'z': 7, 'y': 8, 'm': 9, 'f': 10}
train_data['cap-shape'] = train_data['cap-shape'].map(cap_shape_mapping)
train_data['cap-color'] = train_data['cap-color'].map(cap_color_mapping)
train_data['has-ring-type'] = (
    train_data['has-ring'] + 
    train_data['ring-type'].fillna('none').map(ring_type_mapping).apply(lambda x: f"{x:.0f}" if pd.notna(x) else 'none').str.replace('.0', '', regex=False)
)

test_data['veil-info'] = test_data['veil-type'].notna().astype(int).astype(str) + test_data['veil-color'].fillna('unknown')
test_data['cap-shape'] = test_data['cap-shape'].map(cap_shape_mapping)
test_data['cap-color'] = test_data['cap-color'].map(cap_color_mapping)
test_data['has-ring-type'] = (
    test_data['has-ring'] + 
    test_data['ring-type'].fillna('none').map(ring_type_mapping).apply(lambda x: f"{x:.0f}" if pd.notna(x) else 'none').str.replace('.0', '', regex=False)
)

train_data = train_data.drop(['id', 'veil-color', 'veil-type', 'has-ring', 'ring-type'], axis=1)
test_data = test_data.drop(['id', 'veil-color', 'veil-type', 'has-ring', 'ring-type'], axis=1)

In [5]:
cat_feats = ['cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-root', 'stem-surface', 'stem-color', 'spore-print-color',
       'habitat', 'season', 'veil-info', 'has-ring-type']

for feat in cat_feats:
    train_data[feat] = train_data[feat].astype('category')
for feat in cat_feats:
    test_data[feat] = test_data[feat].astype('category')

def cleaning(df):
    threshold = 100
    for feat in cat_feats:
        if df[feat].dtype.name == 'category':
            # Add 'missing' and 'noise' to categories if not present
            if 'missing' not in df[feat].cat.categories:
                df[feat] = df[feat].cat.add_categories('missing')
            if 'noise' not in df[feat].cat.categories:
                df[feat] = df[feat].cat.add_categories('noise')
        else:
            # Convert to category and add new categories
            df[feat] = df[feat].astype('category')
            df[feat] = df[feat].cat.add_categories(['missing', 'noise'])
        
        # Fill missing values with 'missing'
        df[feat] = df[feat].fillna('missing')
        
        # Replace infrequent categories with 'noise'
        counts = df[feat].value_counts(dropna=False)
        infrequent_categories = counts[counts < threshold].index
        df[feat] = df[feat].apply(lambda x: 'missing' if x in infrequent_categories else x)
    
    return df

#
train_data = cleaning(train_data)
test_data = cleaning(test_data)

In [6]:
# Calculate group means for the train data
group_by_features = ['stem-width', 'stem-height']
group_means_train = train_data.groupby(group_by_features)['cap-diameter'].mean()

def fill_na_with_group_mean(row, group_means):
    if pd.isna(row['cap-diameter']):
        group = tuple(row[group_by_features])
        return group_means.get(group, np.nan)
    else:
        return row['cap-diameter']

train_data['cap-diameter'] = train_data.apply(fill_na_with_group_mean, axis=1, group_means=group_means_train)
test_data['cap-diameter'] = test_data.apply(fill_na_with_group_mean, axis=1, group_means=group_means_train)

# Calculate the mode from the training data
cap_diameter_mode = train_data['cap-diameter'].mode()[0]
stem_height_mode = train_data['stem-height'].mode()[0]

# Fill missing values in the training data
train_data['cap-diameter'] = train_data['cap-diameter'].fillna(cap_diameter_mode)
test_data['cap-diameter'] = test_data['cap-diameter'].fillna(cap_diameter_mode)

train_data['stem-height'] = train_data['stem-height'].fillna(stem_height_mode)
test_data['stem-height'] = test_data['stem-height'].fillna(stem_height_mode)

# Split Data

In [7]:
CONFIG = {
    "n_folds" : 5,
    "fold" : 3,
    "seed" : 4885,
    "drop_cols" : [],
    "target" : "class",
    "presets" : ['best_quality', "optimize_for_deployment"],# auto_stack=True 포함
    "binary_threshold" : 0.5,
}

skfold = StratifiedKFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG["seed"])

for fold, ( _, val_) in enumerate(skfold.split(train_data, train_data['class'])):
      train_data.loc[val_ , "kfold"] = int(fold)
    
def prepare_datasets(df, fold, drop_columns=[]):
    _df_train = df[df.kfold != fold].reset_index(drop=True)
    _df_valid = df[df.kfold == fold].reset_index(drop=True)

    # drop "id" column
    _df_train = _df_train.drop(columns=drop_columns)
    _df_valid = _df_valid.drop(columns=drop_columns)
    
    train_data = TabularDataset(_df_train)
    valid_data = TabularDataset(_df_valid)
    return train_data, valid_data

train_data, valid_data = prepare_datasets(
    train_data, CONFIG["fold"], drop_columns=CONFIG["drop_cols"]
)

# Initialize TabularPredictor with the custom MCC metric
predictor = TabularPredictor(
    label='class',
    problem_type='binary',
    eval_metric=custom_mcc_scorer,
    path='./kaggle/autogluon_models/',
)

predictor.fit(
    train_data,
    tuning_data=valid_data,
    save_space=True,
    presets=CONFIG["presets"],
    use_bag_holdout=True,
    ag_args_fit={'num_gpus': 1},
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #129-Ubuntu SMP Fri Aug 2 19:25:20 UTC 2024
CPU Count:          32
Memory Avail:       121.04 GB / 125.57 GB (96.4%)
Disk Space Avail:   1225.59 GB / 1712.40 GB (71.6%)
Presets specified: ['best_quality', 'optimize_for_deployment']
Setting dynamic_stacking from 'auto' to False. Reason: Skip dynamic_stacking when use_bag_holdout is enabled. (use_bag_holdout=True)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "./kaggle/autogluon_models/"
Train Data Rows:    2493556
Train Data Columns: 19
Tuning Data Rows:    623389
Tuning Data Columns: 19
Label Column:       class
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = p, class 0 = e
	Note: For your binary classificati

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fac3823dbb0>

In [8]:
predictor.leaderboard(valid_data, extra_metrics=[custom_mcc_scorer], silent=True)
predictor

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fac3823dbb0>

In [9]:
test_data = TabularDataset(test_data)
test_data['kfold'] = CONFIG["fold"]
y_test_pred = predictor.predict_proba(test_data).iloc[:, 1].values

In [10]:
id_column = sample_submission_data.pop('id')
# y_test_pred = predictor.predict(test_data)

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'id': id_column,
    'class': y_test_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv(f'./kaggle/submission_autog_0831_fold{CONFIG["fold"]}.csv', index=False)
print("Submission file created")

Submission file created
