## 1. Library Import

- 필요한 라이브러리를 불러옵니다.

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset
# directory used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-1.1.0-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==1.1.0 (from autogluon)
  Downloading autogluon.core-1.1.0-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==1.1.0 (from autogluon)
  Downloading autogluon.features-1.1.0-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.0/63.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.tabular[all]==1.1.0 (from autogluon)
  Downloading autogluon.tabular-1.1.0-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.5/308.5 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.multimodal==1.1.0 (from autogluon)
  Downloading autogluon.multimodal-1.1.0-py3-none-any.whl (427 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

## 2. Data Load

In [None]:
# Disabled: original CSV load, kept for reference only. The notebook now reads
# the Parquet copies of these files in a later cell. If re-enabled, pandas must
# be imported as `pd` before this cell runs.
# data_path = '/content/drive/MyDrive/Colab Notebooks/data/Dacon_CTR/'

# df = pd.read_csv(data_path + '/train.csv')
# test = pd.read_csv(data_path + '/test.csv')

# print(df.shape, test.shape)

In [None]:
# Disabled: one-time conversion that saves the pandas DataFrames as Parquet
# files (the files the data-loading cell reads).
# df.to_parquet(data_path + '/train.parquet')
# test.to_parquet(data_path + '/test.parquet')

In [None]:
import pandas as pd

# Dataset directory on the mounted Drive. It keeps its trailing slash because
# file paths in this notebook are built by plain string concatenation.
data_path = '/content/drive/MyDrive/Colab Notebooks/data/Dacon_CTR/'

# data_path already ends with '/', so file names are appended without a
# leading slash (avoids the '...Dacon_CTR//train.parquet' double separator).
df = pd.read_parquet(data_path + 'train.parquet')
test = pd.read_parquet(data_path + 'test.parquet')

# Quick sanity check of the loaded row/column counts
print(df.shape, test.shape)

(28605391, 41) (4538541, 40)


In [None]:
def create_balanced_dataframe(df, fraction=0.1, label='Click', random_state=42):
    """Undersample a binary-labelled frame to a shuffled 1:1 class ratio.

    Rows whose ``label`` value is 0 and rows whose value is 1 are sampled
    separately, without replacement, so that both classes contribute the same
    number of rows. Rows with any other label value are not included.

    Args:
        df: DataFrame that contains the ``label`` column.
        fraction: Requested size of the result relative to ``len(df)``. Each
            class gets ``int(len(df) * fraction / 2)`` rows, capped at the size
            of the smaller class, so the result can be smaller than requested.
        label: Name of the binary target column (values 0 and 1).
        random_state: Seed used for both per-class sampling and the final
            shuffle, which makes the result reproducible.

    Returns:
        A new DataFrame with equal counts of label 0 and label 1, rows in
        shuffled order, and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``fraction`` is negative.
        KeyError: If ``label`` is not a column of ``df``.
    """
    if fraction < 0:
        raise ValueError(f"fraction must be non-negative, got {fraction!r}")

    # Split the frame by class
    negatives = df[df[label] == 0]
    positives = df[df[label] == 1]

    # Rows to draw per class: the requested share, but never more than the
    # smaller class can supply
    per_class = min(int(len(df) * fraction / 2), len(negatives), len(positives))

    sampled_parts = [
        negatives.sample(n=per_class, random_state=random_state),
        positives.sample(n=per_class, random_state=random_state),
    ]

    # Concatenate, shuffle so the classes are interleaved, and renumber the rows
    return (
        pd.concat(sampled_parts)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

# Replace the training frame with its 1:1 balanced version (fraction=1 requests
# the full length, which the function caps at the smaller class size)
df = create_balanced_dataframe(df, fraction=1)

# Share of each 'Click' value in the balanced frame
ratio_df_balanced = df['Click'].value_counts(normalize=True)

# Report the new shape followed by the class proportions
print(df.shape, ratio_df_balanced, sep='\n')

(11139720, 41)
Click
1    0.5
0    0.5
Name: proportion, dtype: float64


## 3. Modeling

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.core as ag

# Target column and the metric used to score models
label, eval_metric = 'Click', 'roc_auc'

# Wrap the balanced training frame and the test frame for AutoGluon
train_data, test_data = TabularDataset(df), TabularDataset(test)

In [None]:
from autogluon.tabular import TabularPredictor

# Training budget in seconds (12 hours)
time_limit = 12 * 60 * 60

# Model families to skip; the stated intent is to drop models that cannot
# train on the GPU.
# NOTE(review): 'LR' is AutoGluon's linear model key. 'NN' may not match any
# model key in AutoGluon 1.x (neural nets appear to use 'NN_TORCH' / 'FASTAI')
# - confirm against the installed version.
exclude_model_types = [
    'KNN',  # K-Nearest Neighbors
    'RF',   # Random Forest
    'XT',   # Extra Trees
    'LR',   # Linear model
    'NN'    # Tabular neural network
]

# Options forwarded to fit().
# NOTE(review): bagging and stacking are explicitly disabled here even though
# the 'best_quality' preset is requested - confirm this combination is intended.
fit_options = dict(
    presets='best_quality',                    # quality preset
    num_stack_levels=0,                        # no stacking layers
    num_bag_folds=0,                           # no bagging
    time_limit=time_limit,                     # overall training time budget
    num_gpus=1,                                # train with one GPU
    excluded_model_types=exclude_model_types,  # model families to skip
)

# Create the predictor (models are saved under `path`) and train it
predictor = TabularPredictor(
    label=label,
    eval_metric=eval_metric,
    path='AutogluonModels/ag-20240518_080907',
).fit(train_data, **fit_options)


Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=0, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 16200.0s
AutoGluon will save models to "AutogluonModels/ag-20240518_080907"
AutoGluon Version:  1.1.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Apr 28 14:29:16 UTC 2024
CPU Count:          8
Memory Avail:       35.04 GB / 50.99 GB (68.7%)
Disk Space Avail:   163.51 GB / 201.23 GB (81.3%)
Train Data Rows:    11139720
Train Data Columns: 40
Label Column:       Click
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1, 0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Problem Type:       binary
Preprocessin

[1000]	valid_set's binary_logloss: 0.565481
[2000]	valid_set's binary_logloss: 0.562139
[3000]	valid_set's binary_logloss: 0.560707
[4000]	valid_set's binary_logloss: 0.559904
[5000]	valid_set's binary_logloss: 0.559414
[6000]	valid_set's binary_logloss: 0.559123
[7000]	valid_set's binary_logloss: 0.558924
[8000]	valid_set's binary_logloss: 0.55882
[9000]	valid_set's binary_logloss: 0.558703
[10000]	valid_set's binary_logloss: 0.558635


	0.7843	 = Validation score   (roc_auc)
	4819.05s	 = Training   runtime
	115.35s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 10842.37s of the 10842.35s of remaining time.
	Training LightGBM with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's binary_logloss: 0.564305
[2000]	valid_set's binary_logloss: 0.562129
[3000]	valid_set's binary_logloss: 0.561429
[4000]	valid_set's binary_logloss: 0.561232
[5000]	valid_set's binary_logloss: 0.561201
[6000]	valid_set's binary_logloss: 0.561422


	0.7817	 = Validation score   (roc_auc)
	2813.21s	 = Training   runtime
	36.01s	 = Validation runtime
Fitting model: CatBoost ... Training model for up to 7984.59s of the 7984.58s of remaining time.
	Training CatBoost with GPU, note that this may negatively impact model quality compared to CPU training.
	0.7688	 = Validation score   (roc_auc)
	1493.25s	 = Training   runtime
	3.88s	 = Validation runtime
Fitting model: NeuralNetFastAI ... Training model for up to 6478.74s of the 6478.73s of remaining time.
	Ran out of time, stopping training early. (Stopping on epoch 18)
	0.7627	 = Validation score   (roc_auc)
	6443.6s	 = Training   runtime
	1.4s	 = Validation runtime
Fitting model: XGBoost ... Training model for up to 30.05s of the 30.04s of remaining time.

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to

In [None]:
# Return the leaderboard as the cell's last expression so the notebook renders
# it as one table; print() wraps the wide frame into separate text fragments.
predictor.leaderboard(silent=True)

                 model  score_val eval_metric  pred_time_val      fit_time  \
0  WeightedEnsemble_L2   0.785520     roc_auc     156.671587  15571.708904   
1           LightGBMXT   0.784337     roc_auc     115.354186   4819.048686   
2             LightGBM   0.781697     roc_auc      36.012419   2813.206511   
3             CatBoost   0.768801     roc_auc       3.881580   1493.251512   
4      NeuralNetFastAI   0.762715     roc_auc       1.402467   6443.600036   
5              XGBoost   0.679288     roc_auc       1.493099    269.992987   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                0.020935           2.602160            2       True   
1              115.354186        4819.048686            1       True   
2               36.012419        2813.206511            1       True   
3                3.881580        1493.251512            1       True   
4                1.402467        6443.600036            1       True   
5                1.49

In [None]:
# Permutation importance measured on the rows the models were trained on is
# biased by overfitting, so the training frame is no longer passed in. With no
# data argument AutoGluon falls back to the data it cached during fit()
# (the held-out validation split when bagging is disabled).
# NOTE(review): requires the predictor's default cache_data=True - confirm.
predictor.feature_importance()

These features in provided data are not utilized by the predictor and will be ignored: ['ID']
Computing feature importance via permutation shuffling for 39 features using 5000 rows with 5 shuffle sets...
	3927.06s	= Expected runtime (785.41s per shuffle set)
	1608.39s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
F21,0.017692,0.001504,6e-06,5,0.020789,0.014595
F32,0.017458,0.002255,3.3e-05,5,0.022101,0.012815
F09,0.014055,0.001403,1.2e-05,5,0.016943,0.011167
F29,0.012887,0.001871,5.2e-05,5,0.01674,0.009034
F39,0.012277,0.002064,9.2e-05,5,0.016527,0.008026
F17,0.009846,0.001349,4.1e-05,5,0.012624,0.007069
F24,0.008041,0.00169,0.000221,5,0.01152,0.004561
F07,0.007771,0.000848,1.7e-05,5,0.009517,0.006024
F16,0.006892,0.000693,1.2e-05,5,0.008318,0.005465
F37,0.006116,0.001211,0.000175,5,0.008611,0.003622


## 4. Submission

In [None]:
# Name of the model with the best validation score. The `model_best` property
# replaces the deprecated `get_model_best()` call, which emits a warning here.
model_to_use = predictor.model_best

# Predicted class probabilities for every test row
prob_predictions = predictor.predict_proba(test_data, model=model_to_use)

  model_to_use = predictor.get_model_best()


In [None]:
# Load the submission template from the dataset directory and preview it
sample_submission_path = data_path + 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission.head()

Unnamed: 0,ID,Click
0,TEST_0000000,0
1,TEST_0000001,0
2,TEST_0000002,0
3,TEST_0000003,0
4,TEST_0000004,0


In [None]:
# Second column of the probability frame (presumably the Click == 1 class -
# verify against prob_predictions.columns). Converting to a plain array makes
# the assignment positional, so differing indexes between the prediction frame
# and the template cannot silently produce NaN rows.
click_probability = prob_predictions.iloc[:, 1].to_numpy()
assert len(click_probability) == len(submission), (
    "prediction count does not match the number of rows in the submission template"
)
submission["Click"] = click_probability

# data_path already ends with '/', so the file name is appended without a leading slash
submission.to_csv(data_path + 'Gluon_submission_WeightedEnsemble_L2_0603_2.csv', index=False, encoding="utf-8")