### Import

In [1]:
import os
import random

import category_encoders as ce
import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.express as px
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from tqdm import tqdm
from xgboost import XGBClassifier

In [2]:
# Fix the random seeds so that repeated runs give the same results.
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG, and set PYTHONHASHSEED.

    Args:
        seed: Seed value handed to `random.seed` and `np.random.seed`; its
            string form is stored in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # pin the seed to 42

In [3]:
# Read the provided train and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Optional: shrink the training set to speed up experimentation (disabled)
# train = train.sample(100000)
# train.info()

### EDA 1 : Sparse and Dense

In [5]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_00000000,1,NSLHFNS,AVKQTCL,DTZFPRW,114.0,ISVXFVA,1,PQZBVMG,LPYPUNA,...,NZGEZLW,GTISJWW,380.0,2.0,AXQFZWC,IRUDRFB,,TFJMLCZ,0.0,AURZYDY
1,TRAIN_00000001,0,VGIVWZQ,LSUSMVO,PQGWFJZ,26.0,NFRVLWS,43,IMPIGJT,MIGYEEG,...,NZGEZLW,GTISJWW,466.0,1.0,DRVVDHZ,IRUDRFB,19.0,AUGTURV,0.0,LUZRMLU
2,TRAIN_00000002,0,JCDXFYU,PILDDJU,IAGJDOH,119.0,LFPUEOV,0,FFUTIRZ,OFKQGTY,...,VHXETCF,KHZNEZF,197.0,0.0,QMOULXS,IRUDRFB,8.0,ZVSTLNM,0.0,MHBRSQK
3,TRAIN_00000003,1,PSMFWTP,ZYAVJHP,,15.0,ATQPZSJ,26,ZDTZNSB,THBWWCD,...,IVIRTPR,GTISJWW,8640.0,0.0,IZLJUJS,IRUDRFB,14.0,ZBSRLCQ,0.0,GAZBSSZ
4,TRAIN_00000004,0,SLCRICD,QPQWGXA,,13.0,CHZGJZR,20,PQZBVMG,MIGYEEG,...,NZGEZLW,WHSRKIM,41774.0,0.0,BHBIZCL,IRUDRFB,13.0,QHYLSBX,0.0,QTATWAY


### EDA 2 : Imbalance

In [6]:
# Class balance of the target, as fractions of all rows. value_counts orders
# its result by frequency, so sort by class label to keep the bar order
# independent of which class happens to be the majority.
click = train['Click'].value_counts(normalize=True).sort_index()

# Derive each bar's label from the class value it belongs to instead of
# relying on the positional order of the counts.
click_names = {0: 'Not Clicked : 0', 1: 'Clicked : 1'}

click_figure = px.bar(
    x=[click_names.get(value, str(value)) for value in click.index],
    y=click.values.tolist(),
    labels={'x': 'Value', 'y': 'Percentage'},
    width=450,
    height=500,
)

# Render the chart.
click_figure.show()

### Data Preprocessing 1 : Select x, y

In [7]:
# Target: the Click column of the training frame.
train_y = train['Click']

# Features: everything except the row identifier (and, for train, the target).
train_x = train.drop(['ID', 'Click'], axis='columns')
test_x = test.drop(['ID'], axis='columns')

### Data Preprocessing 2 : Fill NaN

In [8]:
# Replace missing values with 0, one column at a time.
#
# The filled Series is assigned back to the frame rather than calling
# `fillna(..., inplace=True)` on `frame[col]`: that chained in-place form acts
# on an intermediate object, is deprecated by pandas, and leaves the frame
# untouched when Copy-on-Write is enabled.
#
# Train and test are checked independently, so a column that has NaN only in
# the test set is filled as well.
for col in tqdm(train_x.columns):
    if train_x[col].isna().any():
        train_x[col] = train_x[col].fillna(0)
    if test_x[col].isna().any():
        test_x[col] = test_x[col].fillna(0)

100%|██████████| 39/39 [00:36<00:00,  1.06it/s]


### Data Preprocessing 3 : Count Encoding

In [9]:
# Columns stored with object dtype are the ones that get count-encoded.
encoding_target = [
    col_name
    for col_name, col_dtype in train_x.dtypes.items()
    if col_dtype == "object"
]

# Fit the encoder on the training features only, then apply the same fitted
# encoder to both train and test.
enc = ce.CountEncoder(cols=encoding_target)
enc.fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

### Model Setting

In [10]:
# Base learners for the ensemble; hyperparameters are hard-coded here.
xgb = XGBClassifier(
    n_estimators=3690,
    max_depth=11,
    min_child_weight=274,
    gamma=3,
    colsample_bytree=1.0,
    alpha=4.736755571263444,
    subsample=0.6,
)

# The LightGBM model is built from the class imported by name rather than via
# the `lgb` module alias: this assignment rebinds `lgb` to the estimator, so
# `lgb.LGBMClassifier(...)` would raise AttributeError if the cell were run a
# second time. The variable keeps the name `lgb` because later cells use it.
lgb = LGBMClassifier(
    learning_rate=0.00947599327270648,
    max_depth=10,
    min_child_samples=99,
    n_estimators=2939,
    num_leaves=256,
    subsample=0.7989612028795253,
)

In [11]:
# Combine the two boosted models in a single ensemble that uses 'soft' voting.
voting_clf_soft = VotingClassifier(
    voting='soft',
    estimators=[
        ('xgb', xgb),
        ('lgb', lgb),
    ],
)

### Model Train and Inference

In [12]:
# Fit the voting ensemble on the count-encoded training features and the
# Click labels.
voting_clf_soft.fit(X_train_encoded, train_y)

[LightGBM] [Info] Number of positive: 5569860, number of negative: 23035531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.443833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6454
[LightGBM] [Info] Number of data points in the train set: 28605391, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194714 -> initscore=-1.419668
[LightGBM] [Info] Start training from score -1.419668


In [13]:
# Per-class probabilities for every test row. The class order is displayed
# next to the predictions so the meaning of each column can be checked.
pred = voting_clf_soft.predict_proba(X_test_encoded)
display(voting_clf_soft.classes_, pred)

array([0, 1], dtype=int64)

array([[0.72271372, 0.27728626],
       [0.89854399, 0.10145602],
       [0.87845678, 0.12154324],
       ...,
       [0.90160104, 0.09839895],
       [0.77561292, 0.22438706],
       [0.76884208, 0.23115792]])

### Submission

In [14]:
# Load the submission template (ID and Click columns) and display it.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0
1,TEST_0000001,0
2,TEST_0000002,0
3,TEST_0000003,0
4,TEST_0000004,0
...,...,...
4538536,TEST_4538536,0
4538537,TEST_4538537,0
4538538,TEST_4538538,0
4538539,TEST_4538539,0


In [15]:
# Overwrite the placeholder Click values with column 1 of `pred`.
# NOTE(review): this relies on column 1 being the Click == 1 class (classes_
# was displayed as [0, 1]) and on the template rows being in the same order as
# the test rows -- confirm both before submitting.
sample_submission['Click'] = pred[:,1]
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0.277286
1,TEST_0000001,0.101456
2,TEST_0000002,0.121543
3,TEST_0000003,0.428456
4,TEST_0000004,0.397544
...,...,...
4538536,TEST_4538536,0.210091
4538537,TEST_4538537,0.319347
4538538,TEST_4538538,0.098399
4538539,TEST_4538539,0.224387


In [16]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv('baseline_submission.csv', index=False)