In [2]:
# Data Wrangling
import pandas as pd
import numpy as np
import gzip
import gc

# Visualization, EDA
import missingno as msno
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# Korean font
#font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
#rc('font', family=font_name)

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import optuna

# Modeling
from lightgbm import LGBMClassifier
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import *
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import ClassifierMixin

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from keras.callbacks import ModelCheckpoint

# Utility
import re
import os
import time
import datetime
import random
import tensorflow as tf
import pickle
import joblib
import platform
import warnings; warnings.filterwarnings("ignore")
from itertools import combinations
from tqdm import tqdm, tqdm_notebook
from IPython.display import Image
from scipy.stats.mstats import gmean
from tensorflow import keras
from PIL import Image
from tqdm.auto import trange

from sklearn.datasets import make_classification

from sklearn.ensemble import BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import KFold

import shap # v2.0부터 추가
from featurizer import KMeansFeaturizer # v2.0부터 추가
import gzip
import pickle

In [5]:
# Restore the preprocessed objects used by every cell below.
# The archive is opened as a gzip stream (even though its name ends in ".zip")
# and is expected to contain a single pickled 8-tuple.
# NOTE(review): pickle.load can execute arbitrary code - only open files you produced yourself.
with gzip.open('last dance_3.zip', 'rb') as f:
    (
        train, test,
        X_train, y_train,
        X_test, ID_test,
        cat_features, num_features,
    ) = pickle.load(f)

## Model building

### DNN

In [8]:
# Helper that re-seeds every random number generator used in this notebook
def reset_seeds(SEED, reset_graph_with_backend=None):
    """Seed NumPy, Python's `random` and TensorFlow with the same value.

    Parameters
    ----------
    SEED : int
        Value handed to `np.random.seed`, `random.seed` and
        `tf.compat.v1.set_random_seed`, in that order.
    reset_graph_with_backend : object, optional
        When given, its `clear_session()` method is called and the default
        TensorFlow graph is reset before seeding (e.g. pass the Keras backend).
        Left as None, no graph/session state is touched.
    """
    backend = reset_graph_with_backend
    if backend is not None:
        backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    # Same seed for all three generators
    for seed_fn in (np.random.seed, random.seed, tf.compat.v1.set_random_seed):
        seed_fn(SEED)
    # (GPU use could be disabled here via os.environ['CUDA_VISIBLE_DEVICES'] = ''; left off)
    print("RANDOM SEEDS RESET: ", SEED)  # optional

In [9]:
# Folder that collects the per-seed prediction files; created on first run, reused afterwards
folder = 'last dance'
os.makedirs(folder, exist_ok=True)

In [10]:
# Seeds for the multi-seed DNN ensemble: one network is trained and one prediction file is written per seed
seed_values = [96, 513, 661, 933, 1030, 1549, 2525, 3521, 3674, 4484, 4686, 4944, 6678, 6781, 8888, 8947, 9570, 9624, 9639, 9715]

# Train a fresh model for every seed and save its thresholded test predictions
for SEED in seed_values:
    # Drop the Keras global state left behind by the previous iteration's model.
    # Without this, building 20 models in one loop keeps accumulating state/memory.
    keras.backend.clear_session()
    reset_seeds(SEED)

    # Architecture: input -> Dense(32, relu) -> Dense(16, relu) -> Dense(1, sigmoid).
    # Named `inputs`/`outputs` so the built-in `input` is not overwritten in the notebook namespace.
    inputs = keras.Input(shape=(X_train.shape[1],))
    x = keras.layers.Dense(32, activation='relu')(inputs)
    x = keras.layers.Dense(16, activation='relu')(x)
    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs, outputs)

    # Binary cross-entropy loss optimised with Nadam
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Nadam(learning_rate=0.001), metrics=['accuracy'])

    # Training: validation_split=0.2 holds out part of X_train/y_train for early stopping;
    # shuffle=False keeps the batch order fixed across runs.
    # NOTE(review): EarlyStopping is created without restore_best_weights, so the model used for
    # prediction (and the accuracy printed below, taken from the last epoch) is the final-epoch
    # model, not the best one - confirm this is intended.
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]
    hist = model.fit(X_train, y_train, validation_split=0.2, batch_size=4096, epochs=50,
                     callbacks=callbacks, shuffle=False, verbose=0)
    print("validation accuracy = ", hist.history['val_accuracy'][-1])

    # Save the predictions: sigmoid outputs are cut at 0.495 and stored as 0/1 labels,
    # one file per seed (month/day and zero-padded seed in the name)
    submission = pd.DataFrame({
        "ID": ID_test,
        "STATUS": (model.predict(X_test).flatten() >= 0.495).astype(int)
    })
    t = pd.Timestamp.now()
    fname = f"{folder}/loop_submission_{t.month:02}{t.day:02}_{SEED:05}.csv"
    submission.to_csv(fname, index=False)

RANDOM SEEDS RESET:  96
validation accuracy =  0.9029005169868469
RANDOM SEEDS RESET:  513
validation accuracy =  0.9018681049346924
RANDOM SEEDS RESET:  661


In [None]:
# Collect every two-column CSV stored in `folder`; anything else in the directory is ignored
frames = []
for entry in os.listdir(folder):
    if os.path.splitext(entry)[-1] != '.csv':
        continue
    candidate = pd.read_csv(folder+"/"+entry)
    if len(candidate.columns) == 2:
        frames.append(candidate)
nf = len(frames)

# Line the files up on ID; the k-th extra file gets the column suffix "_k"
for k, frame in enumerate(frames):
    slist = frame if k == 0 else pd.merge(slist, frame, on="ID", suffixes=('', f'_{k}'))

# Average the prediction columns and turn the mean into a 0/1 label (mean >= 0.5 -> 1).
# Nothing is written unless at least two files were found.
if nf >= 2:
    # Column 0 is taken to be ID; columns 1..nf hold one prediction per file
    pred = sum(slist.iloc[:, k] for k in range(1, nf + 1)) / nf
    pred = (pred >= 0.5).astype(int)
    submission = pd.DataFrame({'ID': slist['ID'], 'STATUS': pred})
    t = pd.Timestamp.now()
    fname = f"dnn_submission_{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)

### LGBM OOF

In [6]:
# LightGBM evaluated with shuffled K-fold cross-validation; the test-set probabilities of the
# per-fold models are averaged into `lgbm_pred`
kf = KFold(n_splits=5, shuffle=True, random_state=0)
model = LGBMClassifier(random_state=0, n_estimators=1000)

lgbm_pred = np.zeros((X_test.shape[0]))
accuracy_list = []
threshold=0.495 # decision threshold for turning probabilities into 0/1 labels

# Read the fold count from the splitter so the average below can never disagree with n_splits
n_folds = kf.get_n_splits()

# Cross-validation loop (total= lets tqdm draw a real progress bar for the generator)
for tr_idx, val_idx in tqdm(kf.split(X_train, y_train), total=n_folds):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx] # training part of this fold
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx] # held-out part of this fold

    model.fit(tr_x, tr_y) # the same estimator object is refitted on every fold
    pred_probs = model.predict_proba(val_x)[:, 1]  # positive-class probability on the held-out part

    # Binarise the held-out probabilities with the threshold
    pred = (pred_probs >= threshold).astype(int)

    accuracy = accuracy_score(val_y, pred) # fold accuracy
    accuracy_list.append(accuracy)
    print(f'Fold Accuracy: {accuracy:.4f}')

    # Each fold contributes 1/n_folds of its test probabilities, so the sum is the fold average
    sub_pred = model.predict_proba(X_test)[:, 1] / n_folds
    lgbm_pred += sub_pred

# Final test labels: averaged probability cut at the same threshold
lgbm_pred_binary = (lgbm_pred >= threshold).astype(int)

# The "5-fold" label in this message is literal text; update it if n_splits changes
print(f'{model.__class__.__name__}의 5-fold 평균 Accuracy는 {np.mean(accuracy_list)}')

0it [00:00, ?it/s]

[LightGBM] [Info] Number of positive: 253313, number of negative: 397607
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.039815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2269
[LightGBM] [Info] Number of data points in the train set: 650920, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389161 -> initscore=-0.450838
[LightGBM] [Info] Start training from score -0.450838
Fold Accuracy: 0.8669


1it [00:24, 24.54s/it]

[LightGBM] [Info] Number of positive: 253014, number of negative: 397906
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2278
[LightGBM] [Info] Number of data points in the train set: 650920, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388702 -> initscore=-0.452771
[LightGBM] [Info] Start training from score -0.452771
Fold Accuracy: 0.8678


2it [00:52, 26.62s/it]

[LightGBM] [Info] Number of positive: 253311, number of negative: 397609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2279
[LightGBM] [Info] Number of data points in the train set: 650920, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389158 -> initscore=-0.450851
[LightGBM] [Info] Start training from score -0.450851
Fold Accuracy: 0.8688


3it [01:20, 27.31s/it]

[LightGBM] [Info] Number of positive: 253289, number of negative: 397631
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2270
[LightGBM] [Info] Number of data points in the train set: 650920, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389125 -> initscore=-0.450993
[LightGBM] [Info] Start training from score -0.450993
Fold Accuracy: 0.8672


4it [01:49, 27.74s/it]

[LightGBM] [Info] Number of positive: 253333, number of negative: 397587
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2269
[LightGBM] [Info] Number of data points in the train set: 650920, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389192 -> initscore=-0.450709
[LightGBM] [Info] Start training from score -0.450709
Fold Accuracy: 0.8681


5it [02:17, 27.54s/it]

LGBMClassifier의 5-fold 평균 Accuracy는 0.867762551465618





In [None]:
# Write the thresholded LightGBM labels to a submission CSV.
# The file name carries month, day, hour and minute of the moment the cell runs.
t = pd.Timestamp.now()
fname = f"lgbm_submission_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv"
pd.DataFrame(
    data=dict(ID=ID_test, STATUS=lgbm_pred_binary),
).to_csv(fname, index=False)
print(f"'{fname}' is ready to submit.")

### CAT OOF

In [7]:
# CatBoost evaluated with shuffled K-fold cross-validation; the test-set probabilities of the
# per-fold models are averaged into `cat_pred`
# NOTE(review): the captured output shows CatBoost logging every iteration; consider passing a
# `verbose` setting to the classifier if the log volume is unwanted.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
model = CatBoostClassifier(random_state=0, n_estimators=1000)

cat_pred = np.zeros((X_test.shape[0]))
accuracy_list = []
threshold=0.495 # decision threshold for turning probabilities into 0/1 labels

# Read the fold count from the splitter so the average below can never disagree with n_splits
n_folds = kf.get_n_splits()

# Cross-validation loop (total= lets tqdm draw a real progress bar for the generator)
for tr_idx, val_idx in tqdm(kf.split(X_train, y_train), total=n_folds):
    tr_x, tr_y = X_train.iloc[tr_idx], y_train.iloc[tr_idx] # training part of this fold
    val_x, val_y = X_train.iloc[val_idx], y_train.iloc[val_idx] # held-out part of this fold

    model.fit(tr_x, tr_y) # the same estimator object is refitted on every fold
    pred_probs = model.predict_proba(val_x)[:, 1] # positive-class probability on the held-out part

    # Binarise the held-out probabilities with the threshold
    pred = (pred_probs >= threshold).astype(int)

    accuracy = accuracy_score(val_y, pred) # fold accuracy
    accuracy_list.append(accuracy)
    print(f'Fold Accuracy: {accuracy}')

    # Each fold contributes 1/n_folds of its test probabilities, so the sum is the fold average
    sub_pred = model.predict_proba(X_test)[:, 1] / n_folds
    cat_pred += sub_pred

# Final test labels: averaged probability cut at the same threshold
cat_pred_binary = (cat_pred >= threshold).astype(int)

# The "5-fold" label in this message is literal text; update it if n_splits changes
print(f'{model.__class__.__name__}의 5-fold 평균 Accuracy는 {np.mean(accuracy_list)}')

0it [00:00, ?it/s]

Learning rate set to 0.163799
0:	learn: 0.4981496	total: 319ms	remaining: 5m 18s
1:	learn: 0.4443432	total: 387ms	remaining: 3m 13s
2:	learn: 0.4018206	total: 464ms	remaining: 2m 34s
3:	learn: 0.3802343	total: 549ms	remaining: 2m 16s
4:	learn: 0.3647510	total: 627ms	remaining: 2m 4s
5:	learn: 0.3535435	total: 712ms	remaining: 1m 58s
6:	learn: 0.3456894	total: 790ms	remaining: 1m 52s
7:	learn: 0.3396548	total: 858ms	remaining: 1m 46s
8:	learn: 0.3351058	total: 922ms	remaining: 1m 41s
9:	learn: 0.3316733	total: 979ms	remaining: 1m 36s
10:	learn: 0.3290235	total: 1.04s	remaining: 1m 33s
11:	learn: 0.3268102	total: 1.11s	remaining: 1m 31s
12:	learn: 0.3247463	total: 1.17s	remaining: 1m 29s
13:	learn: 0.3178877	total: 1.28s	remaining: 1m 29s
14:	learn: 0.3164242	total: 1.36s	remaining: 1m 29s
15:	learn: 0.3152106	total: 1.42s	remaining: 1m 27s
16:	learn: 0.3142075	total: 1.5s	remaining: 1m 26s
17:	learn: 0.3135305	total: 1.58s	remaining: 1m 26s
18:	learn: 0.3127257	total: 1.66s	remaining: 1

1it [01:24, 84.02s/it]

Learning rate set to 0.163799
0:	learn: 0.4980154	total: 85.1ms	remaining: 1m 24s
1:	learn: 0.4444064	total: 169ms	remaining: 1m 24s
2:	learn: 0.4074533	total: 269ms	remaining: 1m 29s
3:	learn: 0.3844359	total: 365ms	remaining: 1m 30s
4:	learn: 0.3669552	total: 457ms	remaining: 1m 30s
5:	learn: 0.3564175	total: 549ms	remaining: 1m 30s
6:	learn: 0.3464021	total: 635ms	remaining: 1m 30s
7:	learn: 0.3408054	total: 724ms	remaining: 1m 29s
8:	learn: 0.3364118	total: 818ms	remaining: 1m 30s
9:	learn: 0.3330595	total: 902ms	remaining: 1m 29s
10:	learn: 0.3303541	total: 979ms	remaining: 1m 28s
11:	learn: 0.3281804	total: 1.06s	remaining: 1m 27s
12:	learn: 0.3203455	total: 1.15s	remaining: 1m 27s
13:	learn: 0.3186716	total: 1.22s	remaining: 1m 26s
14:	learn: 0.3171196	total: 1.3s	remaining: 1m 25s
15:	learn: 0.3159741	total: 1.38s	remaining: 1m 24s
16:	learn: 0.3150410	total: 1.46s	remaining: 1m 24s
17:	learn: 0.3141762	total: 1.53s	remaining: 1m 23s
18:	learn: 0.3129430	total: 1.6s	remaining: 

2it [02:44, 82.01s/it]

Learning rate set to 0.163799
0:	learn: 0.4986967	total: 73.3ms	remaining: 1m 13s
1:	learn: 0.4452710	total: 135ms	remaining: 1m 7s
2:	learn: 0.4085549	total: 203ms	remaining: 1m 7s
3:	learn: 0.3853726	total: 315ms	remaining: 1m 18s
4:	learn: 0.3692246	total: 398ms	remaining: 1m 19s
5:	learn: 0.3571116	total: 480ms	remaining: 1m 19s
6:	learn: 0.3489005	total: 559ms	remaining: 1m 19s
7:	learn: 0.3429088	total: 631ms	remaining: 1m 18s
8:	learn: 0.3383248	total: 703ms	remaining: 1m 17s
9:	learn: 0.3345725	total: 779ms	remaining: 1m 17s
10:	learn: 0.3250155	total: 866ms	remaining: 1m 17s
11:	learn: 0.3227059	total: 948ms	remaining: 1m 18s
12:	learn: 0.3207725	total: 1.02s	remaining: 1m 17s
13:	learn: 0.3193200	total: 1.08s	remaining: 1m 16s
14:	learn: 0.3182537	total: 1.16s	remaining: 1m 16s
15:	learn: 0.3172072	total: 1.23s	remaining: 1m 15s
16:	learn: 0.3160782	total: 1.31s	remaining: 1m 16s
17:	learn: 0.3153802	total: 1.4s	remaining: 1m 16s
18:	learn: 0.3144551	total: 1.48s	remaining: 1

3it [04:18, 87.30s/it]

Learning rate set to 0.163799
0:	learn: 0.4962408	total: 284ms	remaining: 4m 43s
1:	learn: 0.4432636	total: 799ms	remaining: 6m 38s
2:	learn: 0.4021690	total: 1.31s	remaining: 7m 16s
3:	learn: 0.3809346	total: 1.5s	remaining: 6m 13s
4:	learn: 0.3641822	total: 1.69s	remaining: 5m 35s
5:	learn: 0.3454660	total: 1.96s	remaining: 5m 24s
6:	learn: 0.3384635	total: 2.1s	remaining: 4m 58s
7:	learn: 0.3326310	total: 2.21s	remaining: 4m 33s
8:	learn: 0.3286752	total: 2.3s	remaining: 4m 13s
9:	learn: 0.3254739	total: 2.4s	remaining: 3m 57s
10:	learn: 0.3230631	total: 2.53s	remaining: 3m 47s
11:	learn: 0.3208823	total: 2.82s	remaining: 3m 51s
12:	learn: 0.3190311	total: 3.22s	remaining: 4m 4s
13:	learn: 0.3175497	total: 3.4s	remaining: 3m 59s
14:	learn: 0.3166911	total: 3.83s	remaining: 4m 11s
15:	learn: 0.3157475	total: 3.98s	remaining: 4m 4s
16:	learn: 0.3149188	total: 4.05s	remaining: 3m 54s
17:	learn: 0.3141301	total: 4.15s	remaining: 3m 46s
18:	learn: 0.3130664	total: 4.23s	remaining: 3m 38s

4it [06:03, 94.45s/it]

Learning rate set to 0.163799
0:	learn: 0.4982535	total: 117ms	remaining: 1m 57s
1:	learn: 0.4445792	total: 234ms	remaining: 1m 56s
2:	learn: 0.4084401	total: 348ms	remaining: 1m 55s
3:	learn: 0.3848795	total: 447ms	remaining: 1m 51s
4:	learn: 0.3697740	total: 556ms	remaining: 1m 50s
5:	learn: 0.3575271	total: 668ms	remaining: 1m 50s
6:	learn: 0.3489411	total: 764ms	remaining: 1m 48s
7:	learn: 0.3430559	total: 855ms	remaining: 1m 45s
8:	learn: 0.3376715	total: 945ms	remaining: 1m 44s
9:	learn: 0.3268455	total: 1.04s	remaining: 1m 43s
10:	learn: 0.3242183	total: 1.15s	remaining: 1m 43s
11:	learn: 0.3217008	total: 1.24s	remaining: 1m 41s
12:	learn: 0.3203577	total: 1.33s	remaining: 1m 41s
13:	learn: 0.3189288	total: 1.43s	remaining: 1m 40s
14:	learn: 0.3156070	total: 1.54s	remaining: 1m 41s
15:	learn: 0.3142463	total: 1.65s	remaining: 1m 41s
16:	learn: 0.3131711	total: 1.74s	remaining: 1m 40s
17:	learn: 0.3122817	total: 1.83s	remaining: 1m 40s
18:	learn: 0.3113794	total: 1.94s	remaining:

5it [07:56, 95.24s/it] 

CatBoostClassifier의 5-fold 평균 Accuracy는 0.8678006513857308





In [None]:
# Write the thresholded CatBoost labels to a submission CSV.
# The file name carries month, day, hour and minute of the moment the cell runs.
t = pd.Timestamp.now()
fname = f"cat_submission_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv"
pd.DataFrame(
    data=dict(ID=ID_test, STATUS=cat_pred_binary),
).to_csv(fname, index=False)
print(f"'{fname}' is ready to submit.")

### DNN + LGBM + CAT Ensemble

In [None]:
import os
import pandas as pd

# Runs under the assumption that the folder below already exists.
# The lgbm, cat and DNN submission files have to be copied into it beforehand.
folder = 'submissions' # directory holding the prediction files to blend
nf = 0

# Read every two-column CSV and line the predictions up on ID.
# sorted() only makes the column order reproducible; it does not change which files are used.
for f in sorted(os.listdir(folder)):
    ext = os.path.splitext(f)[-1]
    if ext != '.csv':
        continue
    s = pd.read_csv(os.path.join(folder, f))
    if len(s.columns) != 2:
        continue
    if nf == 0:
        slist = s
    else:
        # Give each additional file its own suffix. With pandas' default ('_x', '_y') suffixes
        # the 4th file would recreate an existing 'STATUS_x' column, which newer pandas rejects
        # with a MergeError and older versions silently turn into duplicate column names.
        slist = pd.merge(slist, s, on="ID", suffixes=('', f'_{nf}'))
    nf += 1

p = 1    # the result depends on this value (p=1: arithmetic mean, p>1: power mean)
if nf >= 2:
    # Power mean over the prediction columns; column 0 is taken to be ID, columns 1..nf the predictions
    pred = 0
    for j in range(nf):
        pred = pred + slist.iloc[:, j + 1] ** p
    pred = pred / nf
    pred = pred ** (1/p)

    # Split into 0 and 1 at the threshold
    threshold = 0.495
    submission = pd.DataFrame({'ID': slist.ID, 'STATUS': (pred >= threshold).astype(int)})

    t = pd.Timestamp.now()
    fname = f"p{p}mean_submission_{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)
else:
    # Previously this case produced no file and no message at all
    print(f"Need at least two prediction files in '{folder}' (found {nf}); nothing was written.")