In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
random_seed = 41

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed) 
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    os.environ["PYTHONHASHSEED"] = str(seed)
    
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

seed_everything(seed=random_seed) # Seed 고정

In [4]:
train = pd.read_csv("./data/df_train.csv")
test = pd.read_csv("./data/df_test.csv")
train.shape, test.shape

((262, 48), (175, 47))

In [5]:
ae = pd.read_csv("./data/ae_values.csv")
ae

Unnamed: 0,encodings_0,encodings_1,encodings_2,encodings_3,encodings_4,encodings_5,encodings_6,encodings_7,encodings_8,encodings_9,...,encodings_18,encodings_19,encodings_20,encodings_21,encodings_22,encodings_23,encodings_24,encodings_25,encodings_26,errors
0,-3.235218,4.172423,-3.853053,-1.533935,2.644730,3.716698,-0.299315,-2.650086,0.783824,-0.406989,...,1.082240,-1.530903,-1.215420,-3.295104,0.245303,-0.055747,-1.089265,-1.649269,0.001300,0.008067
1,-3.164804,6.937546,-0.615424,0.798070,-2.365728,-1.246152,0.571691,0.340047,-0.374678,2.935323,...,-0.065099,-0.072708,-2.343938,0.866952,-0.382145,-2.748686,0.459386,0.983944,3.129371,0.009998
2,-3.450719,6.478036,-4.994792,-1.083578,-1.050726,-0.028033,-1.456105,-1.262530,-1.064003,2.599940,...,-3.403145,-1.100375,-1.358432,-1.019482,3.274151,3.080497,2.445034,1.530065,0.137097,0.040302
3,-5.078038,10.407413,-5.961476,-0.954541,-2.045539,0.913142,3.408923,-2.241139,1.254574,1.749235,...,-0.824590,1.962076,-0.140499,-1.557903,3.796697,3.281870,3.378306,1.271765,-2.542374,0.008206
4,-4.685338,6.908415,-5.594373,1.017355,-2.475780,1.170744,-1.731986,2.448224,-0.242145,-0.236407,...,-2.392161,-1.289728,-2.297753,2.706107,0.366826,1.816569,1.143190,4.631185,-0.518664,0.036693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,-6.713977,4.797306,-6.859674,-0.427648,-1.465041,0.584409,-1.570887,0.707021,0.202451,-1.051615,...,0.948983,-1.454844,3.104134,-1.628868,1.587918,-1.379632,-0.742597,3.711644,0.533880,0.028781
433,-3.505607,6.129325,-2.930341,-0.359151,1.557368,0.367298,1.640623,2.402381,-2.167556,0.352788,...,-3.493095,0.081309,-3.520641,-0.392295,2.330890,1.521016,0.899050,4.925332,0.413980,0.015013
434,-1.853786,3.641998,0.470991,-1.175941,3.441618,1.693989,2.112146,1.143281,-0.975709,-2.530986,...,-1.952293,1.092013,-1.333903,1.238227,1.457066,0.588651,-1.395687,1.455708,-0.818322,0.050879
435,-3.249345,8.500051,-4.339647,-2.227715,-1.511902,0.807777,-0.969590,-1.554576,0.884828,-0.540825,...,-0.068143,-3.087835,0.714141,0.378201,2.740462,-0.029870,-0.452864,-0.030688,2.088013,0.007352


In [6]:
train2 = pd.concat([train, ae[:len(train)]], axis=1)
test2 = pd.concat([test, ae[len(train):]], axis=1)

train2.shape, test2.shape

((262, 76), (350, 75))

In [17]:
y = torch.LongTensor(train2['class'].values)
X = train2.drop(['id', 'class'], axis=1).to_numpy()
X_test = test2.drop(['id'], axis=1).to_numpy()
y

tensor([1, 2, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 0, 0, 0, 0, 2, 1, 1, 0, 1,
        2, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 2, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0,
        2, 0, 0, 2, 0, 2, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 1, 2, 2, 2,
        0, 1, 1, 1, 2, 1, 2, 0, 1, 1, 2, 0, 2, 0, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1,
        2, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2,
        1, 1, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0, 1, 2, 0, 1, 2, 0, 2, 1, 1, 2, 2, 1,
        1, 1, 2, 2, 0, 1, 0, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 0,
        0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 1, 1, 2, 1, 2, 2, 1,
        1, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 0, 1, 2,
        0, 0, 2, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 0, 2, 0,
        0, 2, 2, 1, 1, 1, 0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1, 1, 2, 0, 0, 1])

In [100]:
from xgboost import XGBClassifier, XGBRegressor
from imblearn.over_sampling import SMOTE

xgb_params = {
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'max_depth': 4,
    'learning_rate': 0.4,
    'n_estimators': 30,
    'reg_lambda': 100,
    'subsample': 0.9,
    'num_parallel_tree': 1,
    # 'rate_drop': 0.3
}

In [101]:
def calc_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.argmax(y_hat, axis=1)
    return 'f1', f1_score(y_true, y_hat, average="macro")

In [102]:
random_seed = 5833

y = train['class'].values
X = train.drop(['id', 'class'], axis=1).to_numpy()
X_test = test.drop(['id'], axis=1).to_numpy()

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=random_seed)

oof_val_preds = np.zeros((X.shape[0], 3))
oof_test_preds = np.zeros((X_test.shape[0], 3))

# OOF
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    # print('#'*30, f'Fold [{fold+1}/{skf.n_splits}]', '#'*30)

    # train, valid data 설정
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]
    
    smote = SMOTE(random_state=random_seed)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # 불균형 데이터 가중치 조정 값 => 음성(0) 타깃값 개수 / 양성(1) 타깃값 개수
    _, counts = np.unique(np.array(y_train), return_counts=True)
    scale_weight = counts[0] / counts[1]

    # XGBoost 모델 훈련
    xgb_model = XGBClassifier(
        **xgb_params,
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        random_state=random_seed,
        n_jobs=-1
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=calc_f1_score, verbose=True)

    oof_test_preds += xgb_model.predict_proba(X_test) / skf.n_splits
    oof_val_preds[valid_idx] += xgb_model.predict_proba(X_valid)
    
    # if fold == 1 :
    #     pred = xgb_model.predict(X_test)
    #     break
    
    #model save
    # xgb_model.save_model(f'./models/new_xgb_{skf.n_splits}_{fold}.json')
    del [[X_train, y_train, X_valid, y_valid, xgb_model]]
    gc.collect()

#     model score check
preds = np.argmax(oof_val_preds, axis=1)
print(f1_score(y, preds, average="macro"))

# # save OOF test preds
# np.save(f'./results/new_{skf.n_splits}_oof_test_preds.npy', oof_test_preds[:, 1])

[0]	validation_0-mlogloss:1.00455	validation_0-f1:0.93540
[1]	validation_0-mlogloss:0.92590	validation_0-f1:0.94282
[2]	validation_0-mlogloss:0.85614	validation_0-f1:0.94282
[3]	validation_0-mlogloss:0.79948	validation_0-f1:0.93683
[4]	validation_0-mlogloss:0.74157	validation_0-f1:0.92959
[5]	validation_0-mlogloss:0.69207	validation_0-f1:0.95017
[6]	validation_0-mlogloss:0.64906	validation_0-f1:0.94282
[7]	validation_0-mlogloss:0.60993	validation_0-f1:0.94282
[8]	validation_0-mlogloss:0.57561	validation_0-f1:0.95712
[9]	validation_0-mlogloss:0.54718	validation_0-f1:0.95017
[10]	validation_0-mlogloss:0.51964	validation_0-f1:0.95017
[11]	validation_0-mlogloss:0.49552	validation_0-f1:0.95017
[12]	validation_0-mlogloss:0.47675	validation_0-f1:0.95712
[13]	validation_0-mlogloss:0.45720	validation_0-f1:0.96411
[14]	validation_0-mlogloss:0.44009	validation_0-f1:0.96411
[15]	validation_0-mlogloss:0.42392	validation_0-f1:0.97141
[16]	validation_0-mlogloss:0.40854	validation_0-f1:0.97141
[17]	va

In [103]:
submit_pred = np.argmax(oof_test_preds, axis=1)
submit_file = pd.read_csv("./data/sample_submission.csv")
submit_file['class'] = submit_pred
submit_file['class'] = submit_file['class'].map(lambda x : "A" if x==0 else ( "B" if x==1 else "C"))
submit_file

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [104]:
submit_file.to_csv("submit.csv", index=False)

In [60]:
for i in range(1, 100) :
    k = i+1
    random_seed = 2005

    y = train['class'].values
    X = train.drop(['id', 'class'], axis=1).to_numpy()
    X_test = test.drop(['id'], axis=1).to_numpy()

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)

    oof_val_preds = np.zeros((X.shape[0], 3))
    oof_test_preds = np.zeros((X_test.shape[0], 3))

    # OOF
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

        # print('#'*30, f'Fold [{fold+1}/{skf.n_splits}]', '#'*30)

        # train, valid data 설정
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        smote = SMOTE(random_state=random_seed)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # 불균형 데이터 가중치 조정 값 => 음성(0) 타깃값 개수 / 양성(1) 타깃값 개수
        _, counts = np.unique(np.array(y_train), return_counts=True)
        scale_weight = counts[0] / counts[1]

        # XGBoost 모델 훈련
        xgb_model = XGBClassifier(
            **xgb_params,
            tree_method='gpu_hist',
            predictor='gpu_predictor',
            random_state=random_seed,
            n_jobs=-1
        )
        xgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=calc_f1_score, verbose=False)

        oof_test_preds += xgb_model.predict_proba(X_test) / skf.n_splits
        oof_val_preds[valid_idx] += xgb_model.predict_proba(X_valid)

        # if fold == 1 :
        #     pred = xgb_model.predict(X_test)
        #     break

        #model save
        # xgb_model.save_model(f'./models/new_xgb_{skf.n_splits}_{fold}.json')
        del [[X_train, y_train, X_valid, y_valid, xgb_model]]
        gc.collect()

    #     model score check
    preds = np.argmax(oof_val_preds, axis=1)
    print(f'Fold : {k}, {f1_score(y, preds, average="macro")}')

    # # save OOF test preds
    # np.save(f'./results/new_{skf.n_splits}_oof_test_preds.npy', oof_test_preds[:, 1])

Fold : 2, 0.912768361581921
Fold : 3, 0.9238551500634071
Fold : 4, 0.9423794596208389
Fold : 5, 0.9418455743879472
Fold : 6, 0.9460969138388493
Fold : 7, 0.9567845947156292
Fold : 8, 0.9386425051661104
Fold : 9, 0.9421202579097315
Fold : 10, 0.9314239763621233
Fold : 11, 0.9530795627740843
Fold : 12, 0.9314239763621233
Fold : 13, 0.9495820271682341
Fold : 14, 0.9495820271682341
Fold : 15, 0.9532839919936694
Fold : 16, 0.9567845947156292
Fold : 17, 0.9460969138388493
Fold : 18, 0.9497956150130062
Fold : 19, 0.9460969138388493
Fold : 20, 0.9567845947156292
Fold : 21, 0.9532839919936694
Fold : 22, 0.9532839919936694
Fold : 23, 0.9497956150130062
Fold : 24, 0.9639871622630243
Fold : 25, 0.9567845947156292
Fold : 26, 0.9497956150130062
Fold : 27, 0.946318805106667
Fold : 28, 0.9604710701484894
Fold : 29, 0.9497956150130062
Fold : 30, 0.9534762977591115
Fold : 31, 0.9569676700111481
Fold : 32, 0.9497956150130062
Fold : 33, 0.9497956150130062
Fold : 34, 0.9534762977591115
Fold : 35, 0.9532839


KeyboardInterrupt



In [62]:
np.random.randint(10000)

1984

In [67]:
seeds = []

for _ in range(100) :
    while True :
        random_seed = np.random.randint(10000)
        if random_seed not in seeds :
            seeds.append(random_seed)
            break
        else :
            continue

    y = train['class'].values
    X = train.drop(['id', 'class'], axis=1).to_numpy()
    X_test = test.drop(['id'], axis=1).to_numpy()

    skf = StratifiedKFold(n_splits=24, shuffle=True, random_state=random_seed)

    oof_val_preds = np.zeros((X.shape[0], 3))
    oof_test_preds = np.zeros((X_test.shape[0], 3))

    # OOF
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

        # print('#'*30, f'Fold [{fold+1}/{skf.n_splits}]', '#'*30)

        # train, valid data 설정
        X_train, y_train = X[train_idx], y[train_idx]
        X_valid, y_valid = X[valid_idx], y[valid_idx]

        smote = SMOTE(random_state=random_seed)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        # 불균형 데이터 가중치 조정 값 => 음성(0) 타깃값 개수 / 양성(1) 타깃값 개수
        _, counts = np.unique(np.array(y_train), return_counts=True)
        scale_weight = counts[0] / counts[1]

        # XGBoost 모델 훈련
        xgb_model = XGBClassifier(
            **xgb_params,
            tree_method='gpu_hist',
            predictor='gpu_predictor',
            random_state=random_seed,
            n_jobs=-1
        )
        xgb_model.fit(X_train, y_train, verbose=False)

        oof_test_preds += xgb_model.predict_proba(X_test) / skf.n_splits
        oof_val_preds[valid_idx] += xgb_model.predict_proba(X_valid)

        # if fold == 1 :
        #     pred = xgb_model.predict(X_test)
        #     break

        #model save
        # xgb_model.save_model(f'./models/new_xgb_{skf.n_splits}_{fold}.json')
        del [[X_train, y_train, X_valid, y_valid, xgb_model]]
        gc.collect()

    #     model score check
    preds = np.argmax(oof_val_preds, axis=1)
    print(f"seed : {random_seed},", f1_score(y, preds, average="macro"))

    # # save OOF test preds
    # np.save(f'./results/new_{skf.n_splits}_oof_test_preds.npy', oof_test_preds[:, 1])

seed : 6147, 0.9499962987637871
seed : 6141, 0.9426235600148644
seed : 3315, 0.9532839919936694
seed : 9973, 0.9534762977591115
seed : 121, 0.9534762977591115
seed : 9421, 0.9389098356840293
seed : 6730, 0.9497956150130062
seed : 542, 0.9497956150130062
seed : 6204, 0.9495820271682341
seed : 5579, 0.9458610339700974
seed : 4531, 0.9426235600148644
seed : 2456, 0.9423794596208389
seed : 8807, 0.9497956150130062
seed : 3419, 0.9495820271682341
seed : 6361, 0.946318805106667
seed : 4341, 0.9534762977591115
seed : 7336, 0.9426235600148644
seed : 4439, 0.9426235600148644
seed : 7261, 0.9495820271682341
seed : 8254, 0.9567845947156292


KeyboardInterrupt: 

In [71]:
high = 0
for k in range(2, 21) :
    seeds = []
    for _ in range(100) :
        while True :
            random_seed = np.random.randint(10000)
            if random_seed not in seeds :
                seeds.append(random_seed)
                break
            else :
                continue

        y = train['class'].values
        X = train.drop(['id', 'class'], axis=1).to_numpy()
        X_test = test.drop(['id'], axis=1).to_numpy()

        skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=random_seed)

        oof_val_preds = np.zeros((X.shape[0], 3))
        oof_test_preds = np.zeros((X_test.shape[0], 3))

        # OOF
        for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

            # print('#'*30, f'Fold [{fold+1}/{skf.n_splits}]', '#'*30)

            # train, valid data 설정
            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]

            smote = SMOTE(random_state=random_seed)
            X_train, y_train = smote.fit_resample(X_train, y_train)

            # 불균형 데이터 가중치 조정 값 => 음성(0) 타깃값 개수 / 양성(1) 타깃값 개수
            _, counts = np.unique(np.array(y_train), return_counts=True)
            scale_weight = counts[0] / counts[1]

            # XGBoost 모델 훈련
            xgb_model = XGBClassifier(
                **xgb_params,
                tree_method='gpu_hist',
                predictor='gpu_predictor',
                random_state=random_seed,
                n_jobs=-1
            )
            xgb_model.fit(X_train, y_train, verbose=False)

            oof_test_preds += xgb_model.predict_proba(X_test) / skf.n_splits
            oof_val_preds[valid_idx] += xgb_model.predict_proba(X_valid)

            # if fold == 1 :
            #     pred = xgb_model.predict(X_test)
            #     break

            #model save
            # xgb_model.save_model(f'./models/new_xgb_{skf.n_splits}_{fold}.json')
            del [[X_train, y_train, X_valid, y_valid, xgb_model]]
            gc.collect()

        #     model score check
        preds = np.argmax(oof_val_preds, axis=1)
        score =  f1_score(y, preds, average="macro")
        if score > high :
            high = score
            text = f'k: {k}, seed : {random_seed}, {score}'
            print(text)

print(text)

k: 2, seed : 724, 0.9139353400222966
k: 2, seed : 7566, 0.9204153546258809
k: 2, seed : 9630, 0.9238551500634071
k: 2, seed : 9896, 0.9354515050167224
k: 2, seed : 3823, 0.9386425051661104
k: 2, seed : 3670, 0.9423794596208389
k: 2, seed : 4040, 0.9460969138388493
k: 2, seed : 2039, 0.9495820271682341
k: 2, seed : 3197, 0.952862711944014
k: 2, seed : 9072, 0.9534762977591115
k: 3, seed : 1177, 0.9567845947156292
k: 8, seed : 419, 0.9641397250092902
k: 14, seed : 5833, 0.9713117800074321
k: 14, seed : 5833, 0.9713117800074321


In [72]:
high = 0
for k in tqdm(range(2, 21)) :
    seeds = []
    for _ in range(1000) :
        while True :
            random_seed = np.random.randint(10000)
            if random_seed not in seeds :
                seeds.append(random_seed)
                break
            else :
                continue

        y = train['class'].values
        X = train.drop(['id', 'class'], axis=1).to_numpy()
        X_test = test.drop(['id'], axis=1).to_numpy()

        skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=random_seed)

        oof_val_preds = np.zeros((X.shape[0], 3))
        oof_test_preds = np.zeros((X_test.shape[0], 3))

        # OOF
        for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

            # print('#'*30, f'Fold [{fold+1}/{skf.n_splits}]', '#'*30)

            # train, valid data 설정
            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]

            smote = SMOTE(random_state=random_seed)
            X_train, y_train = smote.fit_resample(X_train, y_train)

            # 불균형 데이터 가중치 조정 값 => 음성(0) 타깃값 개수 / 양성(1) 타깃값 개수
            _, counts = np.unique(np.array(y_train), return_counts=True)
            scale_weight = counts[0] / counts[1]

            # XGBoost 모델 훈련
            xgb_model = XGBClassifier(
                **xgb_params,
                tree_method='gpu_hist',
                predictor='gpu_predictor',
                random_state=random_seed,
                n_jobs=-1
            )
            xgb_model.fit(X_train, y_train, verbose=False)

            oof_test_preds += xgb_model.predict_proba(X_test) / skf.n_splits
            oof_val_preds[valid_idx] += xgb_model.predict_proba(X_valid)

            # if fold == 1 :
            #     pred = xgb_model.predict(X_test)
            #     break

            #model save
            # xgb_model.save_model(f'./models/new_xgb_{skf.n_splits}_{fold}.json')
            del [[X_train, y_train, X_valid, y_valid, xgb_model]]
            gc.collect()

        #     model score check
        preds = np.argmax(oof_val_preds, axis=1)
        score =  f1_score(y, preds, average="macro")
        if score > high :
            high = score
            text = f'k: {k}, seed : {random_seed}, {score}'
            print(text)

print("="*80)
print()
print(text)
print()
print("="*80)

k: 2, seed : 7999, 0.9453459066066321
k: 2, seed : 6982, 0.9456108214738622
k: 2, seed : 2524, 0.946318805106667
k: 2, seed : 4636, 0.9497956150130062
k: 2, seed : 6274, 0.9532839919936694
k: 2, seed : 3866, 0.9569676700111481
k: 2, seed : 121, 0.9604710701484894
k: 2, seed : 3049, 0.9607864867610525
k: 3, seed : 8701, 0.9676581483033097


KeyboardInterrupt: 