## 임포트

In [1]:
!pip install -U teddynote

from teddynote import models
# Data Wrangling
import pandas as pd
import numpy as np

#Utility
import random
import os

# Preprocessing & Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# Optuna
import optuna
from optuna.samplers import TPESampler
from optuna import Trial

# Modeling
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier, Pool, cv

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

pd.set_option('mode.chained_assignment',  None)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CFG:
    """Notebook-wide configuration constants."""
    # Random seed handed to seed_everything() and to the hyperparameter search.
    SEED = 42

In [3]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed value.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so this assignment is
    # only picked up by Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
seed_everything(CFG.SEED) # Fix the random seed for the whole notebook

## 데이터 불러오기

In [4]:
# Read the training and test tables; paths are relative to the notebook's
# working directory.
train = pd.read_csv('open/train.csv')
test = pd.read_csv('open/test.csv')

In [5]:
def get_x_y(df):
    """Split a raw frame into model features and, when present, the target.

    The identifier column and the ``father``/``mother``/``gender`` columns are
    removed from the features in both cases, so that a labelled (training)
    frame and an unlabelled (test) frame end up with the same feature columns.

    Args:
        df: DataFrame with an ``id`` column; a ``class`` column marks it as
            labelled.

    Returns:
        ``(features, target)`` when ``df`` has a ``class`` column, otherwise
        just ``features``.

    Raises:
        KeyError: if ``id`` is missing, or if a labelled frame lacks any of
            ``father``/``mother``/``gender``.
    """
    # Columns excluded from the model inputs for every frame.
    excluded = ['father', 'mother', 'gender']
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'] + excluded)
        df_y = df['class']
        return df_x, df_y
    # Mirror the training-side column removal so that prediction sees the same
    # features the model was fitted on; tolerate frames that never carried
    # these columns in the first place.
    df_x = df.drop(columns=['id']).drop(columns=excluded, errors='ignore')
    return df_x

In [6]:
# The training frame yields (features, target); the test frame has no
# 'class' column, so only the features come back.
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

## Data Pre-processing
### Label-Encoding

In [7]:
# Names of the SNP feature columns: SNP_01 through SNP_15.
snp_col = ['SNP_{:02d}'.format(idx) for idx in range(1, 16)]
# One encoder for the target labels and one shared by all SNP columns.
class_le, snp_le = preprocessing.LabelEncoder(), preprocessing.LabelEncoder()

In [8]:
# Flatten the values of every SNP column of the training features into one
# list (column by column) so a single encoder can be fitted on all of them.
snp_data = [value for name in snp_col for value in train_x[name].values]

In [9]:
# Replace the target labels with their integer codes (class_le keeps the
# mapping for decoding the predictions later).
train_y = class_le.fit_transform(train_y)
# Fit the shared SNP encoder on the values gathered from the training SNP
# columns only.
# NOTE(review): a value that appears only in the test set would make
# snp_le.transform raise on unseen labels — confirm the test data has none.
snp_le.fit(snp_data)

LabelEncoder()

In [10]:
# Integer-encode every SNP column, in both feature frames, with the shared
# encoder. The frames are modified in place, so this cell is meant to be run
# once per kernel session.
snp_features = [name for name in train_x.columns if name in snp_col]
for name in snp_features:
    for frame in (train_x, test_x):
        frame[name] = snp_le.transform(frame[name])

## Model Fit

In [11]:
# Hyperparameter search through teddynote's Optuna wrapper around CatBoost.
# `optimize` returns the selected hyperparameters and predictions for
# `test_x`; the log below shows 5 trials scored by accuracy.
# NOTE(review): exact meaning of `n_rounds` and how `preds` is produced
# (presumably from the best trial) — confirm against the teddynote docs.
model = models.CatBoostClassifierOptuna()
params, preds = model.optimize(train_x, train_y, test_data=test_x, eval_metric='accuracy', cv=5, seed=CFG.SEED, n_rounds=100, n_trials=5)

[32m[I 2022-12-22 12:10:14,553][0m A new study created in memory with name: no-name-97548400-66dc-4746-bd9a-95b0607fcfb4[0m


metric type: accuracy, score: 1.00000
metric type: accuracy, score: 0.96226
metric type: accuracy, score: 0.92308


[32m[I 2022-12-22 12:10:15,153][0m Trial 0 finished with value: 0.950145137880987 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Plain', 'od_type': 'IncToDec', 'colsample_bylevel': 0.02819735197572884, 'l2_leaf_reg': 0.009940677332888734, 'learning_rate': 0.16942741116770937, 'iterations': 672, 'min_child_samples': 22, 'depth': 12, 'subsample': 0.6741702165335565}. Best is trial 0 with value: 0.950145137880987.[0m


metric type: accuracy, score: 0.94231
metric type: accuracy, score: 0.92308
metric type: accuracy, score: 1.00000
metric type: accuracy, score: 0.96226
metric type: accuracy, score: 0.92308
metric type: accuracy, score: 0.92308


[32m[I 2022-12-22 12:10:18,744][0m Trial 1 finished with value: 0.950145137880987 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'colsample_bylevel': 0.044437029950034526, 'l2_leaf_reg': 8.565778426023335e-08, 'learning_rate': 0.04095019338056778, 'iterations': 1321, 'min_child_samples': 32, 'depth': 11, 'subsample': 0.6166236493832883}. Best is trial 0 with value: 0.950145137880987.[0m


metric type: accuracy, score: 0.94231
metric type: accuracy, score: 0.98113
metric type: accuracy, score: 0.94340
metric type: accuracy, score: 0.90385
metric type: accuracy, score: 0.92308


[32m[I 2022-12-22 12:10:23,590][0m Trial 2 finished with value: 0.9387518142235123 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'Iter', 'colsample_bylevel': 0.02079794791057904, 'l2_leaf_reg': 6.512988254493466e-08, 'learning_rate': 0.012627458309891, 'iterations': 1584, 'min_child_samples': 26, 'depth': 6, 'subsample': 0.8031971203612567}. Best is trial 0 with value: 0.950145137880987.[0m


metric type: accuracy, score: 0.94231
metric type: accuracy, score: 0.98113
metric type: accuracy, score: 0.96226
metric type: accuracy, score: 0.92308
metric type: accuracy, score: 0.92308


[32m[I 2022-12-22 12:10:24,574][0m Trial 3 finished with value: 0.9540638606676343 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'colsample_bylevel': 0.09835691556286413, 'l2_leaf_reg': 1.3848179651142795e-05, 'learning_rate': 0.10818073290859173, 'iterations': 1778, 'min_child_samples': 12, 'depth': 2, 'subsample': 0.75677130856301}. Best is trial 3 with value: 0.9540638606676343.[0m


metric type: accuracy, score: 0.98077
metric type: accuracy, score: 0.98113
metric type: accuracy, score: 0.94340
metric type: accuracy, score: 0.92308
metric type: accuracy, score: 0.94231


[32m[I 2022-12-22 12:10:27,778][0m Trial 4 finished with value: 0.9541364296081278 and parameters: {'bootstrap_type': 'Bernoulli', 'boosting_type': 'Ordered', 'od_type': 'Iter', 'colsample_bylevel': 0.09142452455550573, 'l2_leaf_reg': 0.7617335248997505, 'learning_rate': 0.024363393238205553, 'iterations': 1889, 'min_child_samples': 23, 'depth': 3, 'subsample': 0.7074366392331795}. Best is trial 4 with value: 0.9541364296081278.[0m


metric type: accuracy, score: 0.98077
saving model...models\CatBoostClassifier-0.95414.npy


In [12]:
# Display the hyperparameters returned by the search.
params

{'bootstrap_type': 'Bernoulli',
 'boosting_type': 'Ordered',
 'od_type': 'Iter',
 'colsample_bylevel': 0.09142452455550573,
 'l2_leaf_reg': 0.7617335248997505,
 'learning_rate': 0.024363393238205553,
 'iterations': 1889,
 'min_child_samples': 23,
 'depth': 3,
 'subsample': 0.7074366392331795}

In [13]:
# Display the test-set predictions returned by the search.
preds

array([[0],
       [1],
       [2],
       [1],
       [0],
       [1],
       [2],
       [1],
       [0],
       [0],
       [2],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [1],
       [1],
       [2],
       [0],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [2],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [2],
       [0],
       [1],
       [2],
       [0],
       [1],
       [2],
       [2],
       [2],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [2],
       [1],
       [2],
       [1],
       [1],
       [1],
       [2],
       [1],
       [0],
    

In [14]:
# Final classifier, fitted on the complete training set.
# NOTE(review): these hyperparameters are hard-coded and are not the `params`
# returned by the search in this notebook (e.g. bootstrap_type is 'MVS' here
# but 'Bernoulli' there) — presumably copied from another run; confirm.
clf = CatBoostClassifier(
    bootstrap_type='MVS',
    boosting_type='Ordered',
    od_type='Iter',
    colsample_bylevel=0.028929702498494095,
    l2_leaf_reg=0.004736168800482262,
    learning_rate=0.01598306419837247,
    iterations=1315,
    min_child_samples=22,
    depth=10,
    # Report the training loss every 100 iterations instead of on each of the
    # 1315 iterations, which otherwise floods the notebook output.
    verbose=100,
)

clf.fit(train_x, train_y)


0:	learn: 1.0978386	total: 1.14ms	remaining: 1.5s
1:	learn: 1.0970909	total: 2.62ms	remaining: 1.72s
2:	learn: 1.0909168	total: 3.78ms	remaining: 1.65s
3:	learn: 1.0902183	total: 4.67ms	remaining: 1.53s
4:	learn: 1.0738388	total: 6.37ms	remaining: 1.67s
5:	learn: 1.0671216	total: 7.38ms	remaining: 1.61s
6:	learn: 1.0664905	total: 8.42ms	remaining: 1.57s
7:	learn: 1.0604197	total: 9.22ms	remaining: 1.5s
8:	learn: 1.0532015	total: 10.6ms	remaining: 1.54s
9:	learn: 1.0441903	total: 11.3ms	remaining: 1.48s
10:	learn: 1.0436382	total: 12ms	remaining: 1.42s
11:	learn: 1.0346539	total: 12.7ms	remaining: 1.38s
12:	learn: 1.0341372	total: 13.4ms	remaining: 1.34s
13:	learn: 1.0336377	total: 14ms	remaining: 1.3s
14:	learn: 1.0331546	total: 15.2ms	remaining: 1.32s
15:	learn: 1.0272416	total: 15.9ms	remaining: 1.29s
16:	learn: 1.0202594	total: 16.7ms	remaining: 1.27s
17:	learn: 1.0198218	total: 18ms	remaining: 1.3s
18:	learn: 1.0073910	total: 18.8ms	remaining: 1.28s
19:	learn: 1.0069810	total: 19.4

171:	learn: 0.6243432	total: 134ms	remaining: 889ms
172:	learn: 0.6243396	total: 136ms	remaining: 895ms
173:	learn: 0.6243361	total: 137ms	remaining: 896ms
174:	learn: 0.6209686	total: 137ms	remaining: 894ms
175:	learn: 0.6185433	total: 138ms	remaining: 893ms
176:	learn: 0.6185401	total: 139ms	remaining: 891ms
177:	learn: 0.6185370	total: 140ms	remaining: 891ms
178:	learn: 0.6185340	total: 140ms	remaining: 890ms
179:	learn: 0.6185310	total: 141ms	remaining: 891ms
180:	learn: 0.6185282	total: 142ms	remaining: 891ms
181:	learn: 0.6185255	total: 143ms	remaining: 891ms
182:	learn: 0.6143292	total: 145ms	remaining: 894ms
183:	learn: 0.6143266	total: 145ms	remaining: 894ms
184:	learn: 0.6143241	total: 146ms	remaining: 894ms
185:	learn: 0.6143217	total: 147ms	remaining: 895ms
186:	learn: 0.6125067	total: 148ms	remaining: 895ms
187:	learn: 0.6125044	total: 149ms	remaining: 896ms
188:	learn: 0.6086334	total: 151ms	remaining: 897ms
189:	learn: 0.6033744	total: 152ms	remaining: 899ms
190:	learn: 

389:	learn: 0.4431744	total: 318ms	remaining: 754ms
390:	learn: 0.4431744	total: 319ms	remaining: 754ms
391:	learn: 0.4431744	total: 320ms	remaining: 753ms
392:	learn: 0.4411696	total: 320ms	remaining: 752ms
393:	learn: 0.4400745	total: 321ms	remaining: 751ms
394:	learn: 0.4400745	total: 322ms	remaining: 750ms
395:	learn: 0.4400745	total: 323ms	remaining: 749ms
396:	learn: 0.4381337	total: 323ms	remaining: 748ms
397:	learn: 0.4369641	total: 325ms	remaining: 748ms
398:	learn: 0.4369641	total: 325ms	remaining: 747ms
399:	learn: 0.4369641	total: 326ms	remaining: 746ms
400:	learn: 0.4361605	total: 327ms	remaining: 745ms
401:	learn: 0.4361605	total: 327ms	remaining: 744ms
402:	learn: 0.4361605	total: 328ms	remaining: 742ms
403:	learn: 0.4342066	total: 329ms	remaining: 741ms
404:	learn: 0.4338737	total: 329ms	remaining: 740ms
405:	learn: 0.4327413	total: 330ms	remaining: 740ms
406:	learn: 0.4319655	total: 332ms	remaining: 741ms
407:	learn: 0.4319655	total: 333ms	remaining: 740ms
408:	learn: 

637:	learn: 0.3075173	total: 508ms	remaining: 539ms
638:	learn: 0.3075173	total: 509ms	remaining: 539ms
639:	learn: 0.3075173	total: 510ms	remaining: 538ms
640:	learn: 0.3075173	total: 511ms	remaining: 537ms
641:	learn: 0.3075173	total: 511ms	remaining: 536ms
642:	learn: 0.3075173	total: 512ms	remaining: 535ms
643:	learn: 0.3075173	total: 513ms	remaining: 534ms
644:	learn: 0.3075173	total: 513ms	remaining: 533ms
645:	learn: 0.3064455	total: 514ms	remaining: 533ms
646:	learn: 0.3050876	total: 515ms	remaining: 532ms
647:	learn: 0.3048044	total: 516ms	remaining: 531ms
648:	learn: 0.3048044	total: 516ms	remaining: 530ms
649:	learn: 0.3048044	total: 517ms	remaining: 529ms
650:	learn: 0.3038206	total: 518ms	remaining: 528ms
651:	learn: 0.3038206	total: 518ms	remaining: 527ms
652:	learn: 0.3035473	total: 519ms	remaining: 526ms
653:	learn: 0.3035473	total: 520ms	remaining: 525ms
654:	learn: 0.3014960	total: 521ms	remaining: 525ms
655:	learn: 0.3014960	total: 522ms	remaining: 524ms
656:	learn: 

893:	learn: 0.2373244	total: 695ms	remaining: 327ms
894:	learn: 0.2367611	total: 696ms	remaining: 327ms
895:	learn: 0.2367611	total: 696ms	remaining: 326ms
896:	learn: 0.2367611	total: 697ms	remaining: 325ms
897:	learn: 0.2367611	total: 698ms	remaining: 324ms
898:	learn: 0.2366941	total: 698ms	remaining: 323ms
899:	learn: 0.2365204	total: 699ms	remaining: 322ms
900:	learn: 0.2365204	total: 700ms	remaining: 322ms
901:	learn: 0.2365204	total: 700ms	remaining: 321ms
902:	learn: 0.2365204	total: 701ms	remaining: 320ms
903:	learn: 0.2365204	total: 702ms	remaining: 319ms
904:	learn: 0.2365204	total: 702ms	remaining: 318ms
905:	learn: 0.2365204	total: 703ms	remaining: 317ms
906:	learn: 0.2359734	total: 703ms	remaining: 316ms
907:	learn: 0.2359734	total: 704ms	remaining: 316ms
908:	learn: 0.2359734	total: 705ms	remaining: 315ms
909:	learn: 0.2359734	total: 705ms	remaining: 314ms
910:	learn: 0.2357300	total: 706ms	remaining: 313ms
911:	learn: 0.2354174	total: 707ms	remaining: 312ms
912:	learn: 

1180:	learn: 0.1881953	total: 878ms	remaining: 99.6ms
1181:	learn: 0.1881953	total: 879ms	remaining: 98.9ms
1182:	learn: 0.1878832	total: 880ms	remaining: 98.2ms
1183:	learn: 0.1873310	total: 881ms	remaining: 97.4ms
1184:	learn: 0.1872077	total: 881ms	remaining: 96.7ms
1185:	learn: 0.1872077	total: 882ms	remaining: 95.9ms
1186:	learn: 0.1872077	total: 883ms	remaining: 95.2ms
1187:	learn: 0.1872077	total: 883ms	remaining: 94.4ms
1188:	learn: 0.1858936	total: 885ms	remaining: 93.8ms
1189:	learn: 0.1858936	total: 885ms	remaining: 93ms
1190:	learn: 0.1853019	total: 886ms	remaining: 92.3ms
1191:	learn: 0.1853019	total: 887ms	remaining: 91.5ms
1192:	learn: 0.1853019	total: 887ms	remaining: 90.7ms
1193:	learn: 0.1853019	total: 888ms	remaining: 90ms
1194:	learn: 0.1853019	total: 889ms	remaining: 89.2ms
1195:	learn: 0.1853019	total: 889ms	remaining: 88.5ms
1196:	learn: 0.1848434	total: 890ms	remaining: 87.7ms
1197:	learn: 0.1848434	total: 890ms	remaining: 87ms
1198:	learn: 0.1848434	total: 891m

<catboost.core.CatBoostClassifier at 0x14c101ea848>

In [15]:
# Predict encoded class labels for the test features with the classifier
# fitted above.
# NOTE: this overwrites the `preds` previously returned by the Optuna search.
preds = clf.predict(test_x)
print(preds)

[[0]
 [1]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [0]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [2]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [2]
 [0]
 [1]
 [2]
 [1]
 [1]
 [2]
 [0]
 [1]
 [2]
 [1]
 [1]
 [1]
 [1]
 [2]
 [1]
 [2]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [2]
 [0]
 [1]
 [2]
 [0]
 [1]
 [2]
 [2]
 [2]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [2]
 [1]
 [2]
 [1]
 [1]
 [1]
 [2]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [2]
 [0]
 [1]
 [1]
 [2]
 [1]
 [1]
 [2]
 [0]
 [1]
 [0]
 [2]
 [0]
 [1]
 [1]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [2]
 [2]
 [1]
 [2]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [2]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [2]
 [1]
 [1]
 [1]
 [0]
 [1]
 [2]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [2]
 [0]
 [1]
 [2]
 [2]
 [1]
 [0]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [2]
 [2]
 [1]
 [1]]


In [16]:
# Build the submission file from the sample template.
submit = pd.read_csv('open/sample_submission.csv')
# The predictions are printed above as an (n, 1) column; flatten them to the
# 1-D shape LabelEncoder.inverse_transform is documented to take, then map
# the integer codes back to the original class labels.
submit['class'] = class_le.inverse_transform(np.ravel(preds))
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('./answer', exist_ok=True)
submit.to_csv('./answer/submit_catboost_optuna.csv', index=False)