In [1]:
import pandas as pd
import random
import os
import numpy as np

from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [2]:
class CFG:
    """Container for the notebook's configuration constants."""

    # Seed value handed to `seed_everything`.
    SEED: int = 42

In [3]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's `random` module and NumPy's global RNG with `seed`, and
    exports `PYTHONHASHSEED=str(seed)` into the process environment.

    NOTE: the hash seed is read when an interpreter starts, so the exported
    variable only reaches child processes launched afterwards.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
seed_everything(CFG.SEED) # Fix the random seeds

In [4]:
# Read the training and test tables from the local `open/` directory
# (paths are relative to the notebook's working directory).
train = pd.read_csv('open/train.csv')
test = pd.read_csv('open/test.csv')

In [5]:
def get_x_y(df):
    """Split a raw table into model inputs and, when present, the target.

    The 'id' column is always removed from the features. The input frame
    itself is not modified.

    Args:
        df: DataFrame holding an 'id' column and optionally a 'class' column.

    Returns:
        `(features, target)` when `df` has a 'class' column, where `target`
        is `df['class']`; otherwise just the feature DataFrame.
    """
    features = df.drop(columns=['id'])
    if 'class' not in df.columns:
        # Unlabelled data: only features can be returned.
        return features
    return features.drop(columns=['class']), df['class']

In [6]:
# `train` is unpacked into two values, i.e. it is expected to carry a 'class'
# column so that get_x_y returns (features, labels).
train_x, train_y = get_x_y(train)
# `test` is expected to have no 'class' column, so only features come back.
test_x = get_x_y(test)

In [7]:
# Two independent encoders: one for the target labels, one shared by every
# SNP feature column.
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
# Column names 'SNP_01' through 'SNP_15' (index zero-padded to two digits).
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [8]:
# Pool the values of all SNP columns of the training features into one flat
# list (column by column), used to fit a single shared encoder.
snp_data = [value for name in snp_col for value in train_x[name].values]

In [9]:
# Replace the class labels with integer codes. Note that train_y is
# overwritten, so re-running this cell refits class_le on the already
# encoded values.
train_y = class_le.fit_transform(train_y)
# Learn one vocabulary from the pooled values of all SNP columns.
snp_le.fit(snp_data)

LabelEncoder()

In [10]:
# Replace each SNP column, in both feature frames, with its integer codes from
# the shared encoder. Columns are visited in train_x order; for each column the
# training frame is encoded first, then the test frame.
snp_columns_present = [name for name in train_x.columns if name in snp_col]
for name in snp_columns_present:
    for frame in (train_x, test_x):
        frame[name] = snp_le.transform(frame[name])

In [31]:
# Gradient-boosted tree classifier trained on the encoded features.
# - The seed is taken from CFG so the model uses the same value as
#   seed_everything instead of a second hard-coded 42.
# - verbose=200 logs every 200th iteration rather than all 5000, which keeps
#   the cell output readable.
clf = CatBoostClassifier(
    n_estimators=5000,
    max_depth=5,
    learning_rate=0.01,
    random_seed=CFG.SEED,
    verbose=200,
)
clf.fit(train_x, train_y)

0:	learn: 1.0882382	total: 1.15ms	remaining: 5.76s
200:	learn: 0.3302696	total: 136ms	remaining: 3.24s
400:	learn: 0.1831729	total: 248ms	remaining: 2.84s
600:	learn: 0.1264928	total: 341ms	remaining: 2.5s
800:	learn: 0.0945534	total: 465ms	remaining: 2.44s
1000:	learn: 0.0748292	total: 581ms	remaining: 2.32s
1200:	learn: 0.0615463	total: 715ms	remaining: 2.26s
1400:	learn: 0.0520046	total: 826ms	remaining: 2.12s
1600:	learn: 0.0448271	total: 968ms	remaining: 2.05s
1800:	learn: 0.0393106	total: 1.07s	remaining: 1.9s
2000:	learn: 0.0348725	total: 1.19s	remaining: 1.79s
2200:	learn: 0.0312845	total: 1.29s	remaining: 1.64s
2400:	learn: 0.0283294	total: 1.38s	remaining: 1.5s
2600:	learn: 0.0258132	total: 1.49s	remaining: 1.37s
2800:	learn: 0.0236614	total: 1.59s	remaining: 1.25s
3000:	learn: 0.0218762	total: 1.69s	remaining: 1.12s
3200:	learn: 0.0202304	total: 1.78s	remaining: 1s
3400:	learn: 0.0188204	total: 1.91s	remaining: 896ms
3600:	learn: 0.0176133	total: 2.02s	remaining: 786ms
3800:

<catboost.core.CatBoostClassifier at 0x245338fd908>

In [30]:
# Predict an encoded class index for every test row. As the printed result
# shows, the predictions come back as a column array (one single-element row
# per sample) rather than a flat vector.
preds = clf.predict(test_x)
print(preds)

[[0]
 [1]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [0]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [2]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [2]
 [0]
 [1]
 [2]
 [1]
 [1]
 [2]
 [0]
 [1]
 [2]
 [1]
 [1]
 [1]
 [1]
 [2]
 [1]
 [2]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [2]
 [0]
 [1]
 [2]
 [0]
 [1]
 [2]
 [2]
 [2]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [2]
 [1]
 [2]
 [1]
 [1]
 [1]
 [2]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [2]
 [0]
 [1]
 [1]
 [2]
 [1]
 [1]
 [2]
 [0]
 [1]
 [0]
 [2]
 [0]
 [1]
 [1]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [2]
 [1]
 [2]
 [0]
 [1]
 [1]
 [2]
 [2]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [2]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [2]
 [1]
 [1]
 [1]
 [0]
 [1]
 [2]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [2]
 [0]
 [1]
 [2]
 [2]
 [1]
 [0]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [2]
 [2]
 [1]
 [1]]


In [23]:
# Load the submission template whose 'class' column is filled in below.
submit = pd.read_csv('open/sample_submission.csv')

In [24]:
# The predictions are an (n, 1) column array, while LabelEncoder works on 1-D
# input; flatten first so the integer codes map back to the original class
# labels without relying on implicit reshaping (and its conversion warning).
submit['class'] = class_le.inverse_transform(np.ravel(preds))

In [25]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing the submission file (without the index column).
os.makedirs('./answer', exist_ok=True)
submit.to_csv('./answer/catboost15.csv', index=False)