## 임포트

In [1]:
!pip install -U teddynote

from teddynote import models
# Data Wrangling
import pandas as pd
import numpy as np

#Utility
import random
import os

# Preprocessing & Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

# Evaluation
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# Optuna
import optuna
from optuna.samplers import TPESampler
from optuna import Trial

# Modeling
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier, Pool, cv

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

pd.set_option('mode.chained_assignment',  None)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CFG:
    """Notebook-wide configuration constants."""

    # Random seed handed to seed_everything and to the Optuna search.
    SEED = 42

In [3]:
def seed_everything(seed):
    """Seed the Python-level random number generators used by this notebook.

    Writes ``seed`` to the ``PYTHONHASHSEED`` environment variable and seeds
    both the standard-library ``random`` module and NumPy's global generator.
    Returns ``None``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(CFG.SEED) # Fix the random seeds

## 데이터 불러오기

In [4]:
# Read the training and test tables from the local open/ directory.
train, test = pd.read_csv('open/train.csv'), pd.read_csv('open/test.csv')

In [5]:
def get_x_y(df):
    """Split a raw frame into model features and, when labelled, the target.

    The same non-feature columns are removed from labelled and unlabelled
    frames, so both yield the same feature columns in the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``id`` column. It is treated as labelled when it
        also has a ``class`` column.

    Returns
    -------
    ``(df_x, df_y)`` when ``class`` is present, otherwise ``df_x`` alone.

    Raises
    ------
    KeyError
        If ``id`` is missing from ``df``.
    """
    # Dropped for labelled and unlabelled frames alike: removing these only
    # from the training frame would leave the test features with columns the
    # model never saw during fitting.
    non_feature_cols = ['father', 'mother', 'gender']
    df_x = df.drop(columns=['id']).drop(columns=non_feature_cols, errors='ignore')
    if 'class' in df.columns:
        return df_x.drop(columns=['class']), df['class']
    return df_x

In [6]:
# The test frame yields features only; the training frame yields features plus the target.
test_x = get_x_y(test)
train_x, train_y = get_x_y(train)

## Data Pre-processing
### Label-Encoding

In [7]:
# One encoder for the target classes and one shared by every SNP column.
class_le, snp_le = preprocessing.LabelEncoder(), preprocessing.LabelEncoder()
# Column names SNP_01 through SNP_15.
snp_col = [
    f'SNP_{str(x).zfill(2)}'
    for x in range(1, 16)
]

In [8]:
# Pool the values of every SNP column of train_x into one flat list.
snp_data = [value for col in snp_col for value in train_x[col].values]

In [9]:
# Encode the class labels from the raw training frame instead of from train_y
# itself: re-running this cell then never re-fits the encoder on
# already-encoded integers, which would make class_le.inverse_transform return
# integers instead of the original class names.
train_y = class_le.fit_transform(train['class'])
# Fit the shared SNP encoder on the pooled SNP values.
snp_le.fit(snp_data)

LabelEncoder()

In [10]:
# Replace the raw values of each SNP column with integer codes, in both frames.
for col in (name for name in train_x.columns if name in snp_col):
    train_x[col] = snp_le.transform(train_x[col])
    test_x[col] = snp_le.transform(test_x[col])

## Model Fit

In [None]:
# Hyperparameter search using teddynote's Optuna wrapper for CatBoost.
model = models.CatBoostClassifierOptuna()
params, preds = model.optimize(
    train_x,
    train_y,
    test_data=test_x,
    eval_metric='accuracy',
    cv=5,
    seed=CFG.SEED,
    n_rounds=100,
    n_trials=100,
)

[32m[I 2022-12-21 17:10:06,010][0m A new study created in memory with name: no-name-4b2588ae-5f99-4ef2-b1c3-d996eaf2151c[0m


metric type: accuracy, score: 1.00000
metric type: accuracy, score: 0.96226
metric type: accuracy, score: 0.92308
metric type: accuracy, score: 0.92308


[32m[I 2022-12-21 17:10:08,914][0m Trial 0 finished with value: 0.9578374455732946 and parameters: {'bootstrap_type': 'MVS', 'boosting_type': 'Ordered', 'od_type': 'IncToDec', 'colsample_bylevel': 0.07524669901342856, 'l2_leaf_reg': 0.1812243145332238, 'learning_rate': 0.03799521233235702, 'iterations': 1438, 'min_child_samples': 8, 'depth': 9}. Best is trial 0 with value: 0.9578374455732946.[0m


metric type: accuracy, score: 0.98077
metric type: accuracy, score: 1.00000
metric type: accuracy, score: 0.96226
metric type: accuracy, score: 0.94231
metric type: accuracy, score: 0.94231


[32m[I 2022-12-21 17:10:09,645][0m Trial 1 finished with value: 0.9693759071117561 and parameters: {'bootstrap_type': 'Bayesian', 'boosting_type': 'Plain', 'od_type': 'IncToDec', 'colsample_bylevel': 0.057192755843482164, 'l2_leaf_reg': 0.015485619911903416, 'learning_rate': 0.1271121001554646, 'iterations': 780, 'min_child_samples': 32, 'depth': 3, 'bagging_temperature': 31.59278073933575}. Best is trial 1 with value: 0.9693759071117561.[0m


metric type: accuracy, score: 1.00000
metric type: accuracy, score: 0.98113
metric type: accuracy, score: 0.96226


In [None]:
# Show the first value returned by model.optimize (presumably the tuned hyperparameters; confirm against teddynote).
params

In [None]:
# Show the second value returned by model.optimize (presumably predictions for test_x; confirm against teddynote).
preds

In [None]:
# Refit a single CatBoost classifier on the full training set with
# hand-entered hyperparameters.
# NOTE(review): no random seed is passed here, unlike the search step, which uses CFG.SEED.
clf = CatBoostClassifier(
    bootstrap_type='MVS',
    boosting_type='Ordered',
    od_type='Iter',
    colsample_bylevel=0.028929702498494095,
    l2_leaf_reg=0.004736168800482262,
    learning_rate=0.01598306419837247,
    iterations=1315,
    min_child_samples=22,
    depth=10,
)

clf.fit(train_x, train_y)


In [None]:
# Predict labels for the test features with the refit classifier; these are in
# the encoded label space because clf was fitted on the encoded train_y.
# NOTE(review): this rebinds `preds`, replacing the value returned by model.optimize.
preds = clf.predict(test_x)
print(preds)

In [None]:
# Build the submission from the sample file, mapping encoded predictions back to class names.
submit = pd.read_csv('./sample_submission.csv')
submit['class'] = class_le.inverse_transform(preds)
# to_csv does not create missing directories, so make sure ./answer exists
# before writing; otherwise the write fails after all the training work.
os.makedirs('./answer', exist_ok=True)
submit.to_csv('./answer/submit_catboost_optuna.csv', index=False)