In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 경로 및 전처리 설정

In [4]:
PREPROCESS_DATA = True
PREPROCESS_METHOD = "z_score" # ['z_score', 'min_max', 'robust']
PREPROCESS_TARGET = True
PREPROCESS_FEATURE = False

DROP_OUTLIERS = True
USE_PCA = True


DROP_X2 = False
USE_X4X10_FEATURE = False
random_seed =42

########## 경로 설정 ##########
path_to_submission_csv = "/content/drive/MyDrive/네이버 부스트캠프/daicon_black_box/data/test/sample_submission.csv" #sample_submission.csv 경로
path_to_result_csv = "./updated_submission.csv" # 결과 파일 저장 경로

path_to_train_csv = "/content/drive/MyDrive/네이버 부스트캠프/daicon_black_box/data/train/train.csv" # train.csv 경로
path_to_test_csv = "/content/drive/MyDrive/네이버 부스트캠프/daicon_black_box/data/test/test.csv" #test.csv 경로

# import module

In [5]:
import pickle
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import statistics

from sklearn.decomposition import PCA

from catboost import CatBoostRegressor, Pool

# util

In [6]:
def predict_and_submit(y_pred, y_scaler=None):
    if y_scaler is not None:
        try:
            y_pred = y_scaler.inverse_transform(y_pred)
        except:
            y_pred = y_scaler.inverse_transform(y_pred.reshape(1, -1))
        y_pred = y_pred.reshape(-1)

    # Identify top 33% of predicted values
    threshold = np.percentile(y_pred, 90)
    top_10_percent_mask = y_pred >= threshold

    # Create submission file
    submission_df = pd.read_csv(path_to_submission_csv)
    submission_df['y'] = y_pred
    submission_df.to_csv(path_to_result_csv, index=False)

    print(f"Top 10% threshold: {threshold:.4f}")
    print(f"Number of samples in top 10%: {sum(top_10_percent_mask)}")


def printer_dec(verbose):
    def printer(*args):
        if verbose:
            for s in args:
                print(s, end=" ")
    return printer

def dump_params_dict(file_path, params):
    with open(file_path, 'wb') as fw:
        pickle.dump(params, fw)

def load_params_dict(file_path):
    with open(file_path, 'rb') as fr:
        loaded = pickle.load(fr)
    return loaded

# load data

In [7]:
train_df = pd.read_csv(path_to_train_csv)
test_df = df = pd.read_csv(path_to_test_csv)

print("이상치 갯수 : ", train_df[train_df['y'] < 70].shape[0])
print("전체 데이터 수 : ", train_df.shape[0])

이상치 갯수 :  8
전체 데이터 수 :  40118


# preprocessing

In [12]:
def preprocess(n_components, vervose = False):
    train_df = pd.read_csv(path_to_train_csv)
    test_df = df = pd.read_csv(path_to_test_csv)
    print = printer_dec(vervose)

    if PREPROCESS_METHOD == "z_score":
        X_scaler = StandardScaler()
        y_scaler = StandardScaler()
        print("z_score")
    elif PREPROCESS_METHOD == "min_max":
        X_scaler = MinMaxScaler()
        y_scaler = MinMaxScaler()
    elif PREPROCESS_METHOD == "robust":
        X_scaler = RobustScaler()
        y_scaler = RobustScaler()

    if DROP_OUTLIERS:
        train_df.drop(train_df[train_df['y'] < 70].index, inplace=True)

    if DROP_X2:
        train_df = train_df.drop(['x_2'], axis=1)
        test_df = test_df.drop(['x_2'], axis=1)

    if USE_X4X10_FEATURE:
        train_df["x_11"] = (train_df['x_4'] + train_df['x_10']) / 2
        train_df = train_df.drop(['x_4', 'x_10'], axis=1)
        train_df = train_df[['ID', 'x_0', 'x_1', 'x_2', 'x_3', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_11', 'y']]
        test_df["x_11"] = (test_df['x_4'] + test_df['x_10']) / 2
        test_df = test_df.drop(['x_4', 'x_10'], axis=1)


    if USE_PCA:
        pca = PCA(n_components=n_components)
        X_train = pca.fit_transform(train_df.iloc[:, 1:-1])
        X_test = pca.transform(test_df.iloc[:, 1:])
        print(sum(pca.explained_variance_ratio_))
        print(X_train)
    else:
        print("차원축소를 사용하지 않습니다!")

    if PREPROCESS_FEATURE:
        if USE_PCA:
            X_train = X_scaler.fit_transform(X_train)
            X_test = X_scaler.transform(X_test)
        else:
            X_train = X_scaler.fit_transform(train_df.iloc[:, 1:-1])
            X_test = X_scaler.transform(test_df.iloc[:, 1:])

    if PREPROCESS_TARGET:
        y_train = y_scaler.fit_transform(train_df.iloc[:, [-1]])
    else:
        y_train = train_df.iloc[:, [-1]].values

    print("train features shape : ", X_train.shape)
    print("train labels shape : ", y_train.shape)
    print("test features shape ; ", X_test.shape)
    return X_train, y_train, X_test, X_scaler, y_scaler

# GridSearch

## search for pca dim 6

In [59]:
# search space
params = {'iterations': [1000],
          'learning_rate': np.logspace(-3, -1, 3),
          'depth': [4, 6],
          'loss_function': ['RMSE'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['RMSE'],
          'random_seed': [random_seed],
         }

In [60]:
X_train, y_train, X_test, X_scaler, y_scaler = preprocess(6)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_seed)
fit_params = {'early_stopping_rounds': 100, 'eval_set':[(X_val, y_val)]}

In [61]:
model_pca_dim6 = CatBoostRegressor()
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_cv = GridSearchCV(model_pca_dim6, param_grid=params, cv=5, n_jobs=1, scoring=scorer, verbose=2)

X_train, y_train, X_test, X_scaler, y_scaler = preprocess(6)
grid_cv.fit(X_train, y_train, **fit_params)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
25:	learn: 0.6051581	test: 0.6010277	best: 0.6010277 (25)	total: 189ms	remaining: 7.08s
26:	learn: 0.6043807	test: 0.6004592	best: 0.6004592 (26)	total: 196ms	remaining: 7.06s
27:	learn: 0.6038600	test: 0.6001417	best: 0.6001417 (27)	total: 204ms	remaining: 7.1s
28:	learn: 0.6033287	test: 0.5997530	best: 0.5997530 (28)	total: 217ms	remaining: 7.28s
29:	learn: 0.6028198	test: 0.5994580	best: 0.5994580 (29)	total: 226ms	remaining: 7.29s
30:	learn: 0.6023755	test: 0.5991799	best: 0.5991799 (30)	total: 233ms	remaining: 7.27s
31:	learn: 0.6020053	test: 0.5989016	best: 0.5989016 (31)	total: 240ms	remaining: 7.25s
32:	learn: 0.6016034	test: 0.5986151	best: 0.5986151 (32)	total: 247ms	remaining: 7.24s
33:	learn: 0.6012697	test: 0.5984171	best: 0.5984171 (33)	total: 255ms	remaining: 7.23s
34:	learn: 0.6010675	test: 0.5983339	best: 0.5983339 (34)	total: 262ms	remaining: 7.22s
35:	learn: 0.6007825	test: 0.5982428	best: 0.5982428 (35)	total: 269ms	

  _data = np.array(data, dtype=dtype, copy=copy,


0:	learn: 0.9940060	test: 0.9838846	best: 0.9838846 (0)	total: 19.3ms	remaining: 19.3s
1:	learn: 0.9881347	test: 0.9780169	best: 0.9780169 (1)	total: 36.7ms	remaining: 18.3s
2:	learn: 0.9823008	test: 0.9721951	best: 0.9721951 (2)	total: 56.5ms	remaining: 18.8s
3:	learn: 0.9766657	test: 0.9665675	best: 0.9665675 (3)	total: 73.5ms	remaining: 18.3s
4:	learn: 0.9710041	test: 0.9609256	best: 0.9609256 (4)	total: 90.2ms	remaining: 17.9s
5:	learn: 0.9654697	test: 0.9554292	best: 0.9554292 (5)	total: 108ms	remaining: 17.9s
6:	learn: 0.9599971	test: 0.9499966	best: 0.9499966 (6)	total: 127ms	remaining: 18s
7:	learn: 0.9546664	test: 0.9446797	best: 0.9446797 (7)	total: 144ms	remaining: 17.9s
8:	learn: 0.9493743	test: 0.9394327	best: 0.9394327 (8)	total: 162ms	remaining: 17.8s
9:	learn: 0.9440190	test: 0.9341119	best: 0.9341119 (9)	total: 180ms	remaining: 17.8s
10:	learn: 0.9387875	test: 0.9288952	best: 0.9288952 (10)	total: 197ms	remaining: 17.7s
11:	learn: 0.9336903	test: 0.9238018	best: 0.9238

In [62]:
X_train, y_train, X_test, X_scaler, y_scaler = preprocess(6)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_seed)

model_pca_dim_6 = CatBoostRegressor(**grid_cv.best_params_)

model_pca_dim_6.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_val, y_val)],
          early_stopping_rounds=100,
          verbose=100)
model_pca_dim_6.save_model("pca_dim6.cbm")
print("best params\n", grid_cv.best_params_)

0:	learn: 0.9966145	test: 0.9966145	test1: 0.9840056	best: 0.9840056 (0)	total: 9.53ms	remaining: 9.52s
100:	learn: 0.6816360	test: 0.6816360	test1: 0.6732112	best: 0.6732112 (100)	total: 596ms	remaining: 5.3s
200:	learn: 0.6178860	test: 0.6178860	test1: 0.6141990	best: 0.6141990 (200)	total: 1.15s	remaining: 4.55s
300:	learn: 0.6053091	test: 0.6053091	test1: 0.6042385	best: 0.6042385 (300)	total: 1.71s	remaining: 3.97s
400:	learn: 0.6020358	test: 0.6020358	test1: 0.6022024	best: 0.6022024 (400)	total: 2.26s	remaining: 3.38s
500:	learn: 0.6006827	test: 0.6006827	test1: 0.6016886	best: 0.6016886 (500)	total: 2.81s	remaining: 2.8s
600:	learn: 0.5998763	test: 0.5998763	test1: 0.6015248	best: 0.6015248 (600)	total: 3.94s	remaining: 2.62s
700:	learn: 0.5992557	test: 0.5992557	test1: 0.6015358	best: 0.6015224 (645)	total: 5.06s	remaining: 2.16s
800:	learn: 0.5986917	test: 0.5986917	test1: 0.6015491	best: 0.6015069 (731)	total: 6.22s	remaining: 1.55s
Stopped by overfitting detector  (100 iter

## search for pca dim 7

In [63]:
# search space
params = {'iterations': [1000],
          'learning_rate': np.logspace(-3, -1, 3),
          'depth': [4, 6],
          'loss_function': ['RMSE'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['RMSE'],
          'random_seed': [random_seed],
         }

In [64]:
X_train, y_train, X_test, X_scaler, y_scaler = preprocess(7)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_seed)
fit_params = {'early_stopping_rounds': 100, 'eval_set':[(X_val, y_val)]}

In [65]:
model_pca_dim7 = CatBoostRegressor()
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_cv = GridSearchCV(model_pca_dim7, param_grid=params, cv=5, n_jobs=1, scoring=scorer, verbose=10)

X_train, y_train, X_test, X_scaler, y_scaler = preprocess(7)
grid_cv.fit(X_train, y_train, **fit_params)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
31:	learn: 0.6014386	test: 0.5995462	best: 0.5995462 (31)	total: 269ms	remaining: 8.14s
32:	learn: 0.6010800	test: 0.5992759	best: 0.5992759 (32)	total: 284ms	remaining: 8.32s
33:	learn: 0.6006795	test: 0.5990756	best: 0.5990756 (33)	total: 291ms	remaining: 8.28s
34:	learn: 0.6003356	test: 0.5988521	best: 0.5988521 (34)	total: 299ms	remaining: 8.25s
35:	learn: 0.6000419	test: 0.5987039	best: 0.5987039 (35)	total: 307ms	remaining: 8.22s
36:	learn: 0.5997590	test: 0.5985540	best: 0.5985540 (36)	total: 315ms	remaining: 8.2s
37:	learn: 0.5995028	test: 0.5984079	best: 0.5984079 (37)	total: 323ms	remaining: 8.18s
38:	learn: 0.5992836	test: 0.5982180	best: 0.5982180 (38)	total: 331ms	remaining: 8.15s
39:	learn: 0.5990040	test: 0.5980057	best: 0.5980057 (39)	total: 339ms	remaining: 8.14s
40:	learn: 0.5987369	test: 0.5978005	best: 0.5978005 (40)	total: 347ms	remaining: 8.13s
41:	learn: 0.5985310	test: 0.5976504	best: 0.5976504 (41)	total: 355ms	

  _data = np.array(data, dtype=dtype, copy=copy,


2:	learn: 0.9819751	test: 0.9718855	best: 0.9718855 (2)	total: 43.2ms	remaining: 14.4s
3:	learn: 0.9760754	test: 0.9660076	best: 0.9660076 (3)	total: 74.8ms	remaining: 18.6s
4:	learn: 0.9703209	test: 0.9602869	best: 0.9602869 (4)	total: 87.8ms	remaining: 17.5s
5:	learn: 0.9645737	test: 0.9545647	best: 0.9545647 (5)	total: 97.9ms	remaining: 16.2s
6:	learn: 0.9589618	test: 0.9489555	best: 0.9489555 (6)	total: 107ms	remaining: 15.2s
7:	learn: 0.9535280	test: 0.9435295	best: 0.9435295 (7)	total: 116ms	remaining: 14.4s
8:	learn: 0.9480177	test: 0.9379978	best: 0.9379978 (8)	total: 130ms	remaining: 14.3s
9:	learn: 0.9425576	test: 0.9325352	best: 0.9325352 (9)	total: 139ms	remaining: 13.8s
10:	learn: 0.9372702	test: 0.9272275	best: 0.9272275 (10)	total: 148ms	remaining: 13.3s
11:	learn: 0.9319908	test: 0.9219448	best: 0.9219448 (11)	total: 158ms	remaining: 13s
12:	learn: 0.9268193	test: 0.9167829	best: 0.9167829 (12)	total: 167ms	remaining: 12.7s
13:	learn: 0.9216963	test: 0.9116824	best: 0.9

In [66]:
X_train, y_train, X_test, X_scaler, y_scaler = preprocess(7)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_seed)

model_pca_dim_7 = CatBoostRegressor(**grid_cv.best_params_)

model_pca_dim_7.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_val, y_val)],
          early_stopping_rounds=100,
          verbose=100)
model_pca_dim_7.save_model("pca_dim7.cbm")
print("best params\n", grid_cv.best_params_)

0:	learn: 0.9963807	test: 0.9963807	test1: 0.9837542	best: 0.9837542 (0)	total: 8.08ms	remaining: 8.07s
100:	learn: 0.6762577	test: 0.6762577	test1: 0.6677150	best: 0.6677150 (100)	total: 820ms	remaining: 7.3s
200:	learn: 0.6135170	test: 0.6135170	test1: 0.6104123	best: 0.6104123 (200)	total: 1.63s	remaining: 6.48s
300:	learn: 0.6013809	test: 0.6013809	test1: 0.6016109	best: 0.6016109 (300)	total: 2.46s	remaining: 5.71s
400:	learn: 0.5978664	test: 0.5978664	test1: 0.6002995	best: 0.6002995 (400)	total: 3.27s	remaining: 4.88s
500:	learn: 0.5960634	test: 0.5960634	test1: 0.6000442	best: 0.6000442 (500)	total: 4.04s	remaining: 4.03s
600:	learn: 0.5946770	test: 0.5946770	test1: 0.6001123	best: 0.6000127 (525)	total: 4.86s	remaining: 3.23s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6000127267
bestIteration = 525

Shrink model to first 526 iterations.
best params
 {'depth': 6, 'eval_metric': 'RMSE', 'iterations': 1000, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iteratio

# Ensemble

In [67]:
# load models
model_pca_dim6 = CatBoostRegressor().load_model("./pca_dim6.cbm")
model_pca_dim7 = CatBoostRegressor().load_model("./pca_dim7.cbm")

In [68]:
def pred_by_bestmodel(model, n_components):
    X_train, y_train, X_test, X_scaler, y_scaler = preprocess(n_components)
    kf = KFold(n_splits = 5, shuffle=True, random_state=random_seed)
    kf.get_n_splits(X_train)

    indices = [(train_idx, test_idx) for (train_idx, test_idx) in kf.split(X_train)]
    eval_pool = Pool(X_train[indices[4][1]], y_train[indices[4][1]])
    pred = y_scaler.inverse_transform(model.predict(X_test).reshape(-1, 1))
    return pred


pred_dim6 = pred_by_bestmodel(model_pca_dim6, 6)
pred_dim7 = pred_by_bestmodel(model_pca_dim7, 7)

In [69]:
predict_and_submit((pred_dim6 + pred_dim7) / 2.0)

Top 10% threshold: 91.5137
Number of samples in top 10%: [499]
