In [30]:
%pip install eli5==0.13.0
%pip install catboost
%pip install xgboost

# visualization
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'NanumGothic'
import matplotlib.font_manager as fm
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

import catboost as cb
from catboost import CatBoostRegressor

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

import time
import joblib
import random
import os

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [39]:
# feature_preprocessing.ipynb의 실행의 결과물 data.csv를 사용한다.
df_total = pd.read_csv('/data/ephemeral/home/data.csv', encoding='utf-8-sig')

# 불러오고 나서는 범주형 결측치를 다시 'NULL'로 바꿔줘야한다.
categorical_columns = []

for column in df_total.columns:
    if pd.api.types.is_numeric_dtype(df_total[column]):
        pass
    else:
        categorical_columns.append(column)

df_total[categorical_columns] = df_total[categorical_columns].fillna('NULL')

# 본번과 부번도 다시 str 형으로 바꿔줘야한다.
df_total['본번'] = df_total['본번'].astype('str')
df_total['부번'] = df_total['부번'].astype('str')

In [40]:
df_total = df_total[['전용면적', '강남3구', '계약년', '군', '건축년도', '구', '본번', '부번', '도로명', '아파트명',
       'k-전용면적별세대현황(60㎡이하)', '층', '계약월', 'k-연면적', '좌표Y', '지하철역까지 거리(m)', '최근 학교까지 거리(m)', 
       '근처 학교 이름', '근처 지하철역 이름', '신축여부', '브랜드아파트유무', 'is_test', 'target']]

In [41]:
class config:
    seed=42

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything(config.seed)

In [42]:
# 이제 is_test 칼럼은 drop해줍니다.
df_train = df_total.query('is_test==0')
df_test = df_total.query('is_test==1')

df_train.drop(['is_test'], axis = 1, inplace=True)
df_test.drop(['is_test'], axis = 1, inplace=True)
print(df_train.shape, df_test.shape)

(1118822, 22) (9272, 22)


In [43]:
assert df_train.shape[1] == df_test.shape[1]

In [44]:
continuous_columns = []
categorical_columns = []

for column in df_train.columns:
    if pd.api.types.is_numeric_dtype(df_train[column]):
        continuous_columns.append(column)
    else:
        categorical_columns.append(column)

print("연속형 변수:", continuous_columns)
print("범주형 변수:", categorical_columns)

연속형 변수: ['강남3구', '계약년', '건축년도', '층', '계약월', '좌표Y', '지하철역까지 거리(m)', '최근 학교까지 거리(m)', '신축여부', '브랜드아파트유무', 'target']
범주형 변수: ['전용면적', '군', '구', '본번', '부번', '도로명', '아파트명', 'k-전용면적별세대현황(60㎡이하)', 'k-연면적', '근처 학교 이름', '근처 지하철역 이름']


In [45]:
label_encoders = {}

# Implement Label Encoding
for col in categorical_columns:
    lbl = LabelEncoder()

    # Label-Encoding을 fit
    lbl.fit(df_train[col].astype(str))
    df_train[col] = lbl.transform(df_train[col].astype(str))
    label_encoders[col] = lbl           # 나중에 후처리를 위해 레이블인코더를 저장

    # Test 데이터에만 존재하는 새로 출현한 데이터를 신규 클래스로 추가
    for label in np.unique(df_test[col]):
      if label not in lbl.classes_: # unseen label 데이터인 경우
        lbl.classes_ = np.append(lbl.classes_, label) # 미처리 시 ValueError발생

    df_test[col] = lbl.transform(df_test[col].astype(str))

#### Training

In [46]:
from sklearn.base import BaseEstimator, ClassifierMixin

class VotingModel(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [13]:
train, test = train_test_split(df_train, test_size=0.1, random_state=42, shuffle=True)

X_train = train.drop(['target'], axis=1).reset_index(drop=True)
y_train = train['target'].reset_index(drop=True)

X_test = test.drop(['target'], axis=1)
y_test = test['target']

print(f'Train Input : {X_train.shape}')
print(f'Train Target : {y_train.shape}')
print(f'Test Input : {X_test.shape}')
print(f'Test Target : {y_test.shape}')

Train Input : (1006907, 21)
Train Target : (1006907,)
Test Input : (111879, 21)
Test Target : (111879,)


In [15]:
# Kfold 함수를 선언합니다.
n_splits = 5
kf = KFold(n_splits=n_splits)

# 학습 데이터를 Kfold로 나눕니다.
train_folds = kf.split(X_train, y_train)

In [16]:
# 학습을 진행합니다.
start_time = time.time()

fold_save_files = []
fold_results = []

# 각 fold별로 RMSE를 저장할 리스트 생성
train_rmse_by_fold = []
valid_rmse_by_fold = []

cat_params = {
    'iterations': 21365,
    'learning_rate': 0.08563739497748364,
    #'subsample': 0.2950106937849051,
    'random_strength': 35.478306038804206,
    'depth': 11,
    'min_data_in_leaf': 24,
    'l2_leaf_reg': 8.8072471069366,
    'loss_function': 'RMSE',
    'random_state': 42,
    'task_type': 'GPU',
}

xgb_params = {
    'n_estimators': 21417,
    'learning_rate': 0.012054476313731775,
    'max_depth': 15,
    'gamma': 0.10388314431513003,
    'subsample': 0.7824406806188264,
    'colsample_bytree': 0.4085128123233835,
    'reg_lambda': 9.722325681826353,
    'min_child_weight': 6,
    'objective': 'reg:squarederror',
    'reg_lambda': 5,
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'seed': 42
}

for fold_idx, (train_idx, valid_idx) in enumerate(train_folds):
    print(f"--------{fold_idx}번째 fold의 학습을 시작합니다.--------")

    # index를 통해 fold의 학습세트를 가져옵니다.
    X_train_fold = X_train.iloc[train_idx, :]
    y_train_fold = y_train[train_idx]

    # index를 통해 fold의 평가세트를 가져옵니다.
    X_valid_fold = X_train.iloc[valid_idx, :]
    Y_valid_fold = y_train[valid_idx]

    fitted_models = []

    cat_model = CatBoostRegressor(**cat_params)
    cat_model.fit(X_train_fold, y_train_fold,                                             
            eval_set=(X_valid_fold, Y_valid_fold),
            early_stopping_rounds=50,                                                             
            verbose=100,
    )
    xgb_model = XGBRegressor(**xgb_params)
    xgb_model.fit(X_train_fold, y_train_fold,                                             
            eval_set=[(X_train_fold, y_train_fold), (X_valid_fold, Y_valid_fold)],
            early_stopping_rounds=50,                                                             
            verbose=100,
    )
    fitted_models.append(cat_model)
    fitted_models.append(xgb_model)

model = VotingModel(fitted_models)

--------0번째 fold의 학습을 시작합니다.--------
0:	learn: 43586.6308971	test: 43784.9751205	best: 43784.9751205 (0)	total: 15.5ms	remaining: 5m 31s
100:	learn: 15019.9666454	test: 15408.0577990	best: 15408.0577990 (100)	total: 879ms	remaining: 3m 5s
200:	learn: 12575.3812058	test: 13056.0336249	best: 13056.0336249 (200)	total: 1.75s	remaining: 3m 4s
300:	learn: 11021.7691254	test: 11651.4435990	best: 11651.4435990 (300)	total: 2.75s	remaining: 3m 12s
400:	learn: 10219.7546120	test: 10955.0892661	best: 10955.0892661 (400)	total: 3.74s	remaining: 3m 15s
500:	learn: 9693.8242030	test: 10517.7851624	best: 10517.7851624 (500)	total: 4.73s	remaining: 3m 17s
600:	learn: 9325.4864384	test: 10255.2011035	best: 10255.2011035 (600)	total: 5.73s	remaining: 3m 17s
700:	learn: 8967.2527697	test: 9980.9631249	best: 9980.9631249 (700)	total: 6.72s	remaining: 3m 18s
800:	learn: 8698.9012671	test: 9800.1331290	best: 9800.1331290 (800)	total: 7.72s	remaining: 3m 18s
900:	learn: 8452.2571842	test: 9629.6011940	best:

In [17]:
predict = model.predict(X_train)
RMSE = mean_squared_error(y_train, predict) ** 0.5
print(f"Train rmse : {RMSE}")

predict = model.predict(X_test)
RMSE = mean_squared_error(y_test, predict) ** 0.5
print(f"Test rmse : {RMSE}")

Train rmse : 5749.996477896793
Test rmse : 8244.853831213595


In [48]:
predict = model.predict(df_test.drop(columns=['target']))
# preds_df = pd.Series(predict, name='target')
preds_df = pd.DataFrame(predict, columns=["target"])

output_directory = '/data/ephemeral/home/output/final/'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

result = np.round(preds_df).astype('int')

result.to_csv(f'{output_directory}model_cat_xgb.csv', index=False)

In [19]:
import joblib

# 모델 저장
output_directory = '/data/ephemeral/home/model/final/'
if not os.path.exists(output_directory):
    os.makedirs(output_directory)
joblib.dump(model, f'{output_directory}model_cat_xgb.pkl')

['/data/ephemeral/home/model/final/model_cat_xgb.pkl']

In [47]:
# 모델 불러오기
model = joblib.load('/data/ephemeral/home/model/final/model_cat_xgb.pkl')