In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('standard_dataset.csv')
data

Unnamed: 0.1,Unnamed: 0,area,floor,year_of_completion,price_log,gu_ec_price,top10_brand_ec2_price,top20_danji_ec2_price,transaction_year_month_ec
0,0,79.97,4,1987,11.775297,1.000000,0.079817,0.154972,0
1,1,79.97,2,1987,11.669938,1.000000,0.079817,0.154972,0
2,2,79.97,1,1987,11.775297,1.000000,0.079817,0.154972,0
3,3,79.97,2,1987,11.845827,1.000000,0.079817,0.154972,2
4,4,54.98,5,1987,11.585255,1.000000,0.079817,0.154972,3
...,...,...,...,...,...,...,...,...,...
240843,240916,67.57,2,2003,10.165890,0.030539,0.079817,0.087630,31
240844,240917,95.94,6,2007,10.691968,0.030539,0.079817,0.110988,30
240845,240918,100.17,5,2007,10.911464,0.030539,0.079817,0.110988,35
240846,240919,77.71,5,2007,10.596660,0.030539,0.079817,0.110988,32


In [3]:
data.rename(columns={'Unnamed: 0':'id'},inplace=True)

In [4]:
data

Unnamed: 0,id,area,floor,year_of_completion,price_log,gu_ec_price,top10_brand_ec2_price,top20_danji_ec2_price,transaction_year_month_ec
0,0,79.97,4,1987,11.775297,1.000000,0.079817,0.154972,0
1,1,79.97,2,1987,11.669938,1.000000,0.079817,0.154972,0
2,2,79.97,1,1987,11.775297,1.000000,0.079817,0.154972,0
3,3,79.97,2,1987,11.845827,1.000000,0.079817,0.154972,2
4,4,54.98,5,1987,11.585255,1.000000,0.079817,0.154972,3
...,...,...,...,...,...,...,...,...,...
240843,240916,67.57,2,2003,10.165890,0.030539,0.079817,0.087630,31
240844,240917,95.94,6,2007,10.691968,0.030539,0.079817,0.110988,30
240845,240918,100.17,5,2007,10.911464,0.030539,0.079817,0.110988,35
240846,240919,77.71,5,2007,10.596660,0.030539,0.079817,0.110988,32


In [5]:
data.drop('id',axis=1, inplace=True)

In [6]:
# 중복 데이터 확인 및 제거
sum(data.duplicated(keep=False))

7852

In [7]:
# 제거
data.drop_duplicates(keep='first',inplace=True)

In [8]:
data

Unnamed: 0,area,floor,year_of_completion,price_log,gu_ec_price,top10_brand_ec2_price,top20_danji_ec2_price,transaction_year_month_ec
0,79.97,4,1987,11.775297,1.000000,0.079817,0.154972,0
1,79.97,2,1987,11.669938,1.000000,0.079817,0.154972,0
2,79.97,1,1987,11.775297,1.000000,0.079817,0.154972,0
3,79.97,2,1987,11.845827,1.000000,0.079817,0.154972,2
4,54.98,5,1987,11.585255,1.000000,0.079817,0.154972,3
...,...,...,...,...,...,...,...,...
240842,106.21,6,2003,10.723289,0.030539,0.079817,0.087630,26
240843,67.57,2,2003,10.165890,0.030539,0.079817,0.087630,31
240844,95.94,6,2007,10.691968,0.030539,0.079817,0.110988,30
240845,100.17,5,2007,10.911464,0.030539,0.079817,0.110988,35


In [9]:
data.sort_values(by='transaction_year_month_ec',inplace=True)

In [10]:
data

Unnamed: 0,area,floor,year_of_completion,price_log,gu_ec_price,top10_brand_ec2_price,top20_danji_ec2_price,transaction_year_month_ec
0,79.9700,4,1987,11.775297,1.000000,0.079817,0.154972,0
38814,89.2000,5,2002,10.545368,0.147771,0.079817,0.087630,0
38809,49.6000,13,2005,10.118639,0.147771,0.079817,0.087630,0
38802,84.9200,2,2005,10.434145,0.147771,0.079817,0.087630,0
38800,84.8900,8,2005,10.442930,0.147771,0.079817,0.087630,0
...,...,...,...,...,...,...,...,...
224544,59.4000,10,1997,11.667370,0.293756,0.079817,0.087630,35
179608,64.4500,8,2005,10.725490,0.066155,0.079817,0.087630,35
179607,64.4500,18,2005,10.857093,0.066155,0.079817,0.087630,35
224512,77.0731,8,2019,11.711785,0.293756,0.228032,0.228032,35


In [11]:
train = data.loc[data['transaction_year_month_ec']<=30,:]
test = data.loc[data['transaction_year_month_ec']>30,:]

In [12]:
test

Unnamed: 0,area,floor,year_of_completion,price_log,gu_ec_price,top10_brand_ec2_price,top20_danji_ec2_price,transaction_year_month_ec
182973,115.8010,7,2014,11.451061,0.014223,0.206501,0.087630,31
159697,103.0700,5,2005,11.982935,1.000000,0.079817,0.087630,31
238684,67.6630,6,2006,11.122856,0.030539,0.079817,0.087630,31
179563,41.4000,6,2002,9.903538,0.066155,0.079817,0.087630,31
182979,113.3290,17,2014,11.691080,0.014223,0.206501,0.087630,31
...,...,...,...,...,...,...,...,...
224544,59.4000,10,1997,11.667370,0.293756,0.079817,0.087630,35
179608,64.4500,8,2005,10.725490,0.066155,0.079817,0.087630,35
179607,64.4500,18,2005,10.857093,0.066155,0.079817,0.087630,35
224512,77.0731,8,2019,11.711785,0.293756,0.228032,0.228032,35


In [13]:
train_X, train_y = train.drop('price_log',axis=1), train['price_log']
test_X, test_y = test.drop('price_log',axis=1), test['price_log']

In [14]:
train_X

Unnamed: 0,area,floor,year_of_completion,gu_ec_price,top10_brand_ec2_price,top20_danji_ec2_price,transaction_year_month_ec
0,79.97,4,1987,1.000000,0.079817,0.154972,0
38814,89.20,5,2002,0.147771,0.079817,0.087630,0
38809,49.60,13,2005,0.147771,0.079817,0.087630,0
38802,84.92,2,2005,0.147771,0.079817,0.087630,0
38800,84.89,8,2005,0.147771,0.079817,0.087630,0
...,...,...,...,...,...,...,...
204937,114.75,16,2000,0.375009,0.159448,0.163198,30
205436,30.01,4,2016,0.375009,0.129869,0.087630,30
205446,47.25,4,1970,0.375009,0.079817,0.087630,30
165885,59.98,14,2004,0.059213,0.108761,0.407625,30


In [15]:
# 타겟변수인 거래가격의 특이값에 휘둘리지 않는 RMSLE(로그변환된 RMSE)를 평가지표로 사용
def RMSE(y, y_pred):
    rmse = mean_squared_error(y, y_pred) ** 0.5
    return rmse

def rmse_cv(model):
    # cv별로 학습하는 함수
    tscv = TimeSeriesSplit()
    rmse_list = []
    model_name = model.__class__.__name__
    for _, (train_index, test_index) in tqdm(enumerate(tscv.split(train_X), start=1), desc=f'{model_name} Cross Validations...', total=5):
        X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
        clf = model.fit(X_train, y_train)
        pred = clf.predict(X_test)
        rmse = RMSE(y_test, pred) 
        rmse_list.append(rmse)
    return model_name, rmse_list

def print_rmse_score(model):
    # cv별 프린팅, 평균 저장
    model_name, score = rmse_cv(model)
    for i, r in enumerate(score, start=1):
        print(f'{i} FOLDS: {model_name} RMSLE: {r:.4f}')
    print(f'\n{model_name} mean RMSLE: {np.mean(score):.4f}')
    print('='*40)
    return model_name, np.mean(score)

In [16]:
rf = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=5, min_samples_split=50,
                           min_samples_leaf=5, random_state=1, n_jobs=-1)

In [90]:
models = []
scores = []
for model in [rf]:
    model_name, mean_score = print_rmse_score(model)
    models.append(model_name)
    scores.append(mean_score)

RandomForestRegressor Cross Validations...: 100%|████████████████████████████████████████| 5/5 [00:21<00:00,  4.23s/it]

1 FOLDS: RandomForestRegressor RMSLE: 0.2847
2 FOLDS: RandomForestRegressor RMSLE: 0.3173
3 FOLDS: RandomForestRegressor RMSLE: 0.3280
4 FOLDS: RandomForestRegressor RMSLE: 0.3096
5 FOLDS: RandomForestRegressor RMSLE: 0.3490

RandomForestRegressor mean RMSLE: 0.3177



