In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor

# 데이터 불러오기

In [2]:
# Load the hold-out dataset from a CSV file given by a path relative to the working directory
df = pd.read_csv('0507 df_for_holdout.csv')

In [3]:
# Use the 'Date' column as the row index.
# The frame is rebound rather than mutated in place, and the step is skipped
# when the index is already 'Date', so re-running this cell does not raise a
# KeyError for the no-longer-present 'Date' column.
if df.index.name != 'Date':
    df = df.set_index('Date')

In [4]:
# Preview the first rows to check the columns and the Date index
df.head()

Unnamed: 0_level_0,시언감성점수,M2,comp basemoney rate,base rate,consumer price index,kospi close,ex-rate close,효준감성점수,시언서술포함감성점수,주택,아파트,분양,서울,가구,부동산,단지,시장,next_spi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
201101월,-62683860.0,1676.4,4.52,2.75,3.4,2069.73,1119.25,1856.746125,-163561800.0,1160,1094,666,729,1180,685,461,459,82.5
201102월,-62558320.0,1674.4,4.72,2.75,3.9,1939.3,1124.65,2046.702906,-177762100.0,1160,1101,639,652,1265,669,383,496,83.4
201103월,-69574410.0,1677.5,4.54,3.0,4.1,2106.7,1095.5,1715.795289,-198030600.0,1341,1102,1014,607,961,659,576,578,83.9
201104월,-73057330.0,1684.8,4.54,3.0,3.8,2192.36,1068.0,1635.262791,-189884900.0,1405,1186,977,658,1031,720,534,577,84.1
201105월,-93566330.0,1690.5,4.44,3.0,3.9,2142.47,1078.0,1922.518709,-240270200.0,1537,1543,1267,854,1245,1013,770,756,84.4


In [10]:
# Lowest next_spi value among the entries from offset 90 onward
# (the same offset is used below as the start of the hold-out test set)
min(df.next_spi[90:])

99.5

# 전처리 및 Train/Test Set 분할

In [None]:
# Target: the 'next_spi' column as a plain Python list
y = df['next_spi'].tolist()
# Features: every column except the last one (next_spi is the last column of df)
X = df.iloc[:, :-1]

In [None]:
# Preview the feature matrix (the target column should no longer be present)
X.head()

In [None]:
# Hold-out split by row position: the first 90 rows form the training pool,
# everything from row 90 onward is the test set.
X_train_temp, X_test = X.iloc[:90], X.iloc[90:]
y_train_temp, y_test = y[:90], y[90:]

In [None]:
# Carve a 25% validation set out of the training pool with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default and the index looks
# chronological -- confirm that a random (non-temporal) split is intended.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=0.25,
    random_state=0,
)

# Column names of the eight word-frequency features
tf_idf_words = ['주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장']

# 거시경제변수만 포함된 Case

In [None]:
# Build the macro-economic-only feature sets: remove the three sentiment-score
# columns and the eight word-frequency columns from each split.
X_train_basic, X_valid_basic, X_test_basic = (
    frame.drop(columns=['시언감성점수', '효준감성점수', '시언서술포함감성점수'] + tf_idf_words)
    for frame in (X_train, X_valid, X_test)
)

In [None]:
# Hyper-parameter grid explored for the XGBoost regressor
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],  # 0.6, 0.7, ..., 1.0
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# Rank candidates by (negated) mean squared error so the selection criterion
# matches the RMSE reported afterwards. Without `scoring`, GridSearchCV uses the
# estimator's default score (R^2 for regressors); a string passed to `refit`
# does not change that for a single metric -- it is only checked for truthiness.
grid_XGBR = GridSearchCV(
    XGBRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,  # refit the best configuration on all of the training data
    cv=5,
    verbose=0,
)

# Run the 5-fold grid search on the macro-only training features
grid_XGBR.fit(X_train_basic, y_train)

In [None]:
# Validation-set performance of the tuned model (macro-only features)
y_pred = grid_XGBR.predict(X_valid_basic)
RMSE = mean_squared_error(y_valid, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Drop only the validation-stage temporaries. `grid_XGBR` and `param_grid`
# must stay defined: the next cell scores the same fitted search on the test
# set, and the cell after that deletes them. Deleting them here made the test
# cell fail with a NameError.
del RMSE, y_pred

In [None]:
# Test-set performance of the tuned model (macro-only features);
# uses the search object fitted above for this case
y_pred = grid_XGBR.predict(X_test_basic)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's grid, fitted search and metric variables from the
# namespace before the next case redefines them
del param_grid, grid_XGBR, RMSE, y_pred

# 시언 감성 점수 Case

## 시언 감성 점수 데이터 전처리

In [None]:
# Feature sets that keep the noun-only sentiment score `시언감성점수` alongside the
# macro-economic variables: drop the other two sentiment columns and the
# eight word-frequency columns from each split.
X_train_si, X_valid_si, X_test_si = (
    frame.drop(columns=['효준감성점수', '시언서술포함감성점수'] + tf_idf_words)
    for frame in (X_train, X_valid, X_test)
)

In [None]:
# Feature sets that keep `시언서술포함감성점수` (the sentiment score that includes
# predicates) alongside the macro-economic variables: drop the other two
# sentiment columns and the eight word-frequency columns from each split.
X_train_si_verb, X_valid_si_verb, X_test_si_verb = (
    frame.drop(columns=['시언감성점수', '효준감성점수'] + tf_idf_words)
    for frame in (X_train, X_valid, X_test)
)

## (시언 감성 점수 (명사)) 이용 XGBRegressor 수립 및 평가

In [None]:
# Hyper-parameter grid explored for the XGBoost regressor
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],  # 0.6, 0.7, ..., 1.0
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# Rank candidates by (negated) mean squared error so the selection criterion
# matches the RMSE reported afterwards. Without `scoring`, GridSearchCV uses the
# estimator's default score (R^2 for regressors); a string passed to `refit`
# does not change that for a single metric -- it is only checked for truthiness.
grid_XGBR = GridSearchCV(
    XGBRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,  # refit the best configuration on all of the training data
    cv=5,
    verbose=0,
)

# Run the 5-fold grid search on the noun-only sentiment feature set
grid_XGBR.fit(X_train_si, y_train)

In [None]:
# Validation-set performance of the tuned model (noun-only sentiment features)
y_pred = grid_XGBR.predict(X_valid_si)
RMSE = mean_squared_error(y_valid, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Drop only the validation-stage temporaries. `grid_XGBR` and `param_grid`
# must stay defined: the next cell scores the same fitted search on the test
# set, and the cell after that deletes them. Deleting them here made the test
# cell fail with a NameError.
del RMSE, y_pred

In [None]:
# Test-set performance of the tuned model (noun-only sentiment features);
# uses the search object fitted above for this case
y_pred = grid_XGBR.predict(X_test_si)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's grid, fitted search and metric variables from the
# namespace before the next case redefines them
del param_grid, grid_XGBR, RMSE, y_pred

## (시언 감성 점수 (서술어 포함)) 이용 XGBRegressor 수립 및 평가

In [None]:
# Hyper-parameter grid explored for the XGBoost regressor
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],  # 0.6, 0.7, ..., 1.0
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# Rank candidates by (negated) mean squared error so the selection criterion
# matches the RMSE reported afterwards. Without `scoring`, GridSearchCV uses the
# estimator's default score (R^2 for regressors); a string passed to `refit`
# does not change that for a single metric -- it is only checked for truthiness.
grid_XGBR = GridSearchCV(
    XGBRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,  # refit the best configuration on all of the training data
    cv=5,
    verbose=0,
)

# Run the 5-fold grid search on the predicate-inclusive sentiment feature set
grid_XGBR.fit(X_train_si_verb, y_train)

In [None]:
# Validation-set performance of the tuned model (predicate-inclusive sentiment features)
y_pred = grid_XGBR.predict(X_valid_si_verb)
RMSE = mean_squared_error(y_valid, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Drop only the validation-stage temporaries. `grid_XGBR` and `param_grid`
# must stay defined: the next cell scores the same fitted search on the test
# set, and the cell after that deletes them. Deleting them here made the test
# cell fail with a NameError.
del RMSE, y_pred

In [None]:
# Test-set performance of the tuned model (predicate-inclusive sentiment features);
# uses the search object fitted above for this case
y_pred = grid_XGBR.predict(X_test_si_verb)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's grid, fitted search and metric variables from the
# namespace before the next case redefines them
del param_grid, grid_XGBR, RMSE, y_pred

# 효준 감성 점수(명사) Case

## 효준 감성 점수 포함 성능

In [None]:
# Feature sets that keep the `효준감성점수` sentiment score alongside the
# macro-economic variables: drop the other two sentiment columns and the
# eight word-frequency columns from each split.
X_train_hj, X_valid_hj, X_test_hj = (
    frame.drop(columns=['시언감성점수', '시언서술포함감성점수'] + tf_idf_words)
    for frame in (X_train, X_valid, X_test)
)

In [None]:
# Hyper-parameter grid explored for the XGBoost regressor
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],  # 0.6, 0.7, ..., 1.0
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# Rank candidates by (negated) mean squared error so the selection criterion
# matches the RMSE reported afterwards. Without `scoring`, GridSearchCV uses the
# estimator's default score (R^2 for regressors); a string passed to `refit`
# does not change that for a single metric -- it is only checked for truthiness.
grid_XGBR = GridSearchCV(
    XGBRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,  # refit the best configuration on all of the training data
    cv=5,
    verbose=0,
)

# Run the 5-fold grid search on the `효준감성점수` feature set
grid_XGBR.fit(X_train_hj, y_train)

In [None]:
# Validation-set performance of the tuned model (`효준감성점수` feature set)
y_pred = grid_XGBR.predict(X_valid_hj)
RMSE = mean_squared_error(y_valid, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Drop only the validation-stage temporaries. `grid_XGBR` and `param_grid`
# must stay defined: the next cell scores the same fitted search on the test
# set, and the cell after that deletes them. Deleting them here made the test
# cell fail with a NameError.
del RMSE, y_pred

In [None]:
# Test-set performance of the tuned model (`효준감성점수` feature set);
# uses the search object fitted above for this case
y_pred = grid_XGBR.predict(X_test_hj)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's grid, fitted search and metric variables from the
# namespace before the next case redefines them
del param_grid, grid_XGBR, RMSE, y_pred

# TF-IDF 상위 8개 단어 빈도수 추가 Case

In [None]:
# Feature sets for the word-frequency case. The drop list is empty, so every
# column of each split is kept.
# NOTE(review): this also keeps all three sentiment-score columns; if the case
# is meant to be "macro variables + word frequencies" only, those columns
# should be listed here -- confirm the intent.
X_train_tf_idf, X_valid_tf_idf, X_test_tf_idf = (
    frame.drop(columns=[], axis=1) for frame in (X_train, X_valid, X_test)
)

In [None]:
# Hyper-parameter grid explored for the XGBoost regressor
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],  # 0.6, 0.7, ..., 1.0
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# Rank candidates by (negated) mean squared error so the selection criterion
# matches the RMSE reported afterwards. Without `scoring`, GridSearchCV uses the
# estimator's default score (R^2 for regressors); a string passed to `refit`
# does not change that for a single metric -- it is only checked for truthiness.
grid_XGBR = GridSearchCV(
    XGBRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,  # refit the best configuration on all of the training data
    cv=5,
    verbose=0,
)

# Run the 5-fold grid search on the word-frequency feature set
grid_XGBR.fit(X_train_tf_idf, y_train)

In [None]:
# Validation-set performance of the tuned model (word-frequency feature set)
y_pred = grid_XGBR.predict(X_valid_tf_idf)
RMSE = mean_squared_error(y_valid, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Drop only the validation-stage temporaries. `grid_XGBR` and `param_grid`
# must stay defined: the next cell scores the same fitted search on the test
# set, and the cell after that deletes them. Deleting them here made the test
# cell fail with a NameError.
del RMSE, y_pred

In [None]:
# Test-set performance of the tuned model (word-frequency feature set);
# uses the search object fitted above for this case
y_pred = grid_XGBR.predict(X_test_tf_idf)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the RMSE, the selected hyper-parameters,
# and the refit best estimator
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's grid, fitted search and metric variables from the namespace
del param_grid, grid_XGBR, RMSE, y_pred