In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# 데이터 불러오기

In [2]:
# Load the prepared train / test tables from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('0507_trainset_for_shuffle.csv', '0507_testset_for_shuffle.csv')
)

In [3]:
# Use the 'Date' column as the row index of both frames.
# Rebinding (instead of inplace=True) together with the index-name guard keeps
# this cell idempotent: once 'Date' has become the index it is no longer a
# column, so an unguarded second run would raise KeyError.
if train.index.name != 'Date':
    train = train.set_index('Date')
if test.index.name != 'Date':
    test = test.set_index('Date')

# 전처리 및 Train/Test Set 분할

In [4]:
# Separate the target column 'next_spi' (y) from the remaining feature
# columns (X) for both splits; `train` and `test` themselves are not modified.
y_train = train['next_spi'].copy()
X_train = train.drop(columns='next_spi')

y_test = test['next_spi'].copy()
X_test = test.drop(columns='next_spi')

In [5]:
# Display the column labels of the feature frame (this is the cell's output).
X_train.columns

Index(['시언감성점수', 'M2', 'comp basemoney rate', 'base rate',
       'consumer price index', 'kospi close', 'ex-rate close', '효준감성점수',
       '시언서술포함감성점수', '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],
      dtype='object')

# Names of the eight keyword feature columns (the last eight labels shown by
# X_train.columns above).
tf_idf_words = [
    '주택',
    '아파트',
    '분양',
    '서울',
    '가구',
    '부동산',
    '단지',
    '시장',
]

# 거시경제변수만 포함된 Case

In [16]:
# Build feature matrices that contain only the macroeconomic variables:
# remove the three sentiment-score columns and the eight keyword columns.
X_train_basic = X_train.drop(
    columns=[
        '시언감성점수', '효준감성점수', '시언서술포함감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)
X_test_basic = X_test.drop(
    columns=[
        '시언감성점수', '효준감성점수', '시언서술포함감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)

In [17]:
# Hyper-parameter grid for the XGBoost regressor.
# 4 * 5 * 5 * 3 * 4 = 1200 candidates, i.e. 6000 fits with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# Model selection:
# * `refit='mse'` does not choose a metric when scoring is single-metric; it is
#   only treated as "truthy", so candidates were ranked by the estimator's
#   default score (R^2). Rank by negated MSE explicitly and refit the winner,
#   which matches the RMSE computed on the test set afterwards.
# * The search is large, so the fits are distributed over all cores with
#   n_jobs=-1; the regressor itself is pinned to one thread so the two levels
#   of parallelism do not compete for the same cores.
grid_XGBR = GridSearchCV(
    XGBRegressor(n_jobs=1),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

# Run the grid search on the macro-only features.
grid_XGBR.fit(X_train_basic, y_train)





































































































































































































































KeyboardInterrupt: 

In [None]:
# Predict on the held-out test features with the fitted search object and
# compute the root-mean-squared error against the true targets.
y_pred = grid_XGBR.predict(X_test_basic)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the test RMSE, the best hyper-parameters found by the
# search, and the tuned estimator.
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's search objects and results from the namespace; the same
# names are assigned again by the following case.
del param_grid
del grid_XGBR
del RMSE
del y_pred

# 시언 감성 점수 Case

## 시언 감성 점수 데이터 전처리

In [None]:
# Build feature matrices that keep the '시언감성점수' column (Si-eon sentiment
# score, nouns only): drop the other two sentiment scores and the eight
# keyword columns.
X_train_si = X_train.drop(
    columns=[
        '효준감성점수', '시언서술포함감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)
X_test_si = X_test.drop(
    columns=[
        '효준감성점수', '시언서술포함감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)

In [None]:
# Build feature matrices that keep the '시언서술포함감성점수' column (Si-eon
# sentiment score including predicates): drop the other two sentiment scores
# and the eight keyword columns.
X_train_si_verb = X_train.drop(
    columns=[
        '시언감성점수', '효준감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)
X_test_si_verb = X_test.drop(
    columns=[
        '시언감성점수', '효준감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)

## (시언 감성 점수 (명사)) 이용 XGBRegressor 수립 및 평가

In [None]:
# Hyper-parameter grid for the XGBoost regressor.
# 4 * 5 * 5 * 3 * 4 = 1200 candidates, i.e. 6000 fits with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# `refit='mse'` does not select a metric under single-metric scoring (it is
# only truthy, so ranking fell back to the estimator's default R^2 score).
# Rank candidates by negated MSE explicitly and refit the best one.
# Fits are spread over all cores (n_jobs=-1) while the regressor is pinned to
# a single thread so the two levels of parallelism do not compete.
grid_XGBR = GridSearchCV(
    XGBRegressor(n_jobs=1),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

# Run the grid search on the features that include the noun-only sentiment score.
grid_XGBR.fit(X_train_si, y_train)

In [None]:
# Predict on the matching test features and compute the root-mean-squared
# error against the true targets.
y_pred = grid_XGBR.predict(X_test_si)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the test RMSE, the best hyper-parameters found by the
# search, and the tuned estimator.
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's search objects and results from the namespace; the same
# names are assigned again by the following case.
del param_grid
del grid_XGBR
del RMSE
del y_pred

## (시언 감성 점수 (서술어 포함)) 이용 XGBRegressor 수립 및 평가

In [None]:
# Hyper-parameter grid for the XGBoost regressor.
# 4 * 5 * 5 * 3 * 4 = 1200 candidates, i.e. 6000 fits with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# `refit='mse'` does not select a metric under single-metric scoring (it is
# only truthy, so ranking fell back to the estimator's default R^2 score).
# Rank candidates by negated MSE explicitly and refit the best one.
# Fits are spread over all cores (n_jobs=-1) while the regressor is pinned to
# a single thread so the two levels of parallelism do not compete.
grid_XGBR = GridSearchCV(
    XGBRegressor(n_jobs=1),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

# Run the grid search on the features that include the predicate-inclusive
# sentiment score.
grid_XGBR.fit(X_train_si_verb, y_train)

In [None]:
# Predict on the matching test features and compute the root-mean-squared
# error against the true targets.
y_pred = grid_XGBR.predict(X_test_si_verb)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the test RMSE, the best hyper-parameters found by the
# search, and the tuned estimator.
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's search objects and results from the namespace; the same
# names are assigned again by the following case.
del param_grid
del grid_XGBR
del RMSE
del y_pred

# 효준 감성 점수(명사) Case

## 효준 감성 점수 포함 성능

In [None]:
# Build feature matrices that keep the '효준감성점수' column (Hyo-jun sentiment
# score): drop the two Si-eon sentiment scores and the eight keyword columns.
X_train_hj = X_train.drop(
    columns=[
        '시언감성점수', '시언서술포함감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)
X_test_hj = X_test.drop(
    columns=[
        '시언감성점수', '시언서술포함감성점수',
        '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장',
    ]
)

In [None]:
# Hyper-parameter grid for the XGBoost regressor.
# 4 * 5 * 5 * 3 * 4 = 1200 candidates, i.e. 6000 fits with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# `refit='mse'` does not select a metric under single-metric scoring (it is
# only truthy, so ranking fell back to the estimator's default R^2 score).
# Rank candidates by negated MSE explicitly and refit the best one.
# Fits are spread over all cores (n_jobs=-1) while the regressor is pinned to
# a single thread so the two levels of parallelism do not compete.
grid_XGBR = GridSearchCV(
    XGBRegressor(n_jobs=1),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

# Run the grid search on the features that include the Hyo-jun sentiment score.
grid_XGBR.fit(X_train_hj, y_train)

In [None]:
# Predict on the matching test features and compute the root-mean-squared
# error against the true targets.
y_pred = grid_XGBR.predict(X_test_hj)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the test RMSE, the best hyper-parameters found by the
# search, and the tuned estimator.
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's search objects and results from the namespace; the same
# names are assigned again by the following case.
del param_grid
del grid_XGBR
del RMSE
del y_pred

# TF-IDF 상위 8개 단어 빈도수 추가 Case

In [None]:
# Feature matrices for the keyword-frequency case. The list of columns to drop
# is empty, so these are new frames holding every column of X_train / X_test.
# NOTE(review): all three sentiment-score columns are therefore kept as well;
# confirm that this is the intended feature set for this case.
X_train_tf_idf = X_train.drop(columns=[])
X_test_tf_idf = X_test.drop(columns=[])

In [None]:
# Hyper-parameter grid for the XGBoost regressor.
# 4 * 5 * 5 * 3 * 4 = 1200 candidates, i.e. 6000 fits with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'gamma': [0.3, 0.6, 0.9, 1.2, 1.5],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 5, 7, 9],
}

# `refit='mse'` does not select a metric under single-metric scoring (it is
# only truthy, so ranking fell back to the estimator's default R^2 score).
# Rank candidates by negated MSE explicitly and refit the best one.
# Fits are spread over all cores (n_jobs=-1) while the regressor is pinned to
# a single thread so the two levels of parallelism do not compete.
grid_XGBR = GridSearchCV(
    XGBRegressor(n_jobs=1),
    param_grid,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

# Run the grid search on the feature set that includes the keyword columns.
grid_XGBR.fit(X_train_tf_idf, y_train)

In [None]:
# Predict on the matching test features and compute the root-mean-squared
# error against the true targets.
y_pred = grid_XGBR.predict(X_test_tf_idf)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5

# Print, one per line: the test RMSE, the best hyper-parameters found by the
# search, and the tuned estimator.
print(RMSE, grid_XGBR.best_params_, grid_XGBR.best_estimator_, sep='\n')

In [None]:
# Remove this case's search objects and results from the kernel namespace.
del param_grid
del grid_XGBR
del RMSE
del y_pred