In [1]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 데이터 불러오기

In [2]:
train = pd.read_csv('0507_trainset_for_shuffle.csv')
test = pd.read_csv('0507_testset_for_shuffle.csv')

In [5]:
train.set_index('Date',inplace= True)
test.set_index('Date',inplace = True)

# 전처리 및 Train/Test Set 분할

In [7]:
X_train = train.copy()
y_train = X_train.pop('next_spi')

X_test = test.copy()
y_test = X_test.pop('next_spi')

In [8]:
X_train.columns

Index(['시언감성점수', 'M2', 'comp basemoney rate', 'base rate',
       'consumer price index', 'kospi close', 'ex-rate close', '효준감성점수',
       '시언서술포함감성점수', '주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],
      dtype='object')

tf_idf_words = ['주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장']

# 거시경제변수만 포함된 Case

In [13]:
# 거시경제변수만 포함된 X 만들기
X_train_basic = X_train.drop(['시언감성점수','효준감성점수','시언서술포함감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'], axis = 1)
X_test_basic = X_test.drop(['시언감성점수','효준감성점수','시언서술포함감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'], axis = 1)

In [14]:
# SVM 인풋 데이터 스케일링
sc = StandardScaler()
X_train_basic_svm = sc.fit_transform(X_train_basic)
X_test_basic_svm = sc.transform(X_test_basic)

In [17]:
from sklearn.model_selection import GridSearchCV

# defining parameter range
param_grid_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']} 
  
grid_svm = GridSearchCV(SVR(), param_grid_svm, refit = 'mse', verbose = 0, cv = 5)

# fitting the model for grid search
grid_svm.fit(X_train_basic_svm, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             refit='mse')

In [18]:
y_pred = grid_svm.predict(X_test_basic_svm)
RMSE = mean_squared_error(y_test, y_pred)**0.5

# RMSE 출력
print(RMSE)

# print best parameter after tuning
print(grid_svm.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_svm.best_estimator_)

1.002343061216197
{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
SVR(C=1000, gamma=0.01)


In [23]:
del param_grid_svm, grid_svm, RMSE, y_pred

# 시언 감성 점수 Case

## 시언 감성 점수 데이터 전처리

In [24]:
# 시언 감성 점수 (명사만 이용) 포함 X 만들기
X_train_si = X_train.drop(['효준감성점수','시언서술포함감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],axis = 1)
X_test_si = X_test.drop(['효준감성점수','시언서술포함감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],axis = 1)

In [25]:
X_train_si_verb = X_train.drop(['시언감성점수','효준감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],axis = 1)
X_test_si_verb = X_test.drop(['시언감성점수','효준감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],axis = 1)

In [28]:
# SVM용 인풋 데이터 스케일링
sc = StandardScaler()

# 시언 감성 점수 (명사만 이용) => 표준화 
X_train_si_svm = sc.fit_transform(X_train_si)
X_test_si_svm = sc.transform(X_test_si)

# 시언 감성 점수 (서술어 포함) => 표준화
X_train_si_verb_svm = sc.fit_transform(X_train_si_verb)
X_test_si_verb_svm = sc.transform(X_test_si_verb)

## (시언 감성 점수 (명사)) 이용 SVR 수립 및 평가

In [29]:
# defining parameter range
param_grid_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']} 
  
grid_svm = GridSearchCV(SVR(), param_grid_svm, refit = 'mse', verbose = 0, cv = 5)

# fitting the model for grid search
grid_svm.fit(X_train_si_svm, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             refit='mse')

In [30]:
y_pred = grid_svm.predict(X_test_si_svm)
RMSE = mean_squared_error(y_test, y_pred)**0.5

# RMSE 출력
print(RMSE)

# print best parameter after tuning
print(grid_svm.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_svm.best_estimator_)

0.9512734164525404
{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
SVR(C=1000, gamma=0.01)


In [31]:
del param_grid_svm, grid_svm, RMSE, y_pred

## (시언 감성 점수 (서술어 포함)) 이용 SVR 수립 및 평가

In [32]:
# defining parameter range
param_grid_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']} 
  
grid_svm = GridSearchCV(SVR(), param_grid_svm, refit = 'mse', verbose = 0, cv = 5)

# fitting the model for grid search
grid_svm.fit(X_train_si_verb_svm, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             refit='mse')

In [33]:
y_pred = grid_svm.predict(X_test_si_svm)
RMSE = mean_squared_error(y_test, y_pred)**0.5

# RMSE 출력
print(RMSE)

# print best parameter after tuning
print(grid_svm.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_svm.best_estimator_)

7.222971492746435
{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
SVR(C=1000, gamma=0.01)


In [34]:
del param_grid_svm, grid_svm, RMSE, y_pred

# 효준 감성 점수(명사) Case

## 효준 감성 점수 포함 성능

In [35]:
X_train_hj = X_train.drop(['시언감성점수','시언서술포함감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],axis = 1)
X_test_hj = X_test.drop(['시언감성점수','시언서술포함감성점수','주택', '아파트', '분양', '서울', '가구', '부동산', '단지', '시장'],axis = 1)

In [36]:
sc = StandardScaler()
X_train_hj = sc.fit_transform(X_train_hj)
X_test_hj = sc.transform(X_test_hj)

In [37]:
# defining parameter range
param_grid_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']} 
  
grid_svm = GridSearchCV(SVR(), param_grid_svm, refit = 'mse', verbose = 0, cv = 5)

# fitting the model for grid search
grid_svm.fit(X_train_hj, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             refit='mse')

In [38]:
y_pred = grid_svm.predict(X_test_hj)
RMSE = mean_squared_error(y_test, y_pred)**0.5

# RMSE 출력
print(RMSE)

# print best parameter after tuning
print(grid_svm.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_svm.best_estimator_)

1.013931719238197
{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
SVR(C=1000, gamma=0.01)


In [39]:
del param_grid_svm, grid_svm, RMSE, y_pred

# TF-IDF 상위 8개 단어 빈도수 추가 Case

In [40]:
X_train_tf_idf = X_train.drop(columns = [], axis = 1)
X_test_tf_idf = X_test.drop(columns = [], axis = 1)

In [41]:
sc = StandardScaler()
X_train_tf_idf = sc.fit_transform(X_train_tf_idf)
X_test_tf_idf = sc.transform(X_test_tf_idf)

In [42]:
# defining parameter range
param_grid_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']} 
  
grid_svm = GridSearchCV(SVR(), param_grid_svm, refit = 'mse', verbose = 0, cv = 5)

# fitting the model for grid search
grid_svm.fit(X_train_tf_idf, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             refit='mse')

In [43]:
y_pred = grid_svm.predict(X_test_tf_idf)
RMSE = mean_squared_error(y_test, y_pred)**0.5

# RMSE 출력
print(RMSE)

# print best parameter after tuning
print(grid_svm.best_params_)
  
# print how our model looks after hyper-parameter tuning
print(grid_svm.best_estimator_)

0.9135862161413001
{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
SVR(C=1000, gamma=0.01)


In [44]:
del param_grid_svm, grid_svm, RMSE, y_pred