In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

In [2]:
import json
with open("df_1hour_Feb.json", 'r', encoding='UTF-8') as f:
    F_data = json.load(f)

In [3]:
with open("df_1hour_Mar.json", 'r', encoding='UTF-8') as f:
    M_data = json.load(f)

In [4]:
COLUMNS = ['name', 'code', 'time', 'price', 'time_1', 'price_1', 'price_dif_1', 'sell_1', 'buy_1', 'volume_1', 'variation_1', 'post_num_1', 'unique_id_1', 'click_1', 'like_1', 'dislike_1', 'time_2', 'price_2', 'price_dif_2', 'sell_2', 'buy_2', 'volume_2', 'variation_2', 'post_num_2', 'unique_id_2', 'click_2', 'like_2', 'dislike_2', 'time_3', 'price_3', 'price_dif_3', 'sell_3', 'buy_3', 'volume_3', 'variation_3', 'post_num_3', 'unique_id_3', 'click_3', 'like_3', 'dislike_3', 'mkt_cap', 'kospi', 'kosdaq', 'trash', 'yesterday_closing_price', 'is_maximum', 'is_minimum', 'price_volatility', 'price_trend', 'average_price_volatility', 'sell_minus_buy_1', 'sell_minus_buy_2', 'sell_minus_buy_3', 'is_price_gap_stable', 'price_gap_volatility', 'is_like_higher', 'volume_trend', 'post_num_trend', 'unique_id_trend', 'click_trend', 'price_increase', 'did_price_increase', 'did_price_033', 'did_price_100', 'did_price_150', 'kospi_ind', 'kosdaq_ind', 'time_slot', 'ko_inter', 'early_mor', 'morning', 'lunch', 'afternoon', 'late', 'mkt_change', 'alpha', 'per_now', 'kospi_1', 'kospi_2', 'kospi_3', 'kospi_answer', 'kosdaq_1', 'kosdaq_2', 'kosdaq_3', 'kosdaq_answer', 'kospi_trend', 'kosdaq_trend', 'kospi_increase', 'kosdaq_increase', 'market_increase', 'did_opening_price_increase', 'price_1_sq', 'price_dif_1_sq', 'sell_1_sq', 'buy_1_sq', 'volume_1_sq', 'variation_1_sq', 'post_num_1_sq', 'unique_id_1_sq', 'click_1_sq', 'like_1_sq', 'dislike_1_sq', 'price_2_sq', 'price_dif_2_sq', 'sell_2_sq', 'buy_2_sq', 'volume_2_sq', 'variation_2_sq', 'post_num_2_sq', 'unique_id_2_sq', 'click_2_sq', 'like_2_sq', 'dislike_2_sq', 'price_3_sq', 'price_dif_3_sq', 'sell_3_sq', 'buy_3_sq', 'volume_3_sq', 'variation_3_sq', 'post_num_3_sq', 'unique_id_3_sq', 'click_3_sq', 'like_3_sq', 'dislike_3_sq', 'mkt_cap_sq', 'yesterday_closing_price_sq', 'price_volatility_sq', 'price_trend_sq', 'average_price_volatility_sq', 'sell_minus_buy_1_sq', 'sell_minus_buy_2_sq', 'sell_minus_buy_3_sq', 'price_gap_volatility_sq', 'volume_trend_sq', 'post_num_trend_sq', 'unique_id_trend_sq', 'click_trend_sq', 'kospi_ind_sq', 'kosdaq_ind_sq', 'time_slot_sq', 'ko_inter_sq', 'mkt_change_sq', 'alpha_sq', 'per_now_sq', 'kospi_1_sq', 'kospi_2_sq', 'kospi_3_sq', 'kosdaq_1_sq', 'kosdaq_2_sq', 'kosdaq_3_sq', 'kospi_trend_sq', 'kosdaq_trend_sq']

In [5]:
df_F = pd.DataFrame(F_data, columns = COLUMNS)
df_M = pd.DataFrame(M_data, columns = COLUMNS)
df = pd.concat([df_F, df_M])

In [6]:
df = df.dropna(axis=0, how='any')

In [7]:
time_filter = (df['time'].str.startswith("2018-02-21")) | \
              (df['time'].str.startswith("2018-02-20")) | \
              (df['time'].str.startswith("2018-02-14")) 
        
train_df = df[time_filter].reset_index(drop = True)
test_df = df[~time_filter].reset_index(drop = True)

In [8]:
X_COL = ['price_1','price_dif_1','sell_1','buy_1','volume_1','variation_1','post_num_1','unique_id_1','click_1',
     'like_1','dislike_1','price_2','price_dif_2','sell_2','buy_2','volume_2','variation_2','post_num_2','unique_id_2',
     'click_2','like_2','dislike_2','price_3','price_dif_3','sell_3','buy_3','volume_3','variation_3','post_num_3',
     'unique_id_3','click_3','like_3','dislike_3','mkt_cap','kospi','kosdaq','trash','yesterday_closing_price',
     'is_maximum','is_minimum','price_volatility','price_trend','average_price_volatility','sell_minus_buy_1',
     'sell_minus_buy_2','sell_minus_buy_3','is_price_gap_stable','price_gap_volatility','is_like_higher',
     'volume_trend','post_num_trend','unique_id_trend','click_trend','kospi_ind','kosdaq_ind','time_slot',
     'ko_inter','early_mor','morning','lunch','afternoon','late','mkt_change','alpha','per_now','kospi_1','kospi_2',
     'kospi_3','kosdaq_1','kosdaq_2','kosdaq_3','kospi_trend','kosdaq_trend','did_opening_price_increase',
     'price_1_sq','price_dif_1_sq','sell_1_sq','buy_1_sq','volume_1_sq','variation_1_sq','post_num_1_sq',
     'unique_id_1_sq','click_1_sq','like_1_sq','dislike_1_sq','price_2_sq','price_dif_2_sq','sell_2_sq',
     'buy_2_sq','volume_2_sq','variation_2_sq','post_num_2_sq','unique_id_2_sq','click_2_sq','like_2_sq',
     'dislike_2_sq','price_3_sq','price_dif_3_sq','sell_3_sq','buy_3_sq','volume_3_sq','variation_3_sq',
     'post_num_3_sq','unique_id_3_sq','click_3_sq','like_3_sq','dislike_3_sq','mkt_cap_sq',
     'yesterday_closing_price_sq','price_volatility_sq','price_trend_sq','average_price_volatility_sq',
     'sell_minus_buy_1_sq','sell_minus_buy_2_sq','sell_minus_buy_3_sq','price_gap_volatility_sq',
     'volume_trend_sq','post_num_trend_sq','unique_id_trend_sq','click_trend_sq','kospi_ind_sq','kosdaq_ind_sq',
     'time_slot_sq','ko_inter_sq','mkt_change_sq','alpha_sq','per_now_sq','kospi_1_sq','kospi_2_sq','kospi_3_sq',
     'kosdaq_1_sq','kosdaq_2_sq','kosdaq_3_sq','kospi_trend_sq','kosdaq_trend_sq']

In [9]:
X = df[X_COL]
y = df['did_price_033']
X_train = train_df[X_COL]
X_test = test_df[X_COL]
y_train = train_df['did_price_033']
y_test = test_df['did_price_033']
y_test_in = test_df['price_increase']

In [10]:
X_train.shape

(5425, 135)

In [51]:
svm = SVC(C = 100, kernel = 'rbf', gamma = 1.0)
svm.fit(X_train, y_train_0)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.0, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
tuned_parameters = [{'C': [0.01, 0.1, 1, 10, 100, 1000],
                     'gamma': [0.5, 1, 2, 3, 4]}]
clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=10, scoring='accuracy', return_train_score=True)
clf.fit(X_train, y_train)
clf.cv_results_

{'mean_fit_time': array([ 12.63494341,  11.21201019,  11.5352387 ,  11.25336318,
         12.09857261,  12.01078858,  10.9139482 ,   9.8016854 ,
          8.41482921,   7.29229503,  10.32666109,  11.41002526,
         11.53939383,  11.68704681,  11.45703936,  11.79894192,
         12.09828672,  12.21193292,  10.64307663,  10.3108815 ,
          8.47382619,   8.28878191,   8.32804601,   7.87786095,
          7.49756436,   7.81989734,   9.1535228 ,   9.05624704,
          8.82217062,   8.59706254]),
 'mean_score_time': array([ 1.20875275,  1.08817232,  1.17448015,  1.18189058,  1.16652429,
         1.13390362,  1.02213709,  0.96969357,  0.88124714,  0.77683108,
         0.93162656,  1.05735149,  1.03549211,  1.0677599 ,  1.01947803,
         1.0181334 ,  1.05570107,  1.03328862,  0.9190598 ,  0.92063925,
         0.76397362,  0.76205549,  0.74509435,  0.70572605,  0.6692544 ,
         0.69158442,  0.84873357,  0.82814963,  0.74236898,  0.75882981]),
 'mean_test_score': array([ 0.70672811

In [12]:
clf.best_params_

{'C': 0.01, 'gamma': 0.5}

In [13]:
svm = SVC(C = 0.01, kernel = 'rbf', gamma = 0.5)
svm.fit(X_train, y_train)

SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
confusion_matrix(y_test, clf.best_estimator_.predict(X_test))

array([[6401,    0],
       [1770,    0]], dtype=int64)

In [21]:
# 15% of test observations misclassified
clf.best_estimator_.score(X_test, y_test)

0.78338024721576305

In [22]:
y_pred = svm.predict(X_test)    

In [23]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.78      1.00      0.88      6401
          1       0.00      0.00      0.00      1770

avg / total       0.61      0.78      0.69      8171



  'precision', 'predicted', average, warn_for)


In [24]:
y_port = y_test[y_pred == 1]

In [25]:
y_port.shape

(0,)

In [40]:
y_pred_0 == 1

array([False, False, False, ..., False, False, False], dtype=bool)

In [26]:
svm_rbf = svm.SVC(C = 20, gamma = 0.1)
svm_rbf.fit(X_train, y_train_0)
y_pred_0 = svm_rbf.predict(X_test)    