In [91]:
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier

In [2]:
import json

In [64]:
with open("the_last_final_data_10min.json", 'r', encoding='UTF-8') as f:
    data = json.load(f)

In [65]:
NEW_COL = ['name', 'code', 'time', 'price', 'time_1', 'price_1', 'price_dif_1',
       'sell_1', 'buy_1', 'volume_1', 'variation_1', 'post_num_1',
       'unique_id_1', 'click_1', 'like_1', 'dislike_1', 'time_2', 'price_2',
       'price_dif_2', 'sell_2', 'buy_2', 'volume_2', 'variation_2',
       'post_num_2', 'unique_id_2', 'click_2', 'like_2', 'dislike_2', 'time_3',
       'price_3', 'price_dif_3', 'sell_3', 'buy_3', 'volume_3', 'variation_3',
       'post_num_3', 'unique_id_3', 'click_3', 'like_3', 'dislike_3',
       'mkt_cap', 'kospi', 'kosdaq', 'trash', 'yesterday_closing_price',
       'is_maximum', 'is_minimum', 'price_volatility', 'price_trend',
       'average_price_volatility', 'sell_minus_buy_1', 'sell_minus_buy_2',
       'sell_minus_buy_3', 'is_price_gap_stable', 'price_gap_volatility',
       'is_like_higher', 'volume_trend', 'post_num_trend', 'unique_id_trend',
       'click_trend', 'price_increase', 'did_price_increase', 'did_price_033',
       'did_price_100', 'did_price_150', 'kospi_ind', 'kosdaq_ind',
       'time_slot', 'ko_inter', 'early_mor', 'morning', 'lunch', 'afternoon',
       'late', 'mkt_change', 'alpha', 'per_now', 'did_opening_price_increase']

In [66]:
df = pd.DataFrame(data, columns = NEW_COL)
df = df.dropna(axis=0, how='any')
df = df[df.did_opening_price_increase == 1]

In [67]:
time_filter = (df['time'].str.startswith("2018-02-21")) | \
              (df['time'].str.startswith("2018-02-22")) | \
              (df['time'].str.startswith("2018-02-23")) | \
              (df['time'].str.startswith("2018-02-26")) 
        
train_df = df[time_filter].reset_index(drop = True)
test_df = df[~time_filter].reset_index(drop = True)

In [68]:
COL_DROP = ['name', 'code', 'time', 'price', 'time_1', 'time_2', 'time_3',
            'did_price_increase', 'did_price_033', 'did_price_100', 'did_price_150', 
            'price_increase', 'did_opening_price_increase']
X = df.drop(COL_DROP, axis = 1)
y_inc = df['price_increase']
y_0 = df['did_price_increase']
y_33 = df['did_price_033']
y_100 = df['did_price_100']
y_150 = df['did_price_150']
X_train = train_df.drop(COL_DROP, axis = 1)
X_test = test_df.drop(COL_DROP, axis = 1)
y_train_in = train_df['price_increase']
y_train_0 = train_df['did_price_increase']
y_train_33 = train_df['did_price_033']
y_train_100 = train_df['did_price_100']
y_train_150 = train_df['did_price_150']
y_test_in = test_df['price_increase']
y_test_0 = test_df['did_price_increase']
y_test_33 = test_df['did_price_033']
y_test_100 = test_df['did_price_100']
y_test_150 = test_df['did_price_150']

Random Forest

In [69]:
tree_random = RandomForestClassifier(n_estimators = 100, max_features = 15, \
                                     bootstrap = True, oob_score = True, 
                                     random_state = 25)
tree_random.fit(X_train, y_train_0)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=15, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=25, verbose=0, warm_start=False)

In [70]:
y_pred = tree_random.predict(X_test)

In [71]:
y_port = y_test_in[y_pred == 1]

In [72]:
y_test_in.mean()

-0.01650359124664549

In [73]:
y_port.mean()

0.03668912622635979

In [75]:
print(classification_report(y_test_0, y_pred))

             precision    recall  f1-score   support

          0       0.68      0.85      0.76      1884
          1       0.43      0.22      0.29       948

avg / total       0.60      0.64      0.60      2832



In [38]:
X.shape

(11698, 65)

In [22]:
def get_best_feature(num):
    rv = 0
    f_num = 1
    for i in range(1, num):
        tree_random = RandomForestClassifier(n_estimators = 100, \
                      max_features = i, bootstrap = True,\
                      oob_score = True, random_state = 0)
        tree_random.fit(X_train, y_train_0)
        y_pred = tree_random.predict(X_test)
        y_port = y_test_in[y_pred == 1]
        avg = y_port.mean()
        if y_port.mean() > rv:
            f_num = i
            rv = avg
        
    return f_num

In [23]:
get_best_feature(66)

1
2
2
2
2
2
2
2
2
2
2
2
2
2
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15


15

Bagging

In [94]:
bag_tree = BaggingClassifier(n_estimators = 100, max_features = 15, bootstrap = True,\
                             oob_score = True, random_state = 0)
bag_tree.fit(X_train, y_train_0)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=15, max_samples=1.0,
         n_estimators=100, n_jobs=1, oob_score=True, random_state=0,
         verbose=0, warm_start=False)

In [95]:
y_pred = regr.predict(X_test)
y_port = y_test_in[y_pred == 1]
y_port.mean()

0.08222259882460935

In [87]:
regr = GradientBoostingClassifier(n_estimators = 500, learning_rate = 0.01, random_state=1)
regr.fit(X_train, y_train_0)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [88]:
y_pred = regr.predict(X_test)
y_port = y_test_in[y_pred == 1]

In [89]:
y_port.mean()

0.08222259882460935