In [16]:
import pandas as pd
import numpy as np

from sklearn.linear_model import RidgeClassifier, LogisticRegression, Lasso
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.preprocessing import StandardScaler, scale
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn-white')

In [3]:
import json

In [4]:
with open("the_last_final_data_10min.json", 'r', encoding='UTF-8') as f:
    data = json.load(f)

In [5]:
NEW_COL = ['name', 'code', 'time', 'price', 'time_1', 'price_1', 'price_dif_1',
       'sell_1', 'buy_1', 'volume_1', 'variation_1', 'post_num_1',
       'unique_id_1', 'click_1', 'like_1', 'dislike_1', 'time_2', 'price_2',
       'price_dif_2', 'sell_2', 'buy_2', 'volume_2', 'variation_2',
       'post_num_2', 'unique_id_2', 'click_2', 'like_2', 'dislike_2', 'time_3',
       'price_3', 'price_dif_3', 'sell_3', 'buy_3', 'volume_3', 'variation_3',
       'post_num_3', 'unique_id_3', 'click_3', 'like_3', 'dislike_3',
       'mkt_cap', 'kospi', 'kosdaq', 'trash', 'yesterday_closing_price',
       'is_maximum', 'is_minimum', 'price_volatility', 'price_trend',
       'average_price_volatility', 'sell_minus_buy_1', 'sell_minus_buy_2',
       'sell_minus_buy_3', 'is_price_gap_stable', 'price_gap_volatility',
       'is_like_higher', 'volume_trend', 'post_num_trend', 'unique_id_trend',
       'click_trend', 'price_increase', 'did_price_increase', 'did_price_033',
       'did_price_100', 'did_price_150', 'kospi_ind', 'kosdaq_ind',
       'time_slot', 'ko_inter', 'early_mor', 'morning', 'lunch', 'afternoon',
       'late', 'mkt_change', 'alpha', 'per_now', 'did_opening_price_increase']

In [6]:
df = pd.DataFrame(data, columns = NEW_COL)
df = df.dropna(axis=0, how='any')

In [7]:
df = df[df.did_opening_price_increase == 1]

In [8]:
time_filter = (df['time'].str.startswith("2018-02-21")) | \
              (df['time'].str.startswith("2018-02-22")) | \
              (df['time'].str.startswith("2018-02-23")) | \
              (df['time'].str.startswith("2018-02-26")) 
        
train_df = df[time_filter].reset_index(drop = True)
test_df = df[~time_filter].reset_index(drop = True)

In [9]:
COL_DROP_1 = ['name', 'code', 'time', 'price', 'time_1', 'price_1', 'price_dif_1',
       'sell_1', 'buy_1', 'volume_1', 'variation_1', 'post_num_1',
       'unique_id_1', 'click_1', 'like_1', 'dislike_1', 'time_2', 'price_2',
       'price_dif_2', 'sell_2', 'buy_2', 'volume_2', 'variation_2',
       'post_num_2', 'unique_id_2', 'click_2', 'like_2', 'dislike_2', 'time_3',
       'price_3', 'price_dif_3', 'sell_3', 'buy_3', 'volume_3', 'variation_3',
       'post_num_3', 'unique_id_3', 'click_3', 'like_3', 'dislike_3', \
        'did_price_increase', 'price', 'did_price_033', 'did_price_100', 'did_price_150', 
            'price_increase', 'did_opening_price_increase']

COL_DROP_2 = ['name', 'code', 'time', 'price', 'time_1', 'time_2', 'time_3',
            'did_price_increase', 'did_price_033', 'did_price_100', 'did_price_150', 
            'price_increase', 'did_opening_price_increase']

X = df.drop(COL_DROP_2, axis = 1)
y_inc = df['price_increase']
y_0 = df['did_price_increase']
y_33 = df['did_price_033']
y_100 = df['did_price_100']
y_150 = df['did_price_150']
X_train = train_df.drop(COL_DROP_2, axis = 1)
X_test = test_df.drop(COL_DROP_2, axis = 1)
y_train_in = train_df['price_increase']
y_train_0 = train_df['did_price_increase']
y_train_33 = train_df['did_price_033']
y_train_100 = train_df['did_price_100']
y_train_150 = train_df['did_price_150']
y_test_in = test_df['price_increase']
y_test_0 = test_df['did_price_increase']
y_test_33 = test_df['did_price_033']
y_test_100 = test_df['did_price_100']
y_test_150 = test_df['did_price_150']

In [167]:
var_list = ['variation_1', 'price_1', 'yesterday_closing_price', 'price_dif_1', 'price_trend', 'sell_1', 'per_now']

In [168]:

X_sub = X_train[var_list]
X_sub_test = X_test[var_list]

In [169]:
clf = LogisticRegression(solver='newton-cg')
clf.fit(X_sub, y_train_0)
prob = clf.predict(X_sub_test)



In [170]:
y_port = y_test_in[prob == 1]
y_port.mean()

0.3483325235428571

In [171]:
y_port

163     0.453309
338     3.317536
398     0.480769
623    -0.179695
715     0.090009
991     0.000000
1083    0.000000
1258   -2.836879
1292    0.791557
1350    0.938478
1442    0.103306
1451    0.000000
1534    0.928793
1568    0.789474
Name: price_increase, dtype: float64

In [None]:
def find_best_mod(mse_min, var_list):
    
    best_var = None
    for var in COLUMNS:
        if var not in var_list:
            var_list.append(var)
            X_train_sub = X_train[var_list]
            clf.fit(X_train_sub, y_train_0)
            y_pred = clf.predict(X_test[var_list])
            y_port = y_test_in[y_pred == 1]
            size = y_port.shape[0]
            if size >= 10:
                mse = mean_squared_error(y_test_0, y_pred)
                mse_min = min(mse, mse_min)
                if mse_min == mse:
                    best_var = var
            var_list = var_list[:-1]
    
    return best_var, mse_min 

def get_best_list(var_list):
    
    num = 0
    mse_min=1
    while num < 65:
        best_var, mse_min_new = find_best_mod(mse_min, var_list)
        if best_var != None:
            if mse_min_new <= mse_min:
                mse_min = mse_min_new
                var_list.append(best_var)
                print(var_list, num, mse_min)
            else:
                var_list
        else:
            return var_list
        num += 1

    return var_list

In [172]:
COLUMNS = X.columns

In [None]:
get_best_list(['variation_1'])

In [None]:
['variation_1', 'price_1', 'yesterday_closing_price'] 
['variation_1', 'price_1', 'yesterday_closing_price', 'price_dif_1', 'price_trend']
['variation_1', 'price_1', 'yesterday_closing_price', 'price_dif_1', 'price_trend', 'sell_1', 'per_now']
['variation_1', 'price_1', 'yesterday_closing_price', 'price_dif_1', 'price_trend', 'sell_1', 'per_now', 'buy_1', 'post_num_trend']
['variation_1',
 'price_1',
 'yesterday_closing_price',
 'price_dif_1',
 'price_trend',
 'sell_1',
 'per_now',
 'buy_1',
 'post_num_trend',
 'volume_1']