In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, precision_score, mean_squared_error

In [2]:
import json
with open("df_1hour_Feb.json", 'r', encoding='UTF-8') as f:
    F_data = json.load(f)

In [3]:
with open("df_1hour_Mar_05.json", 'r', encoding='UTF-8') as f:
    M_data = json.load(f)

In [4]:
COLUMNS = ['name', 'code', 'time', 'price', 'time_1', 'price_1', 'price_dif_1', 'sell_1', 'buy_1', 'volume_1', 'variation_1', 'post_num_1', 'unique_id_1', 'click_1', 'like_1', 'dislike_1', 'time_2', 'price_2', 'price_dif_2', 'sell_2', 'buy_2', 'volume_2', 'variation_2', 'post_num_2', 'unique_id_2', 'click_2', 'like_2', 'dislike_2', 'time_3', 'price_3', 'price_dif_3', 'sell_3', 'buy_3', 'volume_3', 'variation_3', 'post_num_3', 'unique_id_3', 'click_3', 'like_3', 'dislike_3', 'mkt_cap', 'kospi', 'kosdaq', 'trash', 'yesterday_closing_price', 'is_maximum', 'is_minimum', 'price_volatility', 'price_trend', 'average_price_volatility', 'sell_minus_buy_1', 'sell_minus_buy_2', 'sell_minus_buy_3', 'is_price_gap_stable', 'price_gap_volatility', 'is_like_higher', 'volume_trend', 'post_num_trend', 'unique_id_trend', 'click_trend', 'price_increase', 'did_price_increase', 'did_price_033', 'did_price_100', 'did_price_150', 'kospi_ind', 'kosdaq_ind', 'time_slot', 'ko_inter', 'early_mor', 'morning', 'lunch', 'afternoon', 'late', 'mkt_change', 'alpha', 'per_now', 'kospi_1', 'kospi_2', 'kospi_3', 'kospi_answer', 'kosdaq_1', 'kosdaq_2', 'kosdaq_3', 'kosdaq_answer', 'kospi_trend', 'kosdaq_trend', 'kospi_increase', 'kosdaq_increase', 'market_increase', 'did_opening_price_increase', 'price_1_sq', 'price_dif_1_sq', 'sell_1_sq', 'buy_1_sq', 'volume_1_sq', 'variation_1_sq', 'post_num_1_sq', 'unique_id_1_sq', 'click_1_sq', 'like_1_sq', 'dislike_1_sq', 'price_2_sq', 'price_dif_2_sq', 'sell_2_sq', 'buy_2_sq', 'volume_2_sq', 'variation_2_sq', 'post_num_2_sq', 'unique_id_2_sq', 'click_2_sq', 'like_2_sq', 'dislike_2_sq', 'price_3_sq', 'price_dif_3_sq', 'sell_3_sq', 'buy_3_sq', 'volume_3_sq', 'variation_3_sq', 'post_num_3_sq', 'unique_id_3_sq', 'click_3_sq', 'like_3_sq', 'dislike_3_sq', 'mkt_cap_sq', 'yesterday_closing_price_sq', 'price_volatility_sq', 'price_trend_sq', 'average_price_volatility_sq', 'sell_minus_buy_1_sq', 'sell_minus_buy_2_sq', 'sell_minus_buy_3_sq', 'price_gap_volatility_sq', 'volume_trend_sq', 'post_num_trend_sq', 'unique_id_trend_sq', 'click_trend_sq', 'kospi_ind_sq', 'kosdaq_ind_sq', 'time_slot_sq', 'ko_inter_sq', 'mkt_change_sq', 'alpha_sq', 'per_now_sq', 'kospi_1_sq', 'kospi_2_sq', 'kospi_3_sq', 'kosdaq_1_sq', 'kosdaq_2_sq', 'kosdaq_3_sq', 'kospi_trend_sq', 'kosdaq_trend_sq']

In [5]:
df_F = pd.DataFrame(F_data, columns = COLUMNS)
df_M = pd.DataFrame(M_data, columns = COLUMNS)
df = pd.concat([df_F, df_M])

In [6]:
df = df.dropna(axis=0, how='any')

In [7]:
time_filter_train = (df['time'].str.startswith("2018-02-21")) | \
                    (df['time'].str.startswith("2018-02-20")) | \
                    (df['time'].str.startswith("2018-02-14")) | \
                    (df['time'].str.startswith("2018-02-22")) | \
                    (df['time'].str.startswith("2018-02-23")) | \
                    (df['time'].str.startswith("2018-02-26")) 

time_filter_test = (df['time'].str.startswith("2018-02-28")) | \
                   (df['time'].str.startswith("2018-03-02")) | \
                   (df['time'].str.startswith("2018-03-05")) | \
                   (df['time'].str.startswith("2018-02-27"))
        
train_df = df[time_filter_train].reset_index(drop = True)
test_df = df[time_filter_test].reset_index(drop = True)

In [8]:
X_COL = ['price_1','price_dif_1','sell_1','buy_1','volume_1','variation_1','post_num_1','unique_id_1','click_1',
     'like_1','dislike_1','price_2','price_dif_2','sell_2','buy_2','volume_2','variation_2','post_num_2','unique_id_2',
     'click_2','like_2','dislike_2','price_3','price_dif_3','sell_3','buy_3','volume_3','variation_3','post_num_3',
     'unique_id_3','click_3','like_3','dislike_3','mkt_cap','kospi','kosdaq','trash','yesterday_closing_price',
     'is_maximum','is_minimum','price_volatility','price_trend','average_price_volatility','sell_minus_buy_1',
     'sell_minus_buy_2','sell_minus_buy_3','is_price_gap_stable','price_gap_volatility','is_like_higher',
     'volume_trend','post_num_trend','unique_id_trend','click_trend','kospi_ind','kosdaq_ind','time_slot',
     'ko_inter','early_mor','morning','lunch','afternoon','late','mkt_change','alpha','per_now','kospi_1','kospi_2',
     'kospi_3','kosdaq_1','kosdaq_2','kosdaq_3','kospi_trend','kosdaq_trend','did_opening_price_increase',
     'price_1_sq','price_dif_1_sq','sell_1_sq','buy_1_sq','volume_1_sq','variation_1_sq','post_num_1_sq',
     'unique_id_1_sq','click_1_sq','like_1_sq','dislike_1_sq','price_2_sq','price_dif_2_sq','sell_2_sq',
     'buy_2_sq','volume_2_sq','variation_2_sq','post_num_2_sq','unique_id_2_sq','click_2_sq','like_2_sq',
     'dislike_2_sq','price_3_sq','price_dif_3_sq','sell_3_sq','buy_3_sq','volume_3_sq','variation_3_sq',
     'post_num_3_sq','unique_id_3_sq','click_3_sq','like_3_sq','dislike_3_sq','mkt_cap_sq',
     'yesterday_closing_price_sq','price_volatility_sq','price_trend_sq','average_price_volatility_sq',
     'sell_minus_buy_1_sq','sell_minus_buy_2_sq','sell_minus_buy_3_sq','price_gap_volatility_sq',
     'volume_trend_sq','post_num_trend_sq','unique_id_trend_sq','click_trend_sq','kospi_ind_sq','kosdaq_ind_sq',
     'time_slot_sq','ko_inter_sq','mkt_change_sq','alpha_sq','per_now_sq','kospi_1_sq','kospi_2_sq','kospi_3_sq',
     'kosdaq_1_sq','kosdaq_2_sq','kosdaq_3_sq','kospi_trend_sq','kosdaq_trend_sq']

In [9]:
X = df[X_COL]
y = df['did_price_033']
X_train = train_df[X_COL]
X_test = test_df[X_COL]
y_train = train_df['did_price_033']
y_test = test_df['did_price_033']
y_test_in = test_df['price_increase']

In [10]:
X_train.shape

(11629, 135)

In [11]:
def find_first_mod():
    rv = 1
    for var in X_COL:
        X_train_sub = X_train[var]
        clf = LogisticRegression(solver='newton-cg')            
        clf.fit(X_train_sub.reshape(-1, 1), y_train)
        y_pred = clf.predict(X_test[var].reshape(-1, 1))
        y_port = y_test_in[y_pred == 1] 
        error = (y_test != y_pred).astype(int)
        avg = error[y_pred == 1].mean()
        size = y_port.shape[0]
        if (avg < rv) and (size >= 10):
            max_rv = avg
            var_best = var
            
    return var_best

In [17]:
VAR_1 = find_first_mod()
VAR_1

  
  import sys


'kospi_trend_sq'

In [16]:
def find_best_mod(mse_min, var_list):
    
    best_var = None
    for var in X_COL:
        if var not in var_list:
            var_list.append(var)
            X_train_sub = X_train[var_list]
            clf = LogisticRegression(solver='newton-cg')            
            clf.fit(X_train_sub, y_train)
            prob = clf.predict(X_test[var_list])
            y_port = y_test_in[prob == 1]
            size = y_port.shape[0]
            if size >= 10:
                error = (y_test != prob).astype(int)
                avg = error[prob == 1].mean()
                if mse_min > avg:
                    best_var = var
                    mse_min = avg
            var_list = var_list[:-1]
    
    return best_var, mse_min 

def get_best_list(var_list):
    
    num = 0
    mse_min=1
    while num < 136:
        best_var, mse_min_new = find_best_mod(mse_min, var_list)
        if best_var != None:
            if mse_min_new <= mse_min:
                mse_min = mse_min_new
                var_list.append(best_var)
                print(var_list, num, mse_min)
            else:
                var_list
        else:
            return var_list
        num += 1

    return var_list

In [18]:
get_best_list(['kospi_trend_sq'])



['kospi_trend_sq', 'price_1', 'average_price_volatility'] 0 0.71875
['kospi_trend_sq', 'price_1', 'average_price_volatility', 'price_dif_1', 'price_dif_3_sq'] 1 0.7127659574468085


['kospi_trend_sq',
 'price_1',
 'average_price_volatility',
 'price_dif_1',
 'price_dif_3_sq',
 'sell_1']

In [19]:
var_filter = ['kospi_trend_sq',
 'price_1',
 'average_price_volatility',
 'price_dif_1',
 'price_dif_3_sq',
 'sell_1']

In [20]:
X_sub = X_train[var_filter]
X_sub_test = X_test[var_filter]

In [21]:
clf = LogisticRegression(solver='newton-cg')
clf.fit(X_sub, y_train)
prob = clf.predict(X_sub_test)



In [22]:
print(classification_report(y_test, prob))

             precision    recall  f1-score   support

          0       0.78      0.97      0.87      3148
          1       0.24      0.03      0.05       888

avg / total       0.66      0.77      0.69      4036



In [23]:
y_port = y_test_in[prob == 1]

In [24]:
y_port.shape

(113,)

In [25]:
y_port.mean()

-0.02149618048053099

In [26]:
y_test_in.mean()

-0.14849392740473225

In [35]:
Logistic_mod = pd.DataFrame(prob, columns = ['Logistic'])

In [36]:
Logistic_mod['Logistic'].value_counts()

0    3923
1     113
Name: Logistic, dtype: int64

In [37]:
Logistic_mod.to_json('Logistic_mod.json', orient='values')

In [22]:
y_test_in.mean()

-0.12199511714522493

In [31]:
y_train.shape

(11629,)