In [31]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, precision_score, mean_squared_error

In [2]:
import json
with open("df_1hour_Feb.json", 'r', encoding='UTF-8') as f:
    F_data = json.load(f)

In [3]:
with open("df_1hour_Mar.json", 'r', encoding='UTF-8') as f:
    M_data = json.load(f)

In [10]:
COLUMNS = ['name', 'code', 'time', 'price', 'time_1', 'price_1', 'price_dif_1', 'sell_1', 'buy_1', 'volume_1', 'variation_1', 'post_num_1', 'unique_id_1', 'click_1', 'like_1', 'dislike_1', 'time_2', 'price_2', 'price_dif_2', 'sell_2', 'buy_2', 'volume_2', 'variation_2', 'post_num_2', 'unique_id_2', 'click_2', 'like_2', 'dislike_2', 'time_3', 'price_3', 'price_dif_3', 'sell_3', 'buy_3', 'volume_3', 'variation_3', 'post_num_3', 'unique_id_3', 'click_3', 'like_3', 'dislike_3', 'mkt_cap', 'kospi', 'kosdaq', 'trash', 'yesterday_closing_price', 'is_maximum', 'is_minimum', 'price_volatility', 'price_trend', 'average_price_volatility', 'sell_minus_buy_1', 'sell_minus_buy_2', 'sell_minus_buy_3', 'is_price_gap_stable', 'price_gap_volatility', 'is_like_higher', 'volume_trend', 'post_num_trend', 'unique_id_trend', 'click_trend', 'price_increase', 'did_price_increase', 'did_price_033', 'did_price_100', 'did_price_150', 'kospi_ind', 'kosdaq_ind', 'time_slot', 'ko_inter', 'early_mor', 'morning', 'lunch', 'afternoon', 'late', 'mkt_change', 'alpha', 'per_now', 'kospi_1', 'kospi_2', 'kospi_3', 'kospi_answer', 'kosdaq_1', 'kosdaq_2', 'kosdaq_3', 'kosdaq_answer', 'kospi_trend', 'kosdaq_trend', 'kospi_increase', 'kosdaq_increase', 'market_increase', 'did_opening_price_increase', 'price_1_sq', 'price_dif_1_sq', 'sell_1_sq', 'buy_1_sq', 'volume_1_sq', 'variation_1_sq', 'post_num_1_sq', 'unique_id_1_sq', 'click_1_sq', 'like_1_sq', 'dislike_1_sq', 'price_2_sq', 'price_dif_2_sq', 'sell_2_sq', 'buy_2_sq', 'volume_2_sq', 'variation_2_sq', 'post_num_2_sq', 'unique_id_2_sq', 'click_2_sq', 'like_2_sq', 'dislike_2_sq', 'price_3_sq', 'price_dif_3_sq', 'sell_3_sq', 'buy_3_sq', 'volume_3_sq', 'variation_3_sq', 'post_num_3_sq', 'unique_id_3_sq', 'click_3_sq', 'like_3_sq', 'dislike_3_sq', 'mkt_cap_sq', 'yesterday_closing_price_sq', 'price_volatility_sq', 'price_trend_sq', 'average_price_volatility_sq', 'sell_minus_buy_1_sq', 'sell_minus_buy_2_sq', 'sell_minus_buy_3_sq', 'price_gap_volatility_sq', 'volume_trend_sq', 'post_num_trend_sq', 'unique_id_trend_sq', 'click_trend_sq', 'kospi_ind_sq', 'kosdaq_ind_sq', 'time_slot_sq', 'ko_inter_sq', 'mkt_change_sq', 'alpha_sq', 'per_now_sq', 'kospi_1_sq', 'kospi_2_sq', 'kospi_3_sq', 'kosdaq_1_sq', 'kosdaq_2_sq', 'kosdaq_3_sq', 'kospi_trend_sq', 'kosdaq_trend_sq']

In [11]:
df_F = pd.DataFrame(F_data, columns = COLUMNS)
df_M = pd.DataFrame(M_data, columns = COLUMNS)
df = pd.concat([df_F, df_M])

In [12]:
df = df.dropna(axis=0, how='any')

In [13]:
time_filter = (df['time'].str.startswith("2018-02-21")) | \
              (df['time'].str.startswith("2018-02-20")) | \
              (df['time'].str.startswith("2018-02-14")) 
        
train_df = df[time_filter].reset_index(drop = True)
test_df = df[~time_filter].reset_index(drop = True)

In [14]:
X_COL = ['price_1','price_dif_1','sell_1','buy_1','volume_1','variation_1','post_num_1','unique_id_1','click_1',
     'like_1','dislike_1','price_2','price_dif_2','sell_2','buy_2','volume_2','variation_2','post_num_2','unique_id_2',
     'click_2','like_2','dislike_2','price_3','price_dif_3','sell_3','buy_3','volume_3','variation_3','post_num_3',
     'unique_id_3','click_3','like_3','dislike_3','mkt_cap','kospi','kosdaq','trash','yesterday_closing_price',
     'is_maximum','is_minimum','price_volatility','price_trend','average_price_volatility','sell_minus_buy_1',
     'sell_minus_buy_2','sell_minus_buy_3','is_price_gap_stable','price_gap_volatility','is_like_higher',
     'volume_trend','post_num_trend','unique_id_trend','click_trend','kospi_ind','kosdaq_ind','time_slot',
     'ko_inter','early_mor','morning','lunch','afternoon','late','mkt_change','alpha','per_now','kospi_1','kospi_2',
     'kospi_3','kosdaq_1','kosdaq_2','kosdaq_3','kospi_trend','kosdaq_trend','did_opening_price_increase',
     'price_1_sq','price_dif_1_sq','sell_1_sq','buy_1_sq','volume_1_sq','variation_1_sq','post_num_1_sq',
     'unique_id_1_sq','click_1_sq','like_1_sq','dislike_1_sq','price_2_sq','price_dif_2_sq','sell_2_sq',
     'buy_2_sq','volume_2_sq','variation_2_sq','post_num_2_sq','unique_id_2_sq','click_2_sq','like_2_sq',
     'dislike_2_sq','price_3_sq','price_dif_3_sq','sell_3_sq','buy_3_sq','volume_3_sq','variation_3_sq',
     'post_num_3_sq','unique_id_3_sq','click_3_sq','like_3_sq','dislike_3_sq','mkt_cap_sq',
     'yesterday_closing_price_sq','price_volatility_sq','price_trend_sq','average_price_volatility_sq',
     'sell_minus_buy_1_sq','sell_minus_buy_2_sq','sell_minus_buy_3_sq','price_gap_volatility_sq',
     'volume_trend_sq','post_num_trend_sq','unique_id_trend_sq','click_trend_sq','kospi_ind_sq','kosdaq_ind_sq',
     'time_slot_sq','ko_inter_sq','mkt_change_sq','alpha_sq','per_now_sq','kospi_1_sq','kospi_2_sq','kospi_3_sq',
     'kosdaq_1_sq','kosdaq_2_sq','kosdaq_3_sq','kospi_trend_sq','kosdaq_trend_sq']

In [9]:
var_list = ['variation_1', 'price_1', 'yesterday_closing_price', 'price_dif_1', 'price_trend', 'sell_1', 'per_now']

In [15]:
X = df[X_COL]
y = df['did_price_033']
X_train = train_df[X_COL]
X_test = test_df[X_COL]
y_train = train_df['did_price_033']
y_test = test_df['did_price_033']
y_test_in = test_df['price_increase']

In [43]:
def find_first_mod():
    max_rv = 0 
    for var in COLUMNS:
        X_train_sub = X_train[var]
        clf = LogisticRegression(solver='newton-cg')            
        clf.fit(X_train_sub.reshape(-1, 1), y_train)
        y_pred = clf.predict(X_test[var].reshape(-1, 1))
        y_port = y_test_in[y_pred == 1] 
        avg = y_port.mean()
        size = y_port.shape[0]
        if (avg > max_rv) and (size >= 10):
            max_rv = avg
            var_best = var
            
    return var_best

In [44]:
VAR_1 = find_first_mod()
VAR_1

  
  import sys


'kospi_3_sq'

In [47]:
def find_best_mod(mse_min, var_list):
    
    best_var = None
    for var in COLUMNS:
        if var not in var_list:
            var_list.append(var)
            X_train_sub = X_train[var_list]
            clf = LogisticRegression(solver='newton-cg')            
            clf.fit(X_train_sub, y_train)
            y_pred = clf.predict(X_test[var_list])
            y_port = y_test_in[y_pred == 1]
            size = y_port.shape[0]
            if size >= 10:
                mse = mean_squared_error(y_test, y_pred)
                mse_min = min(mse, mse_min)
                if mse_min == mse:
                    best_var = var
            var_list = var_list[:-1]
    
    return best_var, mse_min 

def get_best_list(var_list):
    
    num = 0
    mse_min=1
    while num < 136:
        best_var, mse_min_new = find_best_mod(mse_min, var_list)
        if best_var != None:
            if mse_min_new <= mse_min:
                mse_min = mse_min_new
                var_list.append(best_var)
                print(var_list, num, mse_min)
            else:
                var_list
        else:
            return var_list
        num += 1

    return var_list

In [25]:
COLUMNS = X.columns

In [48]:
get_best_list(['kospi_3_sq'])



['kospi_3_sq', 'price_1', 'morning'] 0 0.216252600661
['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', 'mkt_change_sq'] 1 0.215763064496
['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', 'mkt_change_sq', 'sell_1', 'price_volatility'] 2 0.215763064496
['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', 'mkt_change_sq', 'sell_1', 'price_volatility', 'buy_1', 'kospi_1'] 3 0.215763064496
['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', 'mkt_change_sq', 'sell_1', 'price_volatility', 'buy_1', 'kospi_1', 'volume_1', 'unique_id_trend'] 4 0.215395912373
['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', 'mkt_change_sq', 'sell_1', 'price_volatility', 'buy_1', 'kospi_1', 'volume_1', 'unique_id_trend', 'variation_1', 'kospi_3'] 5 0.214661608126
['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', 'mkt_change_sq', 'sell_1', 'price_volatility', 'buy_1', 'kospi_1', 'volume_1', 'unique_id_trend', 'variation_1', 'kospi_3', 'post_num_1', 'price_3'] 6 0.214661608126


['kospi_3_sq',
 'price_1',
 'morning',
 'price_dif_1',
 'mkt_change_sq',
 'sell_1',
 'price_volatility',
 'buy_1',
 'kospi_1',
 'volume_1',
 'unique_id_trend',
 'variation_1',
 'kospi_3',
 'post_num_1',
 'price_3',
 'unique_id_1']

In [49]:
var_filter = ['kospi_3_sq', 'price_1', 'morning', 'price_dif_1', \
              'mkt_change_sq', 'sell_1', 'price_volatility', 'buy_1', \
              'kospi_1', 'volume_1', 'unique_id_trend', 'variation_1', \
              'kospi_3', 'post_num_1', 'price_3', 'unique_id_1']

In [50]:
X_sub = X_train[var_filter]
X_sub_test = X_test[var_filter]

In [51]:
clf = LogisticRegression(solver='newton-cg')
clf.fit(X_sub, y_train_0)
prob = clf.predict(X_sub_test)



In [52]:
print(classification_report(y_test, prob))

             precision    recall  f1-score   support

          0       0.79      0.97      0.87      6401
          1       0.38      0.07      0.12      1770

avg / total       0.70      0.77      0.71      8171



In [53]:
y_port = y_test_in[prob == 1]

In [54]:
y_port.shape

(347,)

In [55]:
y_port.mean()

0.16748018376109516

In [56]:
Logistic_mod = pd.DataFrame(prob, columns = ['Logistic'])

In [59]:
Logistic_mod['Logistic'].value_counts()

0    7824
1     347
Name: Logistic, dtype: int64

In [60]:
Logistic_mod.to_json('Logistic_mod.json', orient='values')