In [1]:
import sys
import warnings

# Silence every warning category for the rest of the notebook, unless the
# interpreter was started with explicit warning options (sys.warnoptions is
# non-empty), in which case those options are left in charge.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
import pandas as pd 
import os
import numpy as np 
import time
from copy import deepcopy
import matplotlib.pyplot as plt 
from datetime import datetime
from datetime import timedelta
from scipy.optimize import minimize
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in
# matplotlib 3.6 and the old names were removed afterwards, so pick whichever
# name this matplotlib installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [3]:
# Directory with the SH600519 CSV files. The trailing '' component makes
# os.path.join append the platform's path separator, so code that builds paths
# as `q2_data_dir + file_name` keeps working on every OS.
q2_data_dir = os.path.join(os.getcwd(), 'SH600519', '')
# Trade files are those whose name starts with 'trade_'. os.listdir returns
# entries in arbitrary order, and the loader chains each file's clock onto the
# previous one, so sort the names to make the result reproducible.
# NOTE(review): assumes lexicographic file-name order is chronological - confirm.
q2_data_list_dir = [
    os.path.join(q2_data_dir, data)
    for data in sorted(os.listdir(q2_data_dir))
    if data.split('_')[0] == 'trade'
]

In [4]:
def _q2_load_trade_file(path):
    """Read one trade CSV and add its 'tmp' (parsed datetime) and 'timestamp' columns.

    'timestamp' is the parsed datetime in milliseconds (Timestamp.value is in
    nanoseconds, divided by 1e6), measured from the file's first row.
    """
    day = pd.read_csv(path)
    # NOTE(review): the format has no %S directive, so every digit after the
    # minute is parsed as a fractional second - confirm against the raw `time`
    # encoding.
    day['tmp'] = pd.to_datetime(day.date.apply(str) + day.time.apply(str),
                                format='%Y%m%d%H%M%f')
    clock = day.tmp.apply(lambda stamp: stamp.value) / 1e6
    clock = clock - clock.iloc[0]
    # NOTE(review): 5400 presumably stands for the 90-minute midday break, but
    # the clock is in milliseconds at this point - confirm the intended units.
    day['timestamp'] = np.where(clock > 10000, clock - 5400, clock)
    return day


def generate_q2_dataframe(df_lst):
    """Build the trade frame with rolling 5-minute statistics from trade CSV files.

    Parameters
    ----------
    df_lst : sequence of str
        Paths of the CSV files, in the order they should be chained. Each file
        must provide the columns 'date', 'time', 'BS', 'ntrade' and the columns
        dropped at the end ('Unnamed: 0', 'id_trade', 'sign').

    Returns
    -------
    pandas.DataFrame
        Indexed by a synthetic datetime ('time'), with 'BS' recoded to
        +1 ('B') / -1 ('S') / 0 (anything else), 'is_trade', and the rolling
        300-second columns '5min_trade_count', '5min_cumul_sign' and
        '5min_prob_buy'. Rows within the first five minutes of the index are
        removed.

    Raises
    ------
    IndexError
        If `df_lst` is empty.
    """
    first = _q2_load_trade_file(df_lst[0])
    days = [first]
    last_clock = first.timestamp.iloc[-1]

    for path in df_lst[1:]:
        day = _q2_load_trade_file(path)
        # Chain the files on one continuous clock: each file starts exactly
        # where the previous one ended.
        day['timestamp'] = day.timestamp - day.timestamp.iloc[0] + last_clock
        last_clock = day.timestamp.iloc[-1]
        days.append(day)

    # Single concat instead of growing the frame inside the loop.
    df = pd.concat(days)

    df['BS'] = np.where(df.BS == 'B', 1, 0) + np.where(df.BS == 'S', -1, 0)
    df['is_trade'] = np.where((df.BS == 1) | (df.BS == -1), 1, 0)
    # The chained clock (ms -> s) becomes a datetime index so that time-based
    # rolling windows can be used.
    df['time'] = (df.timestamp / 1000).apply(datetime.fromtimestamp)
    df = df.set_index(['time']).sort_index()

    df['5min_trade_count'] = df.ntrade.rolling('300s').count()
    df['5min_cumul_sign'] = df.BS.rolling('300s').sum()
    # With N rows and net sign k in the window, (N + k) / (2N) is the share of
    # buys when every row is a buy or a sell.
    df['5min_prob_buy'] = (df['5min_trade_count'] + df['5min_cumul_sign']) / (2 * df['5min_trade_count'])

    # Drop the warm-up period where the 5-minute window is not yet full.
    df = df[df.index > (df.index[0] + timedelta(minutes=5))]
    return df.drop(['Unnamed: 0', 'tmp', 'timestamp', 'id_trade', 'date', 'sign'], axis=1)



In [5]:
# Trade-level frame for SH600519; variable_pred_stats() reads it as the global `df`.
df = generate_q2_dataframe(q2_data_list_dir)

In [6]:
def MLE_estimator_lambda(df, s):
    """Return the mean of the '5min_trade_count' column divided by `s`.

    `s` is the window length the counts refer to (the notebook passes 300),
    so the result is an average count per unit of `s`.
    """
    mean_count = df['5min_trade_count'].mean()
    rate = mean_count / s
    return rate

def MLE_estimator_beta(df):
    """Return the sum of the 'ntrade' column divided by its number of rows."""
    sizes = df.ntrade
    total = sizes.sum()
    return total / len(sizes)

def corr_estimate(df):
    """Return the lag-1 correlation of the non-zero 'BS' values.

    Rows with BS == 0 are removed first, so the lag is taken between
    consecutive signed rows rather than consecutive rows of `df`.
    """
    signed = df[df.BS != 0].BS
    previous = signed.shift(1)
    return signed.corr(previous)

def prob_buy_estimate(df):
    """Return the buy share among trades, scaled up by the share of non-trade rows.

    With N = sum of 'is_trade' and k = sum of 'BS', the value returned is
    (N + k) / (2N) * (1 + #rows with is_trade == 0 / #rows).
    NOTE(review): the second factor is larger than 1 whenever non-trade rows
    exist - confirm this scaling is intended.
    """
    n_trades = df.is_trade.sum()
    net_sign = df.BS.sum()
    buy_share = (n_trades + net_sign) / (2 * n_trades)
    idle_share = len(df[df.is_trade == 0]) / len(df)
    return buy_share * (1 + idle_share)

def variable_pred_stats():
    """Print the four parameter estimates computed from the module-level `df`."""
    # Each estimate is computed lazily, right before its line is printed.
    report = (
        ('Lambda = {:.2f}', lambda: MLE_estimator_lambda(df, 300)),
        ('Beta = {:.2f}', lambda: MLE_estimator_beta(df)),
        ('Corr = {:.2f}', lambda: corr_estimate(df)),
        ('p = {:.2f}%', lambda: 100*prob_buy_estimate(df)),
    )
    print('Estimation of parameters')
    for template, compute in report:
        print(template.format(compute()))

In [7]:
# Print the Lambda / Beta / Corr / p estimates for the global trade frame `df`.
variable_pred_stats()

Estimation of parameters
Lambda = 1.24
Beta = 354.27
Corr = 0.56
p = 49.27%


In [8]:
# Directory with the SH601398 CSV files; the trailing '' component makes
# os.path.join append the platform's path separator instead of a hard-coded '\\'.
q3_data_dir = os.path.join(os.getcwd(), 'SH601398', '')
# Quote files ('quote_*') for both stocks: v1 comes from the SH600519 directory
# (q2_data_dir), v2 from the SH601398 directory. Names are sorted because
# os.listdir returns entries in arbitrary order and the Q4 loader chains files
# in list order.
# NOTE(review): assumes lexicographic file-name order is chronological - confirm.
q3_data_list_dir_v1 = [
    os.path.join(q2_data_dir, data)
    for data in sorted(os.listdir(q2_data_dir))
    if data.split('_')[0] == 'quote'
]
q3_data_list_dir_v2 = [
    os.path.join(q3_data_dir, data)
    for data in sorted(os.listdir(q3_data_dir))
    if data.split('_')[0] == 'quote'
]

In [9]:
def _q3_load_quote_file(path):
    """Read one quote CSV, keep the four price/side columns and add 'MidPrice1'."""
    # .copy() so the new column is written to an independent frame, not a slice.
    quotes = pd.read_csv(path)[['price', 'BS', 'BidPrice1', 'AskPrice1']].copy()
    quotes['MidPrice1'] = (quotes.BidPrice1 + quotes.AskPrice1) / 2
    return quotes[quotes.price > 0]


def generate_q3_dataframe(df_list):
    """Load quote CSV files and keep the rows that carry a trade side.

    Parameters
    ----------
    df_list : sequence of str
        Paths of CSV files containing at least the columns 'price', 'BS',
        'BidPrice1' and 'AskPrice1'.

    Returns
    -------
    pandas.DataFrame
        Columns 'price', 'BS', 'BidPrice1', 'AskPrice1', 'MidPrice1'
        (the plain average of bid and ask), restricted to rows with price > 0
        and BS equal to 'B' or 'S'. Each file's original row index is kept, so
        index labels repeat across files.

    Raises
    ------
    IndexError
        If `df_list` is empty.
    """
    if len(df_list) == 0:
        raise IndexError('df_list is empty: at least one quote file is required')
    # Build all per-file frames first and concatenate once.
    df = pd.concat([_q3_load_quote_file(path) for path in df_list])
    return df[(df.BS == 'B') | (df.BS == 'S')]

In [10]:
# Signed quote rows for SH600519 (q3_data_list_dir_v1 is built from q2_data_dir).
df2 = generate_q3_dataframe(q3_data_list_dir_v1)

In [11]:
# Signed quote rows for SH601398 (q3_data_list_dir_v2 is built from q3_data_dir).
df3 = generate_q3_dataframe(q3_data_list_dir_v2)

In [12]:
def add_prediction_to_df(df):
    """Return a copy of `df` with a numeric 'BS' and a mid-price based 'PredSide'.

    'BS' becomes +1 for buys, -1 for sells and 0 otherwise. Both the raw
    letters ('B' / 'S') and already-recoded values (1 / -1) are accepted, so
    applying the function to its own output (e.g. when a notebook cell is
    re-run) leaves 'BS' unchanged instead of turning every row into 0.

    'PredSide' is +1 when price > MidPrice1, -1 when price < MidPrice1 and 0
    when the trade sits exactly at the mid (or the comparison is undefined).

    The input frame is not modified.
    """
    out = df.copy()
    side = out.BS
    is_buy = (side == 'B') | (side == 1)
    is_sell = (side == 'S') | (side == -1)
    out['BS'] = np.where(is_buy, 1, 0) + np.where(is_sell, -1, 0)
    out['PredSide'] = (np.where(out.price > out.MidPrice1, 1, 0)
                       + np.where(out.price < out.MidPrice1, -1, 0))
    return out

In [13]:
# Rebind df2 to the version with numeric BS and the PredSide column (SH600519).
df2 = add_prediction_to_df(df2)

In [14]:
# Rebind df3 to the version with numeric BS and the PredSide column (SH601398).
df3 = add_prediction_to_df(df3)

In [15]:
def pct_same_pred(df):
    """Return the fraction of rows whose 'PredSide' equals 'BS'."""
    agree = df.PredSide == df.BS
    # int(...) keeps plain Python division (ZeroDivisionError on an empty frame).
    return int(agree.sum()) / len(df)

def pct_same_buy(df):
    """Return the fraction of buy rows (BS == 1) that were also predicted as buys."""
    predicted_buy = (df.PredSide == df.BS) & (df.PredSide == 1)
    n_hits = int(predicted_buy.sum())
    n_buys = int((df.BS == 1).sum())
    return n_hits / n_buys

def pct_same_sell(df):
    """Return the fraction of sell rows (BS == -1) that were also predicted as sells."""
    predicted_sell = (df.PredSide == df.BS) & (df.PredSide == -1)
    n_hits = int(predicted_sell.sum())
    n_sells = int((df.BS == -1).sum())
    return n_hits / n_sells

def pct_pred_zero(df):
    """Return the fraction of rows with no side prediction (PredSide == 0)."""
    undecided = df.PredSide == 0
    return int(undecided.sum()) / len(df)

def pct_pred_zero_buy(df):
    """Return the share of buys (BS == 1) among rows with PredSide == 0.

    Returns NaN when no row has PredSide == 0, instead of raising
    ZeroDivisionError; such rows are rare, so a dataset without any is plausible.
    """
    undecided = df.PredSide == 0
    n_undecided = int(undecided.sum())
    if n_undecided == 0:
        return float('nan')
    return int((undecided & (df.BS == 1)).sum()) / n_undecided

def pct_pred_zero_sell(df):
    """Return the share of sells (BS == -1) among rows with PredSide == 0.

    Returns NaN when no row has PredSide == 0, instead of raising
    ZeroDivisionError; such rows are rare, so a dataset without any is plausible.
    """
    undecided = df.PredSide == 0
    n_undecided = int(undecided.sum())
    if n_undecided == 0:
        return float('nan')
    return int((undecided & (df.BS == -1)).sum()) / n_undecided


def print_statistics(df, text):
    """Print the prediction-agreement percentages of `df` under the heading `text`."""
    # (line template, statistic) pairs, printed in this order; each statistic
    # is evaluated right before its line is printed.
    lines = (
        ("Percentage Same Prediction: {:.2f}%", pct_same_pred),
        ("Percentage Same Buy Prediction: {:.2f}%", pct_same_buy),
        ("Percentage Same Sell Prediction: {:.2f}%", pct_same_sell),
        ("Percentage Prediction 0: {:.2f}%", pct_pred_zero),
        ("Percentage Prediction 0 is Buy: {:.2f}%", pct_pred_zero_buy),
        ("Percentage Prediction 0 is Sell: {:.2f}%", pct_pred_zero_sell),
    )
    print('Statistics for Data in {}'.format(text))
    for template, statistic in lines:
        print(template.format(100*statistic(df)))

In [16]:
print_statistics(df2, 'SH60051')

Statistics for Data in SH60051
Percentage Same Prediction: 79.10%
Percentage Same Buy Prediction: 79.51%
Percentage Same Sell Prediction: 78.68%
Percentage Prediction 0: 0.68%
Percentage Prediction 0 is Buy: 50.82%
Percentage Prediction 0 is Sell: 49.18%


In [17]:
# Agreement statistics for the SH601398 quote frame (df3).
print_statistics(df3, 'SH601398')

Statistics for Data in SH601398
Percentage Same Prediction: 98.34%
Percentage Same Buy Prediction: 98.29%
Percentage Same Sell Prediction: 98.37%
Percentage Prediction 0: 0.02%
Percentage Prediction 0 is Buy: 50.41%
Percentage Prediction 0 is Sell: 49.59%


In [18]:
def _q4_add_clock(day):
    """Return a copy of `day` with 'tmp' (parsed datetime) and 'timestamp' columns.

    'timestamp' is the parsed datetime in milliseconds (Timestamp.value is in
    nanoseconds, divided by 1e6), measured from the frame's first row.
    """
    day = day.copy()
    # NOTE(review): the format has no %S directive, so every digit after the
    # minute is parsed as a fractional second - confirm against the raw `time`
    # encoding.
    day['tmp'] = pd.to_datetime(day.date.apply(str) + day.time.apply(str),
                                format='%Y%m%d%H%M%f')
    clock = day.tmp.apply(lambda stamp: stamp.value) / 1e6
    clock = clock - clock.iloc[0]
    # NOTE(review): 5400 presumably stands for the 90-minute midday break, but
    # the clock is in milliseconds at this point - confirm the intended units.
    day['timestamp'] = np.where(clock > 10000, clock - 5400, clock)
    return day


def _q4_window_return(values):
    """Relative change between the first and the last value of a rolling window."""
    return (values[-1] - values[0]) / values[0]


def generate_q4_dataframe(df_lst):
    """Build the return / order-imbalance frame from quote CSV files.

    Parameters
    ----------
    df_lst : sequence of str
        Paths of the CSV files, in the order they should be chained. Each file
        must provide 'date', 'time', 'BS', 'volume', 'BidPrice1', 'AskPrice1',
        'BidVolume1' and 'AskVolume1'.

    Returns
    -------
    pandas.DataFrame
        Indexed by a synthetic datetime, restricted to rows with volume > 0,
        with the columns '5min_ret', '1min_ret', '5min_imb', '1min_imb',
        '5min_ind', '1min_ind', 'SignVolume', 'MidQuote', 'BS', 'is_trade'.
        Rows with |return| >= 1, rows within the first five minutes and rows
        containing NaN are removed.

    Raises
    ------
    IndexError
        If `df_lst` is empty.

    Notes
    -----
    Files after the first are filtered to time > 0 before parsing; a file with
    no such rows is skipped with a printed notice. The original handler caught
    KeyError around `.iloc[0]`, which raises IndexError, so it never fired.
    """
    first = _q4_add_clock(pd.read_csv(df_lst[0]))
    days = [first]
    last_clock = first.timestamp.iloc[-1]

    for path in df_lst[1:]:
        day = pd.read_csv(path)
        day = day[day.time > 0]
        if day.empty:
            print('Skipping {}: no rows with time > 0'.format(path))
            continue
        day = _q4_add_clock(day)
        # Chain the files on one continuous clock: each file starts exactly
        # where the previous one ended.
        day['timestamp'] = day.timestamp - day.timestamp.iloc[0] + last_clock
        last_clock = day.timestamp.iloc[-1]
        days.append(day)

    df = pd.concat(days)

    df['BS'] = np.where(df.BS == 'B', 1, 0) + np.where(df.BS == 'S', -1, 0)
    df['is_trade'] = np.where((df.BS == 1) | (df.BS == -1), 1, 0)

    # Size-weighted mid: each price is weighted by the opposite side's volume.
    df['MidQuote'] = ((df['BidPrice1'] * df['AskVolume1'] + df['AskPrice1'] * df['BidVolume1'])
                      / (df['AskVolume1'] + df['BidVolume1']))
    df['time'] = (df.timestamp / 1000).apply(datetime.fromtimestamp)
    df = df.set_index(['time']).sort_index()

    # Only rows with traded volume are kept; imbalances are expressed in units
    # of the mean traded volume.
    df = df[df.volume > 0].copy()
    meanVolume = df.volume.mean()
    df['SignVolume'] = df.BS * df.volume

    # raw=True hands each window to the function as a NumPy array, so the
    # positional [0] / [-1] lookups do not depend on Series label indexing.
    df['5min_ret'] = df.MidQuote.rolling('300s').apply(_q4_window_return, raw=True)
    df['1min_ret'] = df.MidQuote.rolling('60s').apply(_q4_window_return, raw=True)
    df['5min_imb'] = df.SignVolume.rolling('300s').sum() / meanVolume
    df['1min_imb'] = df.SignVolume.rolling('60s').sum() / meanVolume
    # Sign indicator of the imbalance; a zero imbalance is labelled -1.
    df['5min_ind'] = np.where(df['5min_imb'] > 0, 1, -1)
    df['1min_ind'] = np.where(df['1min_imb'] > 0, 1, -1)

    df = df[(df['5min_ret'] < 1) & (df['5min_ret'] > -1)]
    df = df[(df['1min_ret'] < 1) & (df['1min_ret'] > -1)]
    df = df[['5min_ret', '1min_ret', '5min_imb', '1min_imb', '5min_ind', '1min_ind',
             'SignVolume', 'MidQuote', 'BS', 'is_trade']]
    # Drop the warm-up period where the 5-minute window is not yet full.
    df = df[df.index > (df.index[0] + timedelta(minutes=5))]
    return df.dropna()
    

In [19]:
# Return / imbalance frame for SH600519 (quote files listed from q2_data_dir).
df4 = generate_q4_dataframe(q3_data_list_dir_v1)

In [20]:
# Return / imbalance frame for SH601398 (quote files listed from q3_data_dir).
df5 = generate_q4_dataframe(q3_data_list_dir_v2)

In [21]:
def mse(parameters, df, window, ind):
    """Sum of squared errors of the power-law fit ret ~ a * std(ret) * |imb| ** b.

    Despite the name, the value returned is the sum (not the mean) of the
    squared residuals.

    Parameters
    ----------
    parameters : sequence of two floats
        (a, b): scale and exponent of the fit.
    df : pandas.DataFrame
        Frame containing '<w>min_ind', '<w>min_ret' and '<w>min_imb' for the
        requested window.
    window : int
        1 or 5, selecting the 1-minute or 5-minute columns.
    ind : int
        Value of the '<w>min_ind' column selecting the rows to fit (1 or -1).

    Returns
    -------
    float
        Sum over the selected rows of (ret - a * std * |imb| ** b) ** 2, where
        std is the standard deviation of the selected returns.

    Raises
    ------
    ValueError
        If `window` is neither 1 nor 5.
    """
    columns_by_window = {
        1: ('1min_ind', '1min_ret', '1min_imb'),
        5: ('5min_ind', '5min_ret', '5min_imb'),
    }
    if window not in columns_by_window:
        raise ValueError('window must be 1 or 5, got {!r}'.format(window))
    ind_col, ret_col, imb_col = columns_by_window[window]

    selected = df[df[ind_col] == ind]
    y = selected[ret_col].values
    x = selected[imb_col].abs().values
    std = y.std()

    a = parameters[0]
    b = parameters[1]
    y_pred = a * std * (x ** b)

    # np.sum keeps the reduction vectorised; the optimiser calls this function
    # many times per fit.
    return np.sum((y - y_pred) ** 2)

In [22]:
def optimize_parameters(df, window ,ind):
    """Fit the two parameters of `mse` with SLSQP and return them.

    The search starts from [1, 1]. The wall-clock duration of the optimisation
    is printed; the optimiser's own output is disabled. The result's `.x`
    array is returned.
    """
    started = time.time()
    fit = minimize(
        mse,
        [1, 1],
        args=(df, window, ind),
        method='SLSQP',
        options={'disp': False},
    )
    elapsed = time.time() - started
    print('Optimization Runtime: {:.3f}s'.format(elapsed))
    return fit.x

In [23]:
# Fit the two parameters separately for every combination of dataset
# (df4 / df5), window (1 or 5 minutes) and imbalance indicator (+1 / -1).
# print_params() reads these eight module-level results by name.
df4_1_pos_params = optimize_parameters(df4,1,1)
df4_1_neg_params = optimize_parameters(df4,1,-1)
df4_5_pos_params = optimize_parameters(df4,5,1)
df4_5_neg_params = optimize_parameters(df4,5,-1)

df5_1_pos_params = optimize_parameters(df5,1,1)
df5_1_neg_params = optimize_parameters(df5,1,-1)
df5_5_pos_params = optimize_parameters(df5,5,1)
df5_5_neg_params = optimize_parameters(df5,5,-1)

Optimization Runtime: 2.345s
Optimization Runtime: 5.818s
Optimization Runtime: 10.887s
Optimization Runtime: 10.788s
Optimization Runtime: 7.491s
Optimization Runtime: 2.453s
Optimization Runtime: 3.727s
Optimization Runtime: 3.974s


In [24]:
def print_params():
    """Print the fitted parameter pairs for both datasets and both windows.

    Reads the module-level df4_* / df5_* parameter arrays; element 0 is shown
    as 'beta' and element 1 as 'gamma'. The banner lines end with a literal
    backslash followed by a space, written as an escaped backslash so the
    string does not rely on an invalid escape sequence.
    """
    print('Statistics for Data in SH600519\n')
    print(' /**********   1min Window   **********\\ ')
    print('Positive: beta = {:.4f}, gamma = {:.4f}'.format(df4_1_pos_params[0], df4_1_pos_params[1]))
    print('Negative: beta = {:.4f}, gamma = {:.4f} \n'.format(df4_1_neg_params[0], df4_1_neg_params[1]))

    print(' /**********   5min Window   **********\\ ')
    print('Positive: beta = {:.4f}, gamma = {:.4f}'.format(df4_5_pos_params[0], df4_5_pos_params[1]))
    print('Negative: beta = {:.4f}, gamma = {:.4f} \n \n \n'.format(df4_5_neg_params[0], df4_5_neg_params[1]))

    print('Statistics for Data in SH601398\n')
    print(' /**********   1min Window   **********\\ ')
    print('Positive: beta = {:.4f}, gamma = {:.4f}'.format(df5_1_pos_params[0], df5_1_pos_params[1]))
    print('Negative: beta = {:.4f}, gamma = {:.4f} \n'.format(df5_1_neg_params[0], df5_1_neg_params[1]))

    print(' /**********   5min Window   **********\\ ')
    print('Positive: beta = {:.4f}, gamma = {:.4f}'.format(df5_5_pos_params[0], df5_5_pos_params[1]))
    print('Negative: beta = {:.4f}, gamma = {:.4f} \n'.format(df5_5_neg_params[0], df5_5_neg_params[1]))

In [25]:
# Print the fitted parameters for SH600519 (df4) and SH601398 (df5).
print_params()

Statistics for Data in SH600519

 /**********   1min Window   **********\ 
Positive: beta = 0.1736, gamma = 0.4040
Negative: beta = -0.0686, gamma = 0.7834 

 /**********   5min Window   **********\ 
Positive: beta = 0.1006, gamma = 0.5382
Negative: beta = -0.0764, gamma = 0.6664 
 
 

Statistics for Data in SH601398

 /**********   1min Window   **********\ 
Positive: beta = 0.0104, gamma = 1.1310
Negative: beta = -0.0663, gamma = 0.6456 

 /**********   5min Window   **********\ 
Positive: beta = 0.0268, gamma = 0.8052
Negative: beta = -0.0062, gamma = 1.0914 

