In [1]:
import torch
import torch.nn.functional as F
from torch.utils import data
from torchinfo import summary
import torch.nn as nn
import torch.optim as optim
import sys
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
from datetime import datetime
import copy
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the raw CSV into memory.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv(
    '/Users/jandh/Downloads/UChicago_ProjLab2024Fall/2603_md_202106_202106.csv'
)
# Size of the frame as reported by sys.getsizeof, converted from bytes to MB (10**6 bytes).
sys.getsizeof(df) / 1_000_000

1876.167674

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,date,time,lastPx,size,volume,SP5,SP4,SP3,SP2,...,SV5,SV4,SV3,SV2,SV1,BV1,BV2,BV3,BV4,BV5
0,0,2021-06-01,90000112,10150.0,8111.0,8111,10400.0,10350.0,10300.0,10250.0,...,2596,2138,3740,2185,3294,61,1240,1403,2195,685
1,1,2021-06-01,90000139,10150.0,2.0,8113,10400.0,10350.0,10300.0,10250.0,...,2596,2138,3740,2185,3294,59,1240,1403,2195,685
2,2,2021-06-01,90000156,,,8113,10400.0,10350.0,10300.0,10250.0,...,2596,2138,3740,2185,3324,59,1240,1403,2195,685
3,3,2021-06-01,90000195,10150.0,16.0,8129,10400.0,10350.0,10300.0,10250.0,...,2596,2138,3740,2185,3324,43,1240,1403,2195,685
4,4,2021-06-01,90000197,,,8129,10400.0,10350.0,10300.0,10250.0,...,2596,2138,3740,2185,3324,43,1240,1403,2195,687


In [4]:
# Keep only the rows in which none of SV1, SP1, BP1, BV1 equals 0
# (presumably the level-1 quote prices/volumes — confirm against the data spec).
# A single row-wise mask is equivalent to filtering on each column in turn.
df = df[(df[["SV1", "SP1", "BP1", "BV1"]] != 0).all(axis=1)]
# Per-column flag: True where at least one missing value remains.
df.isna().any()

Unnamed: 0    False
date          False
time          False
lastPx         True
size           True
volume        False
SP5           False
SP4           False
SP3           False
SP2           False
SP1           False
BP1           False
BP2           False
BP3           False
BP4           False
BP5           False
SV5           False
SV4           False
SV3           False
SV2           False
SV1           False
BV1           False
BV2           False
BV3           False
BV4           False
BV5           False
dtype: bool

In [5]:
def get_forward_rets(df,period = 10):
    """Relative change between the current mid price and its forward average.

    The mid price is the mean of the ``SP1`` and ``BP1`` columns. For every
    row, the average mid price of the *next* ``period`` rows (the current row
    is excluded) is compared with the row's own mid price.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``SP1`` and ``BP1`` columns.
    period : int, default 10
        Number of subsequent rows that make up the forward average.

    Returns
    -------
    pandas.Series
        ``(forward_mean - mid) / mid``, carrying the index labels of ``df``.
        Rows with a zero mid price are removed before the computation, and
        rows whose result is null (e.g. the last ``period`` rows, which lack
        a full forward window) are removed afterwards.

    Notes
    -----
    NOTE(review): the window is purely positional; nothing here stops it from
    spanning a change in ``date`` — confirm that this is intended.
    """
    mid = (df['SP1'] + df['BP1']) / 2
    mid = mid.loc[mid.ne(0)]

    # Rolling over the reversed series turns pandas' trailing window into a
    # leading one: once flipped back, row i holds mean(mid[i : i + period]).
    reversed_mean = mid.iloc[::-1].rolling(window=period, min_periods=period).mean()
    leading_mean = reversed_mean.iloc[::-1]

    # Shift up by one row so the window covers rows i+1 .. i+period.
    forward_mean = leading_mean.shift(-1)

    relative_change = (forward_mean - mid) / mid
    return relative_change.dropna()

In [6]:
def label_data(df,colname,thresh):
    """Bucket a numeric column into three classes with a symmetric threshold.

    Class encoding (the first matching rule wins):
        0 : value >=  thresh
        2 : value <= -thresh
        1 : anything else, including NaN

    The threshold is supplied by the caller; no fixed value is assumed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to label.
    colname : str
        Name of the numeric column to bucket.
    thresh : float
        Threshold applied symmetrically around zero.

    Returns
    -------
    pandas.Series
        int64 labels with the same index and name as ``df[colname]``.

    Side effects
    ------------
    Prints the unique labels and their counts.
    """
    values = df[colname]

    # np.select evaluates conditions in order and keeps the first hit, which
    # matches an if/elif chain while avoiding a Python call per row.
    codes = np.select(
        [(values >= thresh).to_numpy(), (values <= -thresh).to_numpy()],
        [0, 2],
        default=1,
    )
    labels = pd.Series(codes, index=values.index, name=values.name, dtype='int64')

    print(labels.unique(),labels.value_counts(),'labels distribution')
    return labels

def normalize_data(df,cols,norm):
    """Normalize the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    cols : list of str
        Columns to normalize.
    norm : {'Z', 'DecPrec'}
        ``'Z'``       -- z-score via scikit-learn's ``StandardScaler``.
        ``'DecPrec'`` -- decimal scaling: each column is divided by
        ``10 ** ceil(log10(max(|x|)))``.

    Returns
    -------
    (data, scaler)
        For ``'Z'``: the array returned by ``StandardScaler.transform`` and the
        fitted scaler. For ``'DecPrec'``: a DataFrame of scaled columns and
        ``None`` (there is no scaler object on that path).

    Raises
    ------
    ValueError
        If ``norm`` is not one of the supported modes.
    """
    # Defined up front so every path can return it; DecPrec has no scaler.
    scaler = None

    if norm == 'Z':
        # Imported here because the notebook's import cell does not bring
        # StandardScaler into scope.
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        scaler.fit(df[cols])
        data = scaler.transform(df[cols])
    elif norm == 'DecPrec':
        # Per-column power of ten derived from the largest absolute value.
        # NOTE: a column whose largest |x| is 0 gives log10(0) = -inf here,
        # i.e. a zero divisor and NaN output for that column.
        k_len = np.ceil(np.log10(df[cols].abs().max()))
        data = df[cols]/(10**k_len)
    else:
        raise ValueError(
            "Unsupported norm %r; expected 'Z' or 'DecPrec'" % (norm,)
        )

    return data,scaler


In [7]:
def get_fw_labels(df,k,threshold):
    """Attach forward-return class labels to the five-level quote columns.

    Steps:
      1. compute the ``k``-row forward return with ``get_forward_rets``;
      2. keep only the rows of ``df`` that have a forward return
         (inner join on the index);
      3. classify the return with ``label_data`` using ``threshold``;
      4. return ``date``, the SP/SV/BP/BV columns for levels 1-5, and ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` and ``SP{i}``, ``SV{i}``, ``BP{i}``, ``BV{i}``
        for i in 1..5. Any other columns are ignored.
    k : int
        Forward window length in rows; also used in the intermediate column
        name ``fw_<k>``.
    threshold : float
        Symmetric threshold forwarded to ``label_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, SP1, SV1, BP1, BV1, ..., SP5, SV5, BP5, BV5, label``.

    Side effects
    ------------
    Prints the number of rows with a missing forward return, followed by the
    label distribution printed by ``label_data``.
    """
    fw_col = 'fw_%d'%k

    # Name the series explicitly instead of relying on it being unnamed when
    # it is wrapped into a one-column frame.
    fw_ = get_forward_rets(df,k).rename(fw_col).to_frame()

    # Inner join on the index: rows without a forward return are discarded.
    temp = df.merge(fw_,left_index=True,right_index=True)

    # Sanity check: number of joined rows still lacking a forward return.
    print(int(temp[fw_col].isna().sum()))

    # label_data returns a series on temp's own index, so it can be assigned
    # directly without a second merge of the full frame.
    temp['label'] = label_data(temp,fw_col,threshold)

    # Level-major column order: SP1, SV1, BP1, BV1, SP2, ...
    book_cols = [
        '%s%s' % (prefix, level)
        for level in range(1,6)
        for prefix in ('SP', 'SV', 'BP', 'BV')
    ]

    # Selecting the wanted columns makes an explicit drop of the others
    # unnecessary and avoids a KeyError on frames that lack them.
    return temp[['date'] + book_cols + ['label']]

In [8]:
# Label with a 10-row forward window and a +/-0.0015 (15 bps) threshold.
pushdf = get_fw_labels(df, k=10, threshold=0.0015)
# pushdf.to_csv('/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/labelled_data.csv', index=False)

0
[1 2 0] fw_10
1    7181488
0      17491
2      16851
Name: count, dtype: int64 labels distribution


In [11]:
# Preview the first rows of the labelled frame.
pushdf.head()

Unnamed: 0,date,SP1,SV1,BP1,BV1,SP2,SV2,BP2,BV2,SP3,...,BV3,SP4,SV4,BP4,BV4,SP5,SV5,BP5,BV5,label
0,2021-06-01,10200.0,3294,10150.0,61,10250.0,2185,10100.0,1240,10300.0,...,1403,10350.0,2138,10000.0,2195,10400.0,2596,9990.0,685,1
1,2021-06-01,10200.0,3294,10150.0,59,10250.0,2185,10100.0,1240,10300.0,...,1403,10350.0,2138,10000.0,2195,10400.0,2596,9990.0,685,1
2,2021-06-01,10200.0,3324,10150.0,59,10250.0,2185,10100.0,1240,10300.0,...,1403,10350.0,2138,10000.0,2195,10400.0,2596,9990.0,685,1
3,2021-06-01,10200.0,3324,10150.0,43,10250.0,2185,10100.0,1240,10300.0,...,1403,10350.0,2138,10000.0,2195,10400.0,2596,9990.0,685,1
4,2021-06-01,10200.0,3324,10150.0,43,10250.0,2185,10100.0,1240,10300.0,...,1403,10350.0,2138,10000.0,2195,10400.0,2596,9990.0,687,1


In [36]:
# Re-label with a 500-row forward window (same +/-0.0015 threshold).
# This rebinds pushdf, replacing the result of any earlier labelling call.
pushdf = get_fw_labels(df, k=500, threshold=0.0015)

0
[1 0 2] fw_500
1    6488344
0     373135
2     353861
Name: count, dtype: int64 labels distribution


In [37]:
# pushdf.to_csv('/Users/jandh/Desktop/Old Desktop/od/1 quater/Project Lab/labelled_data_500.csv', index=False)

In [38]:
# Show the column labels currently on df.
df.columns

Index(['Unnamed: 0', 'date', 'time', 'lastPx', 'size', 'volume', 'SP5', 'SP4',
       'SP3', 'SP2', 'SP1', 'BP1', 'BP2', 'BP3', 'BP4', 'BP5', 'SV5', 'SV4',
       'SV3', 'SV2', 'SV1', 'BV1', 'BV2', 'BV3', 'BV4', 'BV5'],
      dtype='object')