In [68]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [16]:
df=pd.read_csv('train.csv')
df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [18]:
# train-test split
'''
take one stock out for testing (stock 199)
'''
df_other=df[df['stock_id']!=199]
df_test=df[df['stock_id']==199]
X_other=df_other.loc[:, df.columns != 'target'] 
y_other=df_other['target']
X_test=df_test.loc[:, df.columns != 'target'] 
y_test=df_test['target']

In [19]:
print(f'X_other shape: {X_other.shape}, y_other shape: {y_other.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_other shape: (5216365, 16), y_other shape: (5216365,)
X_test shape: (21615, 16), y_test shape: (21615,)


In [21]:
# train-validation split
'''
use GroupKFold for train-validation split (preserve the order by default)
'''
group_kfold = GroupKFold(n_splits=3)
stock_group = X_other['stock_id']
X_train_splits,X_val_splits,y_train_splits,y_val_splits=[],[],[],[]
i=1
for train_index, val_index in group_kfold.split(X_other, y_other, groups=stock_group):    
    X_train, X_val = X_other.iloc[train_index], X_other.iloc[val_index]
    y_train, y_val = y_other.iloc[train_index], y_other.iloc[val_index]
    X_train_splits.append(X_train)
    X_val_splits.append(X_val)
    y_train_splits.append(y_train)
    y_val_splits.append(y_val)
    print(f'{i}-th Fold:')
    print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
    print(f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}')
    i+=1

1-th Fold:
X_train shape: (3472700, 16), y_train shape: (3472700,)
X_val shape: (1743665, 16), y_val shape: (1743665,)
2-th Fold:
X_train shape: (3477485, 16), y_train shape: (3477485,)
X_val shape: (1738880, 16), y_val shape: (1738880,)
3-th Fold:
X_train shape: (3482545, 16), y_train shape: (3482545,)
X_val shape: (1733820, 16), y_val shape: (1733820,)


In [22]:
# missing values
print('data dimensions:',df.shape)
perc_missing_per_ftr = df.isnull().sum(axis=0)/df.shape[0]
print('fraction of missing values in features:')
print(perc_missing_per_ftr[perc_missing_per_ftr > 0])
print('data types of the features with missing values:')
print(df[perc_missing_per_ftr[perc_missing_per_ftr > 0].index].dtypes)
frac_missing = sum(df.isnull().sum(axis=1)!=0)/df.shape[0]
print('fraction of points with missing values:',frac_missing)

data dimensions: (5237980, 17)
fraction of missing values in features:
imbalance_size     0.000042
reference_price    0.000042
matched_size       0.000042
far_price          0.552568
near_price         0.545474
bid_price          0.000042
ask_price          0.000042
wap                0.000042
target             0.000017
dtype: float64
data types of the features with missing values:
imbalance_size     float64
reference_price    float64
matched_size       float64
far_price          float64
near_price         float64
bid_price          float64
ask_price          float64
wap                float64
target             float64
dtype: object
fraction of points with missing values: 0.5525683565038431


In [63]:
# handling missing values
'''
All features with missing values are continuous. 
For features that have a vary small fraction of missing values, use SimpleImputer

'''

def handle_missing_values(X,y):

    ftrs1 = ['far_price','near_price']
    ftrs2 = ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price',\
              'bid_price', 'ask_price', 'wap']
    
    X_filled = X.copy() 
    
    for ftr in ftrs1:
        X_filled.loc[(df['seconds_in_bucket'] < 300) & (df[ftr].isna()), ftr] = 0
    
    for ftr in ftrs2:
        X_filled.loc[X_filled['seconds_in_bucket'] != 0, ftr] = X_filled.loc[X_filled['seconds_in_bucket'] != 0, ftr].ffill()
        X_filled.loc[X_filled['seconds_in_bucket'] == 0, ftr] = X_filled.loc[X_filled['seconds_in_bucket'] == 0, ftr].bfill()
        
    y_filled = y.ffill()
    
    print('data dimensions:',X_filled.shape)
    perc_missing_per_ftr = X_filled.isnull().sum(axis=0)/X_filled.shape[0]
    print('fraction of missing values in features:')
    print(perc_missing_per_ftr[perc_missing_per_ftr > 0])

    print('data dimensions:',y_filled.shape)
    perc_missing_per_ftr = y_filled.isnull().sum(axis=0)/y_filled.shape[0]
    print('fraction of missing values in features:')
    print(perc_missing_per_ftr[perc_missing_per_ftr > 0])

    return X_filled, y_filled


In [64]:
X_train_0, y_train_0 = X_train_splits[0], y_train_splits[0]
X_train_0_filled, y_train_0_filled = handle_missing_values(X_train_0,y_train_0)

data dimensions: (3472700, 16)
fraction of missing values in features:
Series([], dtype: float64)
data dimensions: (3472700,)
fraction of missing values in features:
[]


In [101]:
# preprocessing

def preprocessing(X,y):

    # apply one-hot encoding to imbalance_buy_sell_flag
    one_hot_ftrs = ['imbalance_buy_sell_flag']

    # initialize the encoder
    enc = OneHotEncoder(sparse=False) 
    enc.fit(X)
    # transform X
    X_ohe = enc.fit(X[one_hot_ftrs])
    X_ohe = enc.transform(X[one_hot_ftrs])
    # print(X_ohe)
    print('X transformed')

    # apply StandardScaler to the continuous features for each stock on each day
    cont_ftrs = ['imbalance_size','reference_price','matched_size','far_price','near_price',\
                    'bid_price','bid_size','ask_price','ask_size','wap']
    scaler_x = StandardScaler()
    X_scaled = X.groupby(['stock_id', 'date_id']).apply(lambda group: pd.DataFrame(scaler_x.fit_transform(group[cont_ftrs]), columns=cont_ftrs, index=group.index))
    X_scaled.reset_index(inplace=True, drop=True)
    print(X_scaled.head())

    # apply StandardScaler to target
    scaler_y = StandardScaler()
    y_output = scaler_y.fit_transform(y.to_numpy().reshape(-1, 1))
    print(y_output)

    X_output = pd.concat([pd.DataFrame(X_ohe), X_scaled], axis=1, ignore_index=True)
    X_output.columns = ['bal_flag_1', 'bal_flag_0', 'bal_flag_m1'] + list(X_scaled.columns)
    # X_output = X_output.drop(columns=['imbalance_buy_sell_flag','target'])
    print(f'shape of X after preprocessing: {X_output.shape}')

    return X_output, y_output

In [100]:
preprocessing(X_train_0_filled, y_train_0_filled)



X transformed
   imbalance_size  reference_price  matched_size  far_price  near_price  \
0        1.298526         0.343407     -1.297211  -0.912871   -0.912871   
1        0.046580         0.839589     -0.961555  -0.912871   -0.912871   
2        0.046580         0.591498     -0.961555  -0.912871   -0.912871   
3        0.046580         1.087680     -0.961555  -0.912871   -0.912871   
4       -0.007714         1.834272     -0.946999  -0.912871   -0.912871   

   bid_price  bid_size  ask_price  ask_size       wap  
0   0.622490  0.779613   0.726751 -1.054830  0.876399  
1   0.622490 -0.726612   0.726751 -0.436060  0.628869  
2   0.622490 -1.027857   0.490391 -0.904995  0.514271  
3   1.125250  0.629378   0.963111  0.498064  1.071214  
4   1.630358 -0.705322   1.674400 -0.308761  1.602946  
[[-0.32250675]
 [-0.59298249]
 [-0.90469644]
 ...
 [-0.41488802]
 [ 0.53223026]
 [-0.88624611]]
shape of X after preprocessing: (3472700, 13)


(         bal_flag_1  bal_flag_0  bal_flag_m1  imbalance_size  reference_price  \
 0               0.0         0.0          1.0        1.298526         0.343407   
 1               1.0         0.0          0.0        0.046580         0.839589   
 2               1.0         0.0          0.0        0.046580         0.591498   
 3               1.0         0.0          0.0        0.046580         1.087680   
 4               0.0         1.0          0.0       -0.007714         1.834272   
 ...             ...         ...          ...             ...              ...   
 3472695         1.0         0.0          0.0       -1.092359         0.632430   
 3472696         0.0         0.0          1.0       -1.092382         0.632430   
 3472697         0.0         0.0          1.0       -1.092382         0.632430   
 3472698         1.0         0.0          0.0       -1.092382         0.632430   
 3472699         1.0         0.0          0.0       -1.092382         0.632430   
 
          matc