### Data Loading

In [1]:
from pandas import read_csv

df_train = read_csv('train.csv')
df_test = read_csv('test.csv')

`opened_position_qty ` and `closed_position_qty` are sometimes empty, but we can derive them from other features.

In [2]:
def fix_position_qty_columns(df):
    df['opened_position_qty '] = (df['transacted_qty']+df['d_open_interest'])/2
    df['closed_position_qty'] = (df['transacted_qty']-df['d_open_interest'])/2
fix_position_qty_columns(df_train)
fix_position_qty_columns(df_test)

### Feature Engineering
From Kaggle, the starting features are:
- `id` - The timestep ID of the order book features. In the training set, id is ordered chronologically. However, in the test set, they are scrambled so that you cannot simply use the order book features of future data to make fake "predictions" in previous timesteps. To keep the competition more of a ML problem and less of a math problem, this scrambling also prevents timeseries modelling. If you would like, you could try to unscramble the ordering using some ML.
- `last_price` - the price at which the most recent order fill occurred.
- `mid` - the "mid" price, which is the average of the best bid (bid1) and the best ask (ask1) prices.
- `opened_position_qty ` - In the past 500ms, how many buy orders were filled?
- `closed_position_qty` - In the past 500ms, how many sell orders were filled?
- `transacted_qty` - In the past 500ms, how many buy+sell orders were filled?
- `d_open_interest` - In the past 500ms, what is (#buy orders filled)- (#sell orders filled)?
- `bid1` - What is the 1st bid price (the best/highest one)?
- `bid[2,3,4,5]` - What is the [2nd, 3rd, 4th, 5th] best/highest bid price?
- `ask1` - What is the 1st ask price (the best/lowest/cheapest one)?
- `ask[2,3,4,5]` - What is the [2nd, 3rd, 4th, 5th] best/lowest/cheapest ask price?
- `bid1vol` - What is the quantity of contracts in the order book at the 1st bid price (the best/highest one)?
- `bid[2,3,4,5]vol` - What is the quantity of contracts in the order book at the [2,3,4,5]th bid price (the [2,3,4,5]th best/highest one)?
- `ask1vol` - What is the quantity of contracts in the order book at the 1st ask price (the best/lowest/cheapest one)?
- `ask[2,3,4,5]vol` - What is the quantity of contracts in the order book at the [2,3,4,5]th ask price (the [2,3,4,5]th best/lowest/cheapest one)?
- `y` (unique to training data) - What is the change in the mid price from now to 2 timesteps (approx. 1 second) in the future? "1" means this change is strictly positive, and "0" means the change is 0 or negative.

In [3]:
import copy

def add_features(df):
    for i in range(1,6):
        for n in ('bid', 'ask'):
            df[f'{n}{i}mul'] = df[f'{n}{i}'] * df[f'{n}{i}vol']
    df['lpdm'] = df['last_price']/df['mid']
    df['spread'] = df['ask1']-df['bid1']
    bidvol = df['bid1vol']
    askvol = df['ask1vol']
    for i in range(2,6):
        bidvol += df[f'bid{i}vol']
        askvol += df[f'ask{i}vol']
    df['buy_fill_prop'] = df['opened_position_qty ']/bidvol
    df['sell_fill_prop'] = df['closed_position_qty']/askvol
    df['bid_ask_ratio'] = bidvol/askvol

def add_features2(df):
    cols = copy.copy(df.columns)
    for i in range(len(cols)-1):
        n1 = cols[i]
        if n1 in ('id', 'y'):
            continue
        for j in range(i+1, len(cols)):
            n2 = cols[j]
            if n2 in ('id', 'y'):
                continue
            p = f'{i},{j},'
            df[f'{p}*'] = df[n1]*df[n2]
            df[f'{p}/'] = df[n1]/df[n2]
    
add_features2(df_train)
add_features2(df_test)

### Cross-Validation Setup

In [22]:
import numpy as np

trainlen = 100000
fsz = trainlen//20
gap = 200000

idx_folds = [(np.concatenate((np.arange(0, i*fsz), np.arange((i+1)*fsz, trainlen))),
                 np.arange(i*fsz, (i+1)*fsz))
                for i in range(20)]

df_train_lean = df_train.drop(columns=['id', 'y']).to_numpy()
df_train_y = df_train['y'].to_numpy()

test_x = df_test.drop(columns='id').to_numpy()
test_id = df_test['id']

# 'global' validation set, never used for training
g_val_x = df_train_lean[trainlen+gap:,:]
g_val_y = df_train_y[trainlen+gap:]

# 'global' training set, includes cross-validation sets
g_train_x = df_train_lean[:trainlen,:]
g_train_y = df_train_y[:trainlen]

### Data Normalization

In [5]:
col_stds = np.std(df_train_lean, 0)

g_train_x = (g_train_x-np.mean(g_train_x, 0))/col_stds
g_val_x = (g_val_x-np.mean(g_val_x, 0))/col_stds
test_x = (test_x-np.mean(test_x, 0))/col_stds

  x = asanyarray(arr - arrmean)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


### Training XGBoost Models
- https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [29]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

xgb_params = {
    'eval_metric' : 'auc',
    'objective' : 'binary:logistic',
    'max_depth' : 3,
    'learning_rate' : 0.15,
    'subsample' : 0.5,
    'colsample_bytree' : 0.5,
}

xgb_ypred_train = np.zeros((len(g_train_y), len(idx_folds)))
xgb_ypred_val = np.zeros((len(g_val_y), len(idx_folds)))
xgb_ypred_test = np.zeros((len(test_id), len(idx_folds)))

xgb_dgval = xgb.DMatrix(g_val_x, label=g_val_y)
xgb_dtest = xgb.DMatrix(test_x)

fi = None

for f, (idx_train, idx_val) in enumerate(idx_folds):
    dtrain = xgb.DMatrix(g_train_x[idx_train], label=g_train_y[idx_train])
    dval = xgb.DMatrix(g_train_x[idx_val], label=g_train_y[idx_val])
    bst = xgb.train(xgb_params, dtrain, 100,
                    [(dtrain, 'train'), (dval, 'local-val'), (xgb_dgval, 'eval')],
                    early_stopping_rounds=20, verbose_eval=False)
    xgb_ypred_train[idx_val,f] = bst.predict(dval)
    xgb_ypred_val[:,f] = bst.predict(xgb_dgval)
    xgb_ypred_test[:,f] = bst.predict(xgb_dtest)
    auc_local = roc_auc_score(g_train_y[idx_val], xgb_ypred_train[idx_val,f])
    auc_global = roc_auc_score(g_val_y, xgb_ypred_val[:,f])
    print(f'Fold {f+1} finished: local val={auc_local}, global val={auc_global}')
    fi = bst.get_score(importance_type='gain')

xgb_val_score = roc_auc_score(g_val_y, np.mean(xgb_ypred_val, 1))
print(f'Ensemble score (pred. means): {xgb_val_score}')

Fold 1 finished: local val=0.6446323230992738, global val=0.6529872991891376


KeyboardInterrupt: 

### Training Ridge Regressors

In [12]:
from sklearn.linear_model import Ridge
from scipy.special import expit as sigmoid

ridge = Ridge()

ridge_ypred_train = np.zeros((len(g_train_y), len(idx_folds)))
ridge_ypred_val = np.zeros((len(g_val_y), len(idx_folds)))
ridge_ypred_test = np.zeros((len(test_id), len(idx_folds)))

for f, (idx_train, idx_val) in enumerate(idx_folds):
    ridge.fit(g_train_x[idx_train], g_train_y[idx_train])
    ridge_ypred_train[idx_val,f] = sigmoid(ridge.predict(g_train_x[idx_val]))
    ridge_ypred_val[:,f] = sigmoid(ridge.predict(g_val_x))
    ridge_ypred_test[:,f] = sigmoid(ridge.predict(test_x))
    auc_local = roc_auc_score(g_train_y[idx_val], ridge_ypred_train[idx_val,f])
    auc_global = roc_auc_score(g_val_y, ridge_ypred_val[:,f])
    print(f'Fold {f+1} finished: local val={auc_local}, global val={auc_global}')

ridge_val_score = roc_auc_score(g_val_y, np.mean(ridge_ypred_val, 1))
print(f'Ensemble score (pred. means): {ridge_val_score}')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Train Ensemble Predictor

In [24]:
ens_params = {
    'eval_metric' : 'auc',
    'objective' : 'binary:logistic',
    'max_depth' : 3,
    'learning_rate' : 0.2,
    'subsample' : 0.5,
    'colsample_bytree' : 0.5,
    'lambda' : 0
}

dpred_train = xgb.DMatrix(np.concatenate((xgb_ypred_train, ridge_ypred_train), 1), label=g_train_y)
dpred_val = xgb.DMatrix(np.concatenate((xgb_ypred_val, ridge_ypred_val), 1), label=g_val_y)
dpred_test = xgb.DMatrix(np.concatenate((xgb_ypred_test, ridge_ypred_test), 1))

ens_bst = xgb.train(ens_params, dpred_train, 100, 
                 [(dpred_train, 'train'), (dpred_val, 'eval')],
                 early_stopping_rounds=15, verbose_eval=10)
ens_val_score = roc_auc_score(g_val_y, ens_bst.predict(dpred_val))
ens_ypred_test = ens_bst.predict(dpred_test)
print(f'Score: {ens_val_score}')

[0]	train-auc:0.514782	eval-auc:0.624998
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 15 rounds.
[10]	train-auc:0.573696	eval-auc:0.64742
[20]	train-auc:0.5932	eval-auc:0.652541
[30]	train-auc:0.595401	eval-auc:0.652629
[40]	train-auc:0.601448	eval-auc:0.652726
[50]	train-auc:0.602061	eval-auc:0.65289
[60]	train-auc:0.602966	eval-auc:0.65287
Stopping. Best iteration:
[48]	train-auc:0.601867	eval-auc:0.652894

Score: 0.6528587130315405


### Save Predictions

In [26]:
from pandas import DataFrame
submission_df = DataFrame()
submission_df['id'] = test_id
submission_df['Predicted'] = ens_ypred_test
submission_df.to_csv('submission.csv', index=False)

In [46]:
fi

{'f535': 1811.1331833,
 'f205': 425.41577149999995,
 'f315': 506.289062,
 'f39': 92.33435825000001,
 'f467': 154.19348584117643,
 'f51': 35.0949173,
 'f475': 425.498596,
 'f457': 2899.2002,
 'f27': 166.5925873277778,
 'f501': 146.51183215,
 'f247': 267.69910757,
 'f416': 302.313965,
 'f469': 82.11359074999999,
 'f595': 356.5757273151516,
 'f97': 203.11924360000003,
 'f37': 198.44230643124996,
 'f47': 111.29022092222222,
 'f309': 140.80539974,
 'f227': 113.046143,
 'f114': 1398.14404,
 'f49': 90.42183813333334,
 'f395': 122.33252705,
 'f161': 326.328979,
 'f557': 1156.10474,
 'f597': 238.56445300000001,
 'f87': 57.809205559999995,
 'f511': 347.54969800000003,
 'f307': 138.53743678888887,
 'f299': 44.020883500000004,
 'f473': 71.09305665,
 'f99': 140.2539216333333,
 'f196': 40.437332839999996,
 'f223': 84.3161011,
 'f238': 78.922581175,
 'f175': 65.3563232,
 'f236': 114.30585606666666,
 'f95': 65.41790358,
 'f91': 34.40648463333334,
 'f249': 57.610022900000004,
 'f611': 96.86212556666668

In [52]:
fi_sorted = []
df_train_dropped = df_train.drop(columns=['id', 'y'])
for l, v in fi.items():
    cols_drop = df_train_dropped.columns
    cols = df_train.columns
    n = cols_drop[int(l[1:])]
    if ',' not in n:
        fi_sorted.append((v, n, '', ''))
        continue
    f1, f2, op = n.split(',')
    fi_sorted.append((v, cols[int(f1)], cols[int(f2)], op))
fi_sorted.sort(key=lambda x: x[0], reverse=True)
df_fi = DataFrame()
for i in range(4):
    df_fi[['gain', 'f1', 'f2', 'op'][i]] = [x[i] for x in fi_sorted]
df_fi.to_csv('important-features.csv', index=False)

In [55]:
df_fi


def add_features3(data, features):
    for i in range(len(features)):
        f1, f2, op = [features[x][i] for x in features.columns[1:]]
        if op == '':
            continue
        if op == '+':
            data[f'{f1},{f2},{op}'] = data[f1] + data[f2]
        if op == '-':
            data[f'{f1},{f2},{op}'] = data[f1] - data[f2]
        if op == '*':
            data[f'{f1},{f2},{op}'] = data[f1] * data[f2]
        if op == '/':
            data[f'{f1},{f2},{op}'] = data[f1] / data[f2]
add_features3(df_train_dropped, df_fi)
        