In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, validation_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier

In [2]:
# Load in training and test set data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train = train.dropna()
test_ids = test['id']
test = test.drop(['id'], axis=1)

# Replace NaN values with 0
test = test.fillna(test.mean())

In [3]:
# Preview training set data
train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1
5,5,3843.6,3844.2,12.0,17.0,29.0,-5,3843.8,3843.4,3843.2,...,6,1,2,17,1,12,15,10,3,0


In [27]:
# Preview test set data
test.head()

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol
0,5178.4,5178.3,1.0,11.0,12.0,-10,5178.2,5178.0,5177.8,5177.6,...,3,16,3,1,1,1,4,1,5,2
1,5133.0,5132.9,1.0,0.0,1.0,1,5132.8,5132.6,5132.4,5132.2,...,1,2,2,1,10,4,8,1,1,1
2,5177.4,5178.0,2.0,0.0,2.0,2,5177.2,5176.8,5176.6,5176.4,...,8,18,1,1,6,7,4,1,2,5
3,5093.6,5093.9,0.0,2.0,2.0,-1,5093.2,5093.0,5092.8,5092.2,...,8,5,4,3,1,1,1,11,6,2
4,5189.2,5189.2,1.444629,1.936579,0.0,0,5188.8,5188.6,5188.4,5188.2,...,1,5,1,3,3,3,2,1,3,4


In [4]:
print(train.shape)
print(test.shape)

(419920, 28)
(191859, 26)


In [3]:
train_X, test_X, train_y, test_y = train_test_split(train.loc[:,'last_price':'ask5vol'], 
                                                    train.loc[:,'y'], test_size=0.2, random_state=155155155)

In [4]:
# Normalize training and validation data
min_max_scaler = preprocessing.MinMaxScaler()
train_norm = min_max_scaler.fit_transform(train_X)
train_X = pd.DataFrame(train_norm)
val_norm = min_max_scaler.fit_transform(test_X)
test_X = pd.DataFrame(val_norm)

# Normalize test set data
test_norm = min_max_scaler.fit_transform(test.to_numpy())
test = pd.DataFrame(test_norm)

In [5]:
log_reg = LogisticRegression(max_iter=100000)
sgd_reg = SGDClassifier()
dec_tree = DecisionTreeClassifier(max_depth=9)

estimators = [('logistic', log_reg),('sgd', sgd_reg), ('dec_tree', dec_tree)]

reg = StackingClassifier(estimators=estimators, 
                        final_estimator=GradientBoostingClassifier(random_state=42))
reg.fit(train_X, train_y)
print("finished")

KeyboardInterrupt: 

In [18]:
print(reg.score(train_X, train_y))
print(1 - reg.score(train_X, train_y))
print(reg.score(test_X, test_y))
print(1 - reg.score(test_X, test_y))

0.6422175652505239
0.35778243474947613
0.6361687940560107
0.3638312059439893


In [30]:
log_reg = LogisticRegression(max_iter=1000000)
sgd_reg = SGDClassifier(early_stopping=True)
# dec_tree = RandomForestClassifier()

estimators = [('logistic', log_reg),('sgd', sgd_reg)]

reg = StackingClassifier(estimators=estimators, 
                        final_estimator=LogisticRegression(random_state=42),
                        n_jobs=8)
reg.fit(train.loc[:,'last_price':'ask5vol'], train.loc[:,'y'])
print("finished")

finished


In [31]:
# Predict probabilities on validation set
pred = reg.predict_proba(test.to_numpy())

print(pred)

# Probabilities of being labeled 1
pred = pd.DataFrame(pred[:,1], columns=['Predicted'])
pred.insert(0, 'id', test_ids)
pred.to_csv("stacked_probs_submission.csv", index=False)

[[0.49999699 0.50000301]
 [0.50002717 0.49997283]
 [0.50002033 0.49997967]
 ...
 [0.5000231  0.4999769 ]
 [0.50001844 0.49998156]
 [0.50001694 0.49998306]]


In [22]:
log_reg = LinearRegression(n_jobs=16)
log_reg.fit(train_X, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=16, normalize=False)

In [11]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32,32,32,32,32), random_state=1, early_stopping=True, max_iter=500)

nn.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(32, 32, 32, 32, 32), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [14]:
print(1 - nn.score(train_X,train_y))
print(1 - nn.score(test_X,test_y))

0.3628399457039436
0.3669032196608878


In [15]:
# Predict probabilities on validation set
pred = nn.predict_proba(test.to_numpy())

print(pred)

# Probabilities of being labeled 1
pred = pd.DataFrame(pred[:,1], columns=['Predicted'])
pred.insert(0, 'id', test_ids)
pred.to_csv("nn_probs_submission.csv", index=False)

[[0.29833274 0.70166726]
 [0.70552216 0.29447784]
 [0.65499622 0.34500378]
 ...
 [0.33314516 0.66685484]
 [0.66562678 0.33437322]
 [0.31312016 0.68687984]]


In [19]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32,32,32,32,32), random_state=1, early_stopping=True, max_iter=500)

nn.fit(train.loc[:,'last_price':'ask5vol'], train.loc[:,'y'])

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(32, 32, 32, 32, 32), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [20]:
# Predict probabilities on validation set
pred = nn.predict_proba(test.to_numpy())

print(pred)

# Probabilities of being labeled 1
pred = pd.DataFrame(pred[:,1], columns=['Predicted'])
pred.insert(0, 'id', test_ids)
pred.to_csv("nn2_probs_submission.csv", index=False)

[[0.44734814 0.55265186]
 [0.45563805 0.54436195]
 [0.45173546 0.54826454]
 ...
 [0.4519228  0.5480772 ]
 [0.46173113 0.53826887]
 [0.45361245 0.54638755]]


In [28]:
rndf = RandomForestClassifier(criterion='gini', max_depth=20, n_estimators=400).fit(train_X, train_y)

In [31]:
print(rndf.score(train_X,train_y))
print(rndf.score(test_X,test_y))

0.8566750809678034
0.6400742998666412


In [32]:
# Predict probabilities on validation set
pred = rndf.predict_proba(test.to_numpy())

print(pred)

# Probabilities of being labeled 1
pred = pd.DataFrame(pred[:,1], columns=['Predicted'])
pred.insert(0, 'id', test_ids)
pred.to_csv("rndf_probs_submission.csv", index=False)

[[0.455     0.545    ]
 [0.4825    0.5175   ]
 [0.4825    0.5175   ]
 ...
 [0.4703125 0.5296875]
 [0.465     0.535    ]
 [0.4825    0.5175   ]]


Learning rate set to 0.092572
0:	learn: 0.6831796	total: 117ms	remaining: 1m 57s
1:	learn: 0.6751798	total: 171ms	remaining: 1m 25s
2:	learn: 0.6684406	total: 247ms	remaining: 1m 22s
3:	learn: 0.6630481	total: 302ms	remaining: 1m 15s
4:	learn: 0.6583795	total: 357ms	remaining: 1m 11s
5:	learn: 0.6546202	total: 407ms	remaining: 1m 7s
6:	learn: 0.6516004	total: 472ms	remaining: 1m 7s
7:	learn: 0.6489808	total: 538ms	remaining: 1m 6s
8:	learn: 0.6467576	total: 591ms	remaining: 1m 5s
9:	learn: 0.6447904	total: 649ms	remaining: 1m 4s
10:	learn: 0.6434202	total: 721ms	remaining: 1m 4s
11:	learn: 0.6420961	total: 777ms	remaining: 1m 3s
12:	learn: 0.6411269	total: 835ms	remaining: 1m 3s
13:	learn: 0.6402227	total: 888ms	remaining: 1m 2s
14:	learn: 0.6394021	total: 952ms	remaining: 1m 2s
15:	learn: 0.6386984	total: 1.01s	remaining: 1m 2s
16:	learn: 0.6381353	total: 1.07s	remaining: 1m 1s
17:	learn: 0.6376049	total: 1.12s	remaining: 1m
18:	learn: 0.6371512	total: 1.18s	remaining: 1m
19:	learn: 0

164:	learn: 0.6318484	total: 10.8s	remaining: 54.7s
165:	learn: 0.6318331	total: 10.9s	remaining: 54.6s
166:	learn: 0.6318090	total: 10.9s	remaining: 54.5s
167:	learn: 0.6318008	total: 11s	remaining: 54.4s
168:	learn: 0.6317721	total: 11.1s	remaining: 54.4s
169:	learn: 0.6317527	total: 11.1s	remaining: 54.3s
170:	learn: 0.6317333	total: 11.2s	remaining: 54.3s
171:	learn: 0.6317219	total: 11.3s	remaining: 54.3s
172:	learn: 0.6317012	total: 11.3s	remaining: 54.2s
173:	learn: 0.6316744	total: 11.4s	remaining: 54.1s
174:	learn: 0.6316494	total: 11.5s	remaining: 54s
175:	learn: 0.6316323	total: 11.5s	remaining: 54s
176:	learn: 0.6316027	total: 11.6s	remaining: 53.9s
177:	learn: 0.6315883	total: 11.6s	remaining: 53.8s
178:	learn: 0.6315670	total: 11.7s	remaining: 53.7s
179:	learn: 0.6315521	total: 11.8s	remaining: 53.6s
180:	learn: 0.6315361	total: 11.8s	remaining: 53.5s
181:	learn: 0.6315085	total: 11.9s	remaining: 53.4s
182:	learn: 0.6314864	total: 11.9s	remaining: 53.3s
183:	learn: 0.6314

326:	learn: 0.6290170	total: 21.5s	remaining: 44.2s
327:	learn: 0.6289941	total: 21.5s	remaining: 44.1s
328:	learn: 0.6289759	total: 21.6s	remaining: 44.1s
329:	learn: 0.6289585	total: 21.7s	remaining: 44s
330:	learn: 0.6289421	total: 21.7s	remaining: 43.9s
331:	learn: 0.6289274	total: 21.8s	remaining: 43.8s
332:	learn: 0.6289042	total: 21.9s	remaining: 43.8s
333:	learn: 0.6288929	total: 21.9s	remaining: 43.7s
334:	learn: 0.6288762	total: 22s	remaining: 43.7s
335:	learn: 0.6288633	total: 22.1s	remaining: 43.6s
336:	learn: 0.6288544	total: 22.1s	remaining: 43.5s
337:	learn: 0.6288286	total: 22.2s	remaining: 43.5s
338:	learn: 0.6288121	total: 22.3s	remaining: 43.4s
339:	learn: 0.6287962	total: 22.3s	remaining: 43.3s
340:	learn: 0.6287809	total: 22.4s	remaining: 43.3s
341:	learn: 0.6287668	total: 22.4s	remaining: 43.2s
342:	learn: 0.6287489	total: 22.5s	remaining: 43.1s
343:	learn: 0.6287317	total: 22.6s	remaining: 43s
344:	learn: 0.6287175	total: 22.6s	remaining: 43s
345:	learn: 0.628706

487:	learn: 0.6266138	total: 32.1s	remaining: 33.6s
488:	learn: 0.6266041	total: 32.1s	remaining: 33.6s
489:	learn: 0.6265897	total: 32.2s	remaining: 33.5s
490:	learn: 0.6265745	total: 32.2s	remaining: 33.4s
491:	learn: 0.6265575	total: 32.3s	remaining: 33.4s
492:	learn: 0.6265410	total: 32.4s	remaining: 33.3s
493:	learn: 0.6265243	total: 32.4s	remaining: 33.2s
494:	learn: 0.6265055	total: 32.5s	remaining: 33.1s
495:	learn: 0.6264899	total: 32.5s	remaining: 33.1s
496:	learn: 0.6264750	total: 32.6s	remaining: 33s
497:	learn: 0.6264660	total: 32.7s	remaining: 32.9s
498:	learn: 0.6264509	total: 32.7s	remaining: 32.9s
499:	learn: 0.6264403	total: 32.8s	remaining: 32.8s
500:	learn: 0.6264283	total: 32.9s	remaining: 32.7s
501:	learn: 0.6264048	total: 32.9s	remaining: 32.7s
502:	learn: 0.6263932	total: 33s	remaining: 32.6s
503:	learn: 0.6263826	total: 33.1s	remaining: 32.6s
504:	learn: 0.6263640	total: 33.2s	remaining: 32.5s
505:	learn: 0.6263494	total: 33.2s	remaining: 32.4s
506:	learn: 0.62

648:	learn: 0.6243529	total: 42.8s	remaining: 23.1s
649:	learn: 0.6243390	total: 42.9s	remaining: 23.1s
650:	learn: 0.6243216	total: 43s	remaining: 23s
651:	learn: 0.6243089	total: 43.1s	remaining: 23s
652:	learn: 0.6242892	total: 43.1s	remaining: 22.9s
653:	learn: 0.6242802	total: 43.2s	remaining: 22.9s
654:	learn: 0.6242695	total: 43.3s	remaining: 22.8s
655:	learn: 0.6242614	total: 43.3s	remaining: 22.7s
656:	learn: 0.6242461	total: 43.4s	remaining: 22.7s
657:	learn: 0.6242292	total: 43.5s	remaining: 22.6s
658:	learn: 0.6242173	total: 43.5s	remaining: 22.5s
659:	learn: 0.6242065	total: 43.6s	remaining: 22.5s
660:	learn: 0.6242011	total: 43.6s	remaining: 22.4s
661:	learn: 0.6241869	total: 43.7s	remaining: 22.3s
662:	learn: 0.6241735	total: 43.8s	remaining: 22.3s
663:	learn: 0.6241581	total: 43.8s	remaining: 22.2s
664:	learn: 0.6241469	total: 43.9s	remaining: 22.1s
665:	learn: 0.6241373	total: 44s	remaining: 22.1s
666:	learn: 0.6241224	total: 44s	remaining: 22s
667:	learn: 0.6241149	to

810:	learn: 0.6222466	total: 53.4s	remaining: 12.5s
811:	learn: 0.6222365	total: 53.5s	remaining: 12.4s
812:	learn: 0.6222224	total: 53.6s	remaining: 12.3s
813:	learn: 0.6222121	total: 53.6s	remaining: 12.3s
814:	learn: 0.6221922	total: 53.7s	remaining: 12.2s
815:	learn: 0.6221804	total: 53.8s	remaining: 12.1s
816:	learn: 0.6221624	total: 53.8s	remaining: 12.1s
817:	learn: 0.6221545	total: 53.9s	remaining: 12s
818:	learn: 0.6221389	total: 53.9s	remaining: 11.9s
819:	learn: 0.6221272	total: 54s	remaining: 11.9s
820:	learn: 0.6221195	total: 54.1s	remaining: 11.8s
821:	learn: 0.6220995	total: 54.1s	remaining: 11.7s
822:	learn: 0.6220895	total: 54.2s	remaining: 11.7s
823:	learn: 0.6220781	total: 54.3s	remaining: 11.6s
824:	learn: 0.6220599	total: 54.3s	remaining: 11.5s
825:	learn: 0.6220446	total: 54.4s	remaining: 11.5s
826:	learn: 0.6220290	total: 54.5s	remaining: 11.4s
827:	learn: 0.6220111	total: 54.5s	remaining: 11.3s
828:	learn: 0.6220011	total: 54.6s	remaining: 11.3s
829:	learn: 0.62

969:	learn: 0.6201800	total: 1m 3s	remaining: 1.96s
970:	learn: 0.6201674	total: 1m 3s	remaining: 1.9s
971:	learn: 0.6201588	total: 1m 3s	remaining: 1.83s
972:	learn: 0.6201447	total: 1m 3s	remaining: 1.76s
973:	learn: 0.6201345	total: 1m 3s	remaining: 1.7s
974:	learn: 0.6201257	total: 1m 3s	remaining: 1.63s
975:	learn: 0.6201116	total: 1m 3s	remaining: 1.57s
976:	learn: 0.6201014	total: 1m 3s	remaining: 1.5s
977:	learn: 0.6200866	total: 1m 3s	remaining: 1.44s
978:	learn: 0.6200751	total: 1m 3s	remaining: 1.37s
979:	learn: 0.6200624	total: 1m 3s	remaining: 1.3s
980:	learn: 0.6200512	total: 1m 4s	remaining: 1.24s
981:	learn: 0.6200376	total: 1m 4s	remaining: 1.18s
982:	learn: 0.6200269	total: 1m 4s	remaining: 1.11s
983:	learn: 0.6200122	total: 1m 4s	remaining: 1.04s
984:	learn: 0.6200070	total: 1m 4s	remaining: 979ms
985:	learn: 0.6200014	total: 1m 4s	remaining: 914ms
986:	learn: 0.6199831	total: 1m 4s	remaining: 849ms
987:	learn: 0.6199708	total: 1m 4s	remaining: 784ms
988:	learn: 0.61