In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, validation_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB

In [9]:
# Load in training and test set data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train = train.dropna()
train = train.drop(['id'], axis=1)

# train = train.fillna(train.mean())
test_ids = test['id']
test = test.drop(['id'], axis=1)

# Replace NaN values with 0
# test = test.fillna(0)
test = test.fillna(test.mean())

train_data = train.drop(['y'], axis=1)
# Normalize training and test data
min_max_scaler = preprocessing.MinMaxScaler().fit(train_data)
train_data = pd.DataFrame(min_max_scaler.transform(train_data))
train_labels = pd.DataFrame(train.loc[:,'y']).to_numpy().ravel()
test = pd.DataFrame(min_max_scaler.transform(test))

train_X, test_X, train_y, test_y = train_test_split(train_data, train_labels, 
                                                    test_size=0.2, random_state=155155155)

(592380, 28)


In [49]:
# Preview training set data
train.head()

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,3842.0,...,6,11,1,6,1,4,4,1,13,0
2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,3843.0,...,1,4,21,12,1,16,10,4,9,0
3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,3842.0,...,13,12,2,4,2,7,1,2,11,1
4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,3841.8,...,12,2,2,4,1,3,1,11,15,1
5,3843.6,3844.2,12.0,17.0,29.0,-5,3843.8,3843.4,3843.2,3843.0,...,6,1,2,17,1,12,15,10,3,0


In [48]:
# Preview test set data
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,4.497696,4.495558,0.0125,0.051887,0.043651,0.728507,4.494737,4.494737,4.500659,4.500659,...,0.018349,0.108696,0.014815,0.0,0.0,0.0,0.022901,0.0,0.030075,0.007463
1,4.348255,4.346167,0.0125,0.0,0.0,0.778281,4.345395,4.345395,4.35112,4.35112,...,0.0,0.007246,0.007407,0.0,0.07563,0.023256,0.053435,0.0,0.0,0.0
2,4.494404,4.494571,0.025,0.0,0.003968,0.782805,4.491447,4.490789,4.496706,4.496706,...,0.06422,0.123188,0.0,0.0,0.042017,0.046512,0.022901,0.0,0.007519,0.029851
3,4.218565,4.217835,0.0,0.009434,0.003968,0.769231,4.215132,4.215132,4.220685,4.219368,...,0.06422,0.028986,0.022222,0.016949,0.0,0.0,0.0,0.075758,0.037594,0.007463
4,4.533246,4.531425,0.018058,0.009135,-0.003968,0.773756,4.529605,4.529605,4.535573,4.535573,...,0.0,0.028986,0.0,0.016949,0.016807,0.015504,0.007634,0.0,0.015038,0.022388


In [5]:
print(train.shape)
print(test.shape)
print(train_X.shape)
print(test_X.shape)

(419920, 27)
(191859, 26)
(335936, 26)
(83984, 26)


In [11]:
# Feature importances
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

feature = ExtraTreesClassifier(n_estimators=250, random_state=0).fit(train_data, train_labels)
importances = feature.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(train_data.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, train.columns[indices[f]], importances[indices[f]]))

Feature ranking:
1. feature ask1vol (0.060043)
2. feature bid1vol (0.050533)
3. feature bid3vol (0.046947)
4. feature bid2vol (0.046786)
5. feature ask3vol (0.046635)
6. feature ask2vol (0.046567)
7. feature bid4vol (0.046290)
8. feature bid5vol (0.046104)
9. feature ask4vol (0.045787)
10. feature ask5vol (0.045689)
11. feature last_price (0.044939)
12. feature d_open_interest (0.039219)
13. feature mid (0.034316)
14. feature transacted_qty (0.033044)
15. feature ask1 (0.032160)
16. feature ask5 (0.031706)
17. feature bid1 (0.031567)
18. feature bid5 (0.031268)
19. feature closed_position_qty (0.030791)
20. feature ask4 (0.030493)
21. feature bid4 (0.030170)
22. feature ask2 (0.030164)
23. feature ask3 (0.030009)
24. feature bid2 (0.029707)
25. feature bid3 (0.029617)
26. feature opened_position_qty  (0.029450)


In [6]:
# Linear Regression
lin_reg = LinearRegression().fit(train_X, train_y)
print('Training classification error: ' + str(1 - lin_reg.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - lin_reg.score(test_X, test_y)))

Training classification error: 0.9421030420052702
Validation classification error: 0.9414586293996641


In [9]:
# Linear SVC 
svc = LinearSVC().fit(train_X, train_y)
print('Training classification error: ' + str(1 - svc.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - svc.score(test_X, test_y)))

Training classification error: 0.3683201562202324
Validation classification error: 0.36711754619927606


In [10]:
# Logistic Regression on original dataset
log_reg = LogisticRegression().fit(train.loc[:,'last_price':'ask5vol'], train.loc[:,'y'])
print('Training classification error: ' + str(1 - log_reg.score(train.loc[:,'last_price':'ask5vol'], train.loc[:,'y'])))
# print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

# Predict probabilities on validation set
lr_pred = log_reg.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
lr_pred = pd.DataFrame(lr_pred[:,1], columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_probs_whole_submission.csv", index=False)

Training classification error: 0.36938702610020957


In [74]:
# Logistic Regression on train and test split
log_reg = LogisticRegression().fit(train_X, train_y)
print('Training classification error: ' + str(1 - log_reg.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

# # Predict labels on validation set
# lr_pred = log_reg.predict(test.to_numpy())
# lr_pred = pd.DataFrame(lr_pred, columns=['Predicted'])
# lr_pred.insert(0, 'id', test_ids)
# lr_pred.to_csv("lr_labels_submission.csv", index=False)

# Predict probabilities on validation set
lr_pred = log_reg.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
lr_pred = pd.DataFrame(lr_pred[:,1], columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_probs_split_submission3.csv", index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3693501143074871
Validation classification error: 0.3708920746808916


In [12]:
# SGD Classifier - high errors
sgd_reg = SGDClassifier(early_stopping=True)
sgd_reg.fit(train_X, train_y)
print('Training classification error: ' + str(1 - sgd_reg.score(train_X, train_y)))
print('Test classification error: ' + str(1 - sgd_reg.score(test_X, test_y)))

Training classification error: 0.37113021527910073
Test classification error: 0.37203514955229566


In [54]:
# Gradient Boosting Classifier - best so far
grad_boost = GradientBoostingClassifier().fit(train_X, train_y)
print('Training classification error: ' + str(1 - grad_boost.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - grad_boost.score(test_X, test_y)))

# Predict probabilities on validation set
grad_pred = grad_boost.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
grad_pred = pd.DataFrame(grad_pred[:,1], columns=['Predicted'])
grad_pred.insert(0, 'id', test_ids)
grad_pred.to_csv("grad_probs_submission.csv", index=False)

Training classification error: 0.3572168508287292
Validation classification error: 0.3605210516288817


In [None]:
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(GradientBoostingClassifier(), hyperF)
bestF = gridF.fit(train_X, train_y)

In [55]:
# Adaboost Classifier - predicted probabilities are similar
ada_boost = AdaBoostClassifier(n_estimators=100).fit(train_X, train_y)
print('Training classification error: ' + str(1 - ada_boost.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - ada_boost.score(test_X, test_y)))

# Predict probabilities on validation set
ada_pred = ada_boost.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
ada_pred = pd.DataFrame(ada_pred[:,1], columns=['Predicted'])
ada_pred.insert(0, 'id', test_ids)
ada_pred.to_csv("ada_probs_submission.csv", index=False)

Training classification error: 0.3593988140598209
Validation classification error: 0.361175938273957


In [15]:
# Gaussian NB Classifier
gauss_boost = GaussianNB().fit(train_X, train_y)
print('Training classification error: ' + str(1 - gauss_boost.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - gauss_boost.score(test_X, test_y)))

Training classification error: 0.38692489045532485
Validation classification error: 0.4119117927224233


In [80]:
# Decision Tree Classifier
min_samples_leaf = np.arange(1, 26)

tree = DecisionTreeClassifier(max_depth=5, min_samples_leaf=16, 
                              min_samples_split=2, class_weight={0: 0.5, 1: 1}, 
                              random_state=0).fit(train_X, train_y)
print('Training classification error: ' + str(1 - tree.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - tree.score(test_X, test_y)))
# Predict probabilities on validation set
tree_pred = tree.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
tree_pred = pd.DataFrame(tree_pred[:,1], columns=['Predicted'])
tree_pred.insert(0, 'id', test_ids)
tree_pred.to_csv("tree_probs_submission_weights.csv", index=False)

Training classification error: 0.42812619070299107
Validation classification error: 0.43185606782244235


In [17]:
# Random Forest
forest = RandomForestClassifier(random_state=0).fit(train_X, train_y)
print('Training classification error: ' + str(1 - forest.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - forest.score(test_X, test_y)))

# Predict probabilities on validation set
forest_pred = forest.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
forest_pred = pd.DataFrame(forest_pred[:,1], columns=['Predicted'])
forest_pred.insert(0, 'id', test_ids)
forest_pred.to_csv("forest_probs_submission.csv", index=False)

Training classification error: 0.00038400171461228094
Validation classification error: 0.3703681653648314


In [18]:
# Voting Classifier 1
estimators=[('grad_boost', grad_boost), ('lr', log_reg), ('ada_boost', ada_boost)]
#create our voting classifier, inputting our models
ensemble1 = VotingClassifier(estimators, voting='soft')
ensemble1.fit(train_X, train_y)
print('Training classification error: ' + str(1 - ensemble1.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - ensemble1.score(test_X, test_y)))

# Predict probabilities for 1st voting classifier
vot_pred1 = ensemble1.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
vot_pred1 = pd.DataFrame(vot_pred1[:,1], columns=['Predicted'])
vot_pred1.insert(0, 'id', test_ids)
vot_pred1.to_csv("voting_probs_submission1.csv", index=False)

Training classification error: 0.3630691560297199
Validation classification error: 0.3623904553248237


In [72]:
# Voting Classifier 2
estimators=[('grad_boost', grad_boost), ('lr', log_reg), ('decision', tree)]
#create our voting classifier, inputting our models
ensemble2 = VotingClassifier(estimators, voting='soft')
ensemble2.fit(train_X, train_y)
print('Training classification error: ' + str(1 - ensemble2.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - ensemble2.score(test_X, test_y)))

# Predict probabilities for 2nd voting classifier
vot_pred2 = ensemble2.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
vot_pred2 = pd.DataFrame(vot_pred2[:,1], columns=['Predicted'])
vot_pred2.insert(0, 'id', test_ids)
vot_pred2.to_csv("voting_probs_submission2.csv", index=False)

Training classification error: 0.36692405696323105
Validation classification error: 0.3681772718613069
