In [98]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, validation_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB

In [99]:
# Load in training and test set data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train = train.fillna(train.mean())
test_ids = test['id']
test = test.drop(['id'], axis=1)

# Replace NaN values with 0
# test = test.fillna(0)
test = test.fillna(test.mean())

In [100]:
# Preview training set data
train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,1.400583,1.964277,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [101]:
# Preview test set data
test.head()

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol
0,5178.4,5178.3,1.0,11.0,12.0,-10,5178.2,5178.0,5177.8,5177.6,...,3,16,3,1,1,1,4,1,5,2
1,5133.0,5132.9,1.0,0.0,1.0,1,5132.8,5132.6,5132.4,5132.2,...,1,2,2,1,10,4,8,1,1,1
2,5177.4,5178.0,2.0,0.0,2.0,2,5177.2,5176.8,5176.6,5176.4,...,8,18,1,1,6,7,4,1,2,5
3,5093.6,5093.9,0.0,2.0,2.0,-1,5093.2,5093.0,5092.8,5092.2,...,8,5,4,3,1,1,1,11,6,2
4,5189.2,5189.2,1.444629,1.936579,0.0,0,5188.8,5188.6,5188.4,5188.2,...,1,5,1,3,3,3,2,1,3,4


In [102]:
print(train.shape)
print(test.shape)

(592380, 28)
(191859, 26)


In [103]:
train_X, test_X, train_y, test_y = train_test_split(train.loc[:,'last_price':'ask5vol'], 
                                                    train.loc[:,'y'], test_size=0.2, random_state=155155155)

In [104]:
# Normalize training and validation data
min_max_scaler = preprocessing.MinMaxScaler()
train_norm = min_max_scaler.fit_transform(train_X)
train_X = pd.DataFrame(train_norm)
val_norm = min_max_scaler.fit_transform(test_X)
test_X = pd.DataFrame(val_norm)

# Normalize test set data
test_norm = min_max_scaler.fit_transform(test.to_numpy())
test = pd.DataFrame(test_norm)

In [105]:
# Linear Regression
lin_reg = LinearRegression().fit(train_X, train_y)
print(1 - lin_reg.score(train_X, train_y))
print(1 - lin_reg.score(test_X, test_y))

0.9460217643428674
3.786814551039572e+16


In [106]:
# Linear SVC 
svc = LinearSVC().fit(train_X, train_y)
print('Training classification error: ' + str(1 - svc.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - svc.score(test_X, test_y)))

Training classification error: 0.35424896181505117
Validation classification error: 0.3557851379182282


In [107]:
# Logistic Regression on original dataset
log_reg = LogisticRegression().fit(train.loc[:,'last_price':'ask5vol'], train.loc[:,'y'])
print('Training classification error: ' + str(1 - log_reg.score(train.loc[:,'last_price':'ask5vol'], train.loc[:,'y'])))
# print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

# Predict probabilities on validation set
lr_pred = log_reg.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
lr_pred = pd.DataFrame(lr_pred[:,1], columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_probs_whole_submission.csv", index=False)

Training classification error: 0.3556956683210102


In [108]:
# Logistic Regression on train and test split
log_reg = LogisticRegression(max_iter=100000).fit(train_X, train_y)
print('Training classification error: ' + str(1 - log_reg.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

# # Predict labels on validation set
# lr_pred = log_reg.predict(test.to_numpy())
# lr_pred = pd.DataFrame(lr_pred, columns=['Predicted'])
# lr_pred.insert(0, 'id', test_ids)
# lr_pred.to_csv("lr_labels_submission.csv", index=False)

# Predict probabilities on validation set
lr_pred = log_reg.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
lr_pred = pd.DataFrame(lr_pred[:,1], columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_probs_split_submission2.csv", index=False)

Training classification error: 0.35521962253958606
Validation classification error: 0.35717782504473483


In [109]:
# SGD Classifier - high errors
sgd_reg = SGDClassifier(early_stopping=True)
sgd_reg.fit(train_X, train_y)
print('Training classification error: ' + str(1 - sgd_reg.score(train_X, train_y)))
print('Test classification error: ' + str(1 - sgd_reg.score(test_X, test_y)))

Training classification error: 0.3566123096660927
Test classification error: 0.35834261791417676


In [110]:
# Gradient Boosting Classifier - best so far
grad_boost = GradientBoostingClassifier().fit(train_X, train_y)
print('Training classification error: ' + str(1 - grad_boost.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - grad_boost.score(test_X, test_y)))

# Predict probabilities on validation set
grad_pred = grad_boost.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
grad_pred = pd.DataFrame(grad_pred[:,1], columns=['Predicted'])
grad_pred.insert(0, 'id', test_ids)
grad_pred.to_csv("grad_probs_submission.csv", index=False)

Training classification error: 0.34522392720888617
Validation classification error: 0.3457746716634593


In [113]:
# Adaboost Classifier - predicted probabilities are similar
ada_boost = AdaBoostClassifier(n_estimators=100).fit(train_X, train_y)
print('Training classification error: ' + str(1 - ada_boost.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - ada_boost.score(test_X, test_y)))

# Predict probabilities on validation set
ada_pred = ada_boost.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
ada_pred = pd.DataFrame(ada_pred[:,1], columns=['Predicted'])
ada_pred.insert(0, 'id', test_ids)
ada_pred.to_csv("ada_probs_submission.csv", index=False)

Training classification error: 0.3476885613964009
Validation classification error: 0.34958134980924405


In [114]:
# Gaussian NB Classifier
gauss_boost = GaussianNB().fit(train_X, train_y)
print('Training classification error: ' + str(1 - gauss_boost.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - gauss_boost.score(test_X, test_y)))

Training classification error: 0.3726999561092542
Validation classification error: 0.4520240386238563


In [115]:
# Decision Tree Classifier
min_samples_leaf = np.arange(1, 26)

tree = DecisionTreeClassifier(random_state=0).fit(train_X, train_y)
print('Training classification error: ' + str(1 - tree.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - tree.score(test_X, test_y)))

Training classification error: 0.0035007090043552846
Validation classification error: 0.43754009250818726


In [116]:
# Random Forest
forest = RandomForestClassifier(random_state=0).fit(train_X, train_y)
print('Training classification error: ' + str(1 - forest.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - forest.score(test_X, test_y)))

# Predict probabilities on validation set
forest_pred = forest.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
forest_pred = pd.DataFrame(forest_pred[:,1], columns=['Predicted'])
forest_pred.insert(0, 'id', test_ids)
forest_pred.to_csv("forest_probs_submission.csv", index=False)

Training classification error: 0.0035070394003848815
Validation classification error: 0.3497586008980722


In [118]:
# Voting Classifier 1
estimators=[('grad_boost', grad_boost), ('lr', log_reg), ('ada_boost', ada_boost)]
#create our voting classifier, inputting our models
ensemble1 = VotingClassifier(estimators, voting='soft')
ensemble1.fit(train_X, train_y)
print('Training classification error: ' + str(1 - ensemble1.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - ensemble1.score(test_X, test_y)))

# Predict probabilities for 1st voting classifier
vot_pred1 = ensemble1.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
vot_pred1 = pd.DataFrame(vot_pred1[:,1], columns=['Predicted'])
vot_pred1.insert(0, 'id', test_ids)
vot_pred1.to_csv("voting_probs_submission1.csv", index=False)

Training classification error: 0.35018906782808334
Validation classification error: 0.3505773321178973


In [119]:
# Voting Classifier 2
estimators=[('grad_boost', grad_boost), ('rf', forest), ('lr', log_reg), ('decision', tree)]
#create our voting classifier, inputting our models
ensemble2 = VotingClassifier(estimators, voting='soft')
ensemble2.fit(train_X, train_y)
print('Training classification error: ' + str(1 - ensemble2.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - ensemble2.score(test_X, test_y)))

# Predict probabilities for 2nd voting classifier
vot_pred2 = ensemble2.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
vot_pred2 = pd.DataFrame(vot_pred2[:,1], columns=['Predicted'])
vot_pred2.insert(0, 'id', test_ids)
vot_pred2.to_csv("voting_probs_submission2.csv", index=False)

Training classification error: 0.005165603160133703
Validation classification error: 0.3937421925115635
