In [61]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, validation_curve
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [72]:
# Load in training and test set data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train = train.dropna()
test_ids = test['id']
test = test.drop(['id'], axis=1)

# Replace NaN values with 0
test = test.fillna(0)

In [52]:
# Preview training set data
train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1
5,5,3843.6,3844.2,12.0,17.0,29.0,-5,3843.8,3843.4,3843.2,...,6,1,2,17,1,12,15,10,3,0


In [53]:
# Preview test set data
test.head()

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol
0,5178.4,5178.3,1.0,11.0,12.0,-10,5178.2,5178.0,5177.8,5177.6,...,3,16,3,1,1,1,4,1,5,2
1,5133.0,5132.9,1.0,0.0,1.0,1,5132.8,5132.6,5132.4,5132.2,...,1,2,2,1,10,4,8,1,1,1
2,5177.4,5178.0,2.0,0.0,2.0,2,5177.2,5176.8,5176.6,5176.4,...,8,18,1,1,6,7,4,1,2,5
3,5093.6,5093.9,0.0,2.0,2.0,-1,5093.2,5093.0,5092.8,5092.2,...,8,5,4,3,1,1,1,11,6,2
4,5189.2,5189.2,0.0,0.0,0.0,0,5188.8,5188.6,5188.4,5188.2,...,1,5,1,3,3,3,2,1,3,4


In [6]:
print(train.shape)
print(test.shape)

(419920, 27)
(191859, 26)


In [73]:
train_X, test_X, train_y, test_y = train_test_split(train.loc[:,'last_price':'ask5vol'], 
                                                    train.loc[:,'y'], test_size=0.2, random_state=155155155)

In [74]:
# Normalize training and validation data
min_max_scaler = preprocessing.MinMaxScaler()
train_norm = min_max_scaler.fit_transform(train_X)
train_X = pd.DataFrame(train_norm)
val_norm = min_max_scaler.fit_transform(test_X)
test_X = pd.DataFrame(val_norm)

# Normalize test set data
test_norm = min_max_scaler.fit_transform(test.to_numpy())
test = pd.DataFrame(test_norm)

In [10]:
# lin_reg = LinearRegression().fit(train_X, train_y)
# print(lin_reg.score(train_X, train_y))
# print(lin_reg.score(test_X, test_y))

In [59]:
# Logistic Regression 
log_reg = LogisticRegression().fit(train_X, train_y)
print('Training classification error: ' + str(1 - log_reg.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

# Predict labels on validation set
lr_pred = log_reg.predict(test.to_numpy())
lr_pred = pd.DataFrame(lr_pred, columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_labels_submission.csv", index=False)

# Predict probabilities on validation set
lr_pred = log_reg.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
lr_pred = pd.DataFrame(lr_pred[:,1], columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_probs_submission.csv", index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3692697418555916
Validation classification error: 0.36890360068584493


In [12]:
lr_pred.shape

(191859, 2)

In [13]:
# for i in range(9):
#     C = 0.00001 * 10**i
#     log_reg = LogisticRegression(C=C).fit(train_X, train_y)
#     print('For C = %0.5f' % C)
#     print('Training classification error: ' + str(1 - log_reg.score(train_X, train_y)))
#     print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

In [14]:
# SGD Regressor - high errors
sgd_reg = SGDRegressor(early_stopping=True)
sgd_reg.fit(train_X, train_y)
print(sgd_reg.score(train_X, train_y))
print(sgd_reg.score(test_X, test_y))

-4.959032900427136e+33
-4.9540569403385874e+33


In [19]:
# # GradientBoostingRegressor - high errors
# grad_boost = GradientBoostingClassifier().fit(train_X, train_y)
# print('Training classification error: ' + str(1 - grad_boost.score(train_X, train_y)))
# print('Validation classification error: ' + str(1 - grad_boost.score(test_X, test_y)))

Training classification error: 0.3572168508287292
Validation classification error: 0.3605210516288817


In [16]:
# Decision Tree Classifier - overfitting errors (TUNE)
# min_samples_leaf = np.arange(1, 26)

# for leaf in min_samples_leaf:
#     tree = DecisionTreeClassifier(min_samples_leaf=leaf, random_state=0)
#     tree.fit(train_X, train_y)
#     print('For minimum sample leaf of %d:' % leaf)
#     print('Training classification error: ' + str(1 - tree.score(train_X, train_y)))
#     print('Validation classification error: ' + str(1 - tree.score(test_X, test_y)))

In [None]:
# Random Forest
forest = RandomForestClassifier(random_state=0).fit(train_X, train_y)
print('Training classification error: ' + str(1 - forest.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - forest.score(test_X, test_y)))

# Predict probabilities on validation set
forest_pred = forest.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
forest_pred = pd.DataFrame(forest_pred[:,1], columns=['Predicted'])
forest_pred.insert(0, 'id', test_ids)
forest_pred.to_csv("forest_probs_submission.csv", index=False)

In [None]:
# num_est = [100, 300, 500, 750, 800, 1200]
# train_scoreNum, test_scoreNum = validation_curve(RandomForestClassifier(), 
#                                                  X=train_X, y=train_y, 
#                                                  param_name='n_estimators', 
#                                                  param_range=num_est)

In [None]:
n_estimators = [100, 300, 500, 800]
max_depth = [5, 8, 15, 25]
min_samples_split = [2, 5, 10, 15]
min_samples_leaf = [1, 2, 5, 10] 

parameters = {'n_estimators':[100, 300, 500, 800], 'max_depth':[5, 8, 15, 25],
              'min_samples_split':[2, 5, 10, 15], 'min_samples_leaf':[1, 2, 5, 10] }

clf = GridSearchCV(RandomForestClassifier(), parameters)
clf = clf.fit(train_X, train_y)
sorted(clf.cv_results_)

In [None]:
# Random Forest hyperparameter tuning
leaf_train_error = []
leaf_test_error = []
for i in range(1, 11):
    forest = RandomForestClassifier(random_state=0).fit(train_X, train_y)
    leaf_error.append(1 - forest.score(train_X, train_y))
    leaf_error.append(1 - forest.score(test_X, test_y))



print('Training classification error: ' + str(1 - forest.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - forest.score(test_X, test_y)))