In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [2]:
# Load in training and test set data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Shape and column labels of data sets
print("Training set data and column labels: ")
print(train.shape)
train_col = list(train.columns)
print(train_col)

print("\nTest set data and column labels: ")
print(test.shape)
test_col = list(test.columns)
print(test_col)

Training set data and column labels: 
(592380, 28)
['id', 'last_price', 'mid', 'opened_position_qty ', 'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1', 'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5', 'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol', 'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol', 'y']

Test set data and column labels: 
(191859, 27)
['id', 'last_price', 'mid', 'opened_position_qty ', 'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1', 'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5', 'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol', 'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol']


In [3]:
# Preview training set data
train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [4]:
# Preview test set data
test.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol
0,592380,5178.4,5178.3,1.0,11.0,12.0,-10,5178.2,5178.0,5177.8,...,3,16,3,1,1,1,4,1,5,2
1,592381,5133.0,5132.9,1.0,0.0,1.0,1,5132.8,5132.6,5132.4,...,1,2,2,1,10,4,8,1,1,1
2,592382,5177.4,5178.0,2.0,0.0,2.0,2,5177.2,5176.8,5176.6,...,8,18,1,1,6,7,4,1,2,5
3,592383,5093.6,5093.9,0.0,2.0,2.0,-1,5093.2,5093.0,5092.8,...,8,5,4,3,1,1,1,11,6,2
4,592384,5189.2,5189.2,,,0.0,0,5188.8,5188.6,5188.4,...,1,5,1,3,3,3,2,1,3,4


In [5]:
train = train.dropna()
#train_y = train['y']
#drop = ['id', 'y']

train = train.drop(['id'], axis=1)
# del train_col[0]
#del train_col[-1]

test_ids = test['id']
test = test.drop(['id'], axis=1)

# Replace NaN values with 0
test = test.fillna(0)

In [6]:
print(train.shape)
print(test.shape)

(419920, 27)
(191859, 26)


In [7]:
test.head()

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol
0,5178.4,5178.3,1.0,11.0,12.0,-10,5178.2,5178.0,5177.8,5177.6,...,3,16,3,1,1,1,4,1,5,2
1,5133.0,5132.9,1.0,0.0,1.0,1,5132.8,5132.6,5132.4,5132.2,...,1,2,2,1,10,4,8,1,1,1
2,5177.4,5178.0,2.0,0.0,2.0,2,5177.2,5176.8,5176.6,5176.4,...,8,18,1,1,6,7,4,1,2,5
3,5093.6,5093.9,0.0,2.0,2.0,-1,5093.2,5093.0,5092.8,5092.2,...,8,5,4,3,1,1,1,11,6,2
4,5189.2,5189.2,0.0,0.0,0.0,0,5188.8,5188.6,5188.4,5188.2,...,1,5,1,3,3,3,2,1,3,4


In [8]:
# Shuffle training set data
train_shuf = train.sample(frac=1, random_state=155155155)
n = int(train.shape[0] * 0.2)

train_val = train_shuf.iloc[0:n]
train = train_shuf.iloc[n:]
train.head()

# Training data set aside for validation
test_y = train_val['y']
test_X = train_val.drop(['y'], axis=1)

# Training data set for models
train_y = train['y']
train_X = train.drop(['y'], axis=1)

In [9]:
print(train.shape)

# # Normalize data
min_max_scaler = preprocessing.MinMaxScaler()
train_norm = min_max_scaler.fit_transform(train.to_numpy())
train = pd.DataFrame(train_norm)

test_norm = min_max_scaler.fit_transform(test.to_numpy())
test = pd.DataFrame(test_norm)

# train.head()


(335936, 27)


In [10]:
lin_reg = LinearRegression().fit(train_X, train_y)
print(lin_reg.score(train_X, train_y))
print(lin_reg.score(test_X, test_y))

0.05790713604831099
0.05858375736213484


In [11]:
# Logistic Regression 
log_reg = LogisticRegression().fit(train_X, train_y)
print('Training classification error: ' + str(1 - log_reg.score(train_X, train_y)))
print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

# Predict labels on validation set
lr_pred = log_reg.predict(test.to_numpy())
lr_pred = pd.DataFrame(lr_pred, columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_labels_submission.csv", index=False)

# Predict probabilities on validation set
lr_pred = log_reg.predict_proba(test.to_numpy())
# Probabilities of being labeled 1
lr_pred = pd.DataFrame(lr_pred[:,1], columns=['Predicted'])
lr_pred.insert(0, 'id', test_ids)
lr_pred.to_csv("lr_probs_submission.csv", index=False)

Training classification error: 0.36950788245380073
Validation classification error: 0.37060630596304056


In [12]:
lr_pred.shape

(191859, 2)

In [13]:
# for i in range(9):
#     C = 0.00001 * 10**i
#     log_reg = LogisticRegression(C=C).fit(train_X, train_y)
#     print('For C = %0.5f' % C)
#     print('Training classification error: ' + str(1 - log_reg.score(train_X, train_y)))
#     print('Validation classification error: ' + str(1 - log_reg.score(test_X, test_y)))

In [14]:
# SGD Regressor - high errors
sgd_reg = SGDRegressor(early_stopping=True)
sgd_reg.fit(train_X, train_y)
print(sgd_reg.score(train_X, train_y))
print(sgd_reg.score(test_X, test_y))

-5.972392848653097e+33
-5.966178746455632e+33


In [15]:
# # GradientBoostingRegressor - high errors
# grad_boost = GradientBoostingRegressor().fit(train_X, train_y)
# print('Training classification error: ' + str(1 - grad_boost.score(train_X, train_y)))
# print('Validation classification error: ' + str(1 - grad_boost.score(test_X, test_y)))

In [16]:
# min_samples_leaf = np.arange(1, 26)

# for leaf in min_samples_leaf:
#     tree = DecisionTreeRegressor(min_samples_leaf=leaf, random_state=0)
#     tree.fit(train_X, train_y)
#     print('For minimum sample leaf of %d:' % leaf)
#     print('Training classification error: ' + str(1 - tree.score(train_X, train_y)))
#     print('Validation classification error: ' + str(1 - tree.score(test_X, test_y)))

In [17]:
# # Random Forest - hella overfitting
# forest = DecisionTreeRegressor(random_state=0).fit(train_X, train_y)
# print('Training classification error: ' + str(1 - forest.score(train_X, train_y)))
# print('Validation classification error: ' + str(1 - forest.score(test_X, test_y)))

In [19]:
# PCA
for i in range(26):
    pca = PCA(n_components=i+1)
    pca.fit(train)
    M_train = pca.fit_transform(train_X)
    #pca = PCA(n_components=4)
    pca.fit(test)
    M_test = pca.fit_transform(test_X)
    log_reg = LogisticRegression().fit(M_train, train_y)
    print('Training classification error: ' + str(1 - log_reg.score(M_train, train_y)))
    print('Validation classification error: ' + str(1 - log_reg.score(M_test, test_y)))

Training classification error: 0.37113319203657835
Validation classification error: 0.3720113354924748
Training classification error: 0.37113319203657835
Validation classification error: 0.3720113354924748
Training classification error: 0.371136168794056
Validation classification error: 0.3720113354924748
Training classification error: 0.3711421223090112
Validation classification error: 0.3720113354924748
Training classification error: 0.37109449418936935
Validation classification error: 0.3720113354924748
Training classification error: 0.3711629596113546
Validation classification error: 0.37220184797104205
Training classification error: 0.37128500666793673
Validation classification error: 0.37191607925319103
Training classification error: 0.37155589159839975
Validation classification error: 0.3721661268813107
Training classification error: 0.3715410078110116
Validation classification error: 0.3724757096589827
Training classification error: 0.3722554296056392
Validation classification 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3691060201943227
Validation classification error: 0.4017312821489808


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3613307296627929
Validation classification error: 0.4220565822061345


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.36167305677271866
Validation classification error: 0.42165174318917886


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3607621689845685
Validation classification error: 0.42138978853114883


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.36083361116403123
Validation classification error: 0.42244951419317966


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.36050021432653834
Validation classification error: 0.42344970470565824


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3613337064202705
Validation classification error: 0.4229138883596876


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3614587302343304
Validation classification error: 0.41755572489998094


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3598631882263288
Validation classification error: 0.42323537816726997


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training classification error: 0.3598631882263288
Validation classification error: 0.42323537816726997
Training classification error: 0.3598631882263288
Validation classification error: 0.42323537816726997


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
