In [21]:
from sklearn.model_selection import train_test_split, cross_val_score,  GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn import svm
from sklearn.svm import SVC
from itertools import combinations

import pandas as pd
import numpy as np
import sys

In [63]:
# Load the labelled quote data and derive the price-difference features.
featSet = pd.read_csv('labeled_data.csv')
featSet['outcome'] = featSet['outcome'].astype(int)

# Bid minus offer for the Best_* pair and for the Next_Best_* pair.
featSet['Spread'] = featSet['Best_Bid_Price'] - featSet['Best_Offer_Price']
featSet['Spread_L2'] = featSet['Next_Best_Bid'] - featSet['Next_Best_Offer']
# Best_* minus Next_Best_* on each side of the book.
# NOTE(review): if the Next_Best_* columns are quotes observed *after* the
# labelled interval, these two features leak the outcome -- confirm against
# the pipeline that produced labeled_data.csv.
featSet['Spread_Bid'] = featSet['Best_Bid_Price'] - featSet['Next_Best_Bid']
featSet['Spread_Offer'] = featSet['Best_Offer_Price'] - featSet['Next_Best_Offer']

# Split the data into a test and train set.
# A fixed random_state keeps the split, and therefore every downstream score,
# reproducible across kernel restarts.
train, test = train_test_split(featSet, test_size = 0.10, random_state = 42)
GlobalFeat = ['FA0', 'FB0', 'Spread']
train = train.dropna()
test = test.dropna()

# Class balance of the cleaned training set. Printed explicitly because a bare
# expression in the middle of a cell is never displayed.
print(train['outcome'].value_counts())

train

Unnamed: 0,last_interval,Exchange,Symbol,Best_Bid_Price,FB0,Best_Offer_Price,FA0,FB2,FA2,p_time,Next_Best_Bid,Next_Best_Offer,outcome,Spread,Spread_L2,Spread_Bid,Spread_Offer
672,2020-01-03 11:17:00,N,AAPL,298.52,4.0,298.55,3.0,2.0,0.0,2020-01-03 11:17:00.869643,298.42,298.46,-1,-0.03,-0.04,0.10,0.09
2381,2020-01-08 11:38:00,Q,AAPL,299.13,3.0,299.18,1.0,1.0,0.0,2020-01-08 11:38:00.037572,299.40,299.43,1,-0.05,-0.03,-0.27,-0.25
976,2020-01-03 16:21:00,Q,AAPL,297.72,1.0,297.75,4.0,0.0,-1.0,2020-01-03 16:21:42.243890,297.72,297.75,0,-0.03,-0.03,0.00,0.00
677,2020-01-03 11:22:00,Z,AAPL,298.49,5.0,298.51,2.0,0.0,-4.0,2020-01-03 11:22:00.176509,298.26,298.29,-1,-0.02,-0.03,0.23,0.22
2261,2020-01-08 09:38:00,Z,AAPL,298.29,1.0,298.33,2.0,-1.0,0.0,2020-01-08 09:38:00.003436,298.47,298.51,1,-0.04,-0.04,-0.18,-0.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,2020-01-06 16:48:00,P,AAPL,299.46,2.0,299.54,1.0,0.0,-3.0,2020-01-06 16:48:04.035050,299.50,299.57,0,-0.08,-0.07,-0.04,-0.03
2720,2020-01-08 17:29:00,K,AAPL,303.35,1.0,303.55,13.0,0.0,10.0,2020-01-08 17:29:13.213369,303.30,303.53,0,-0.20,-0.23,0.05,0.02
2313,2020-01-08 10:30:00,P,AAPL,298.46,1.0,298.48,3.0,0.0,2.0,2020-01-08 10:30:00.037915,298.41,298.44,-1,-0.02,-0.03,0.05,0.04
2545,2020-01-08 14:22:00,Q,AAPL,302.84,2.0,302.86,1.0,0.0,0.0,2020-01-08 14:22:00.827329,302.85,302.86,0,-0.02,-0.01,-0.01,0.00


In [41]:
# Correlation (pandas' default method) between one derived feature and the
# label, computed over the full, unsplit frame.
col1 = 'Spread_Offer'
col2 = 'outcome'
feature_series, label_series = featSet[col1], featSet[col2]
corr = feature_series.corr(label_series)
print(f'Correlation between {col1} and {col2} is: ', round(corr, 2))


Correlation between Spread_Offer and outcome is:  -0.49


In [3]:
def GetData(df, features, labels):
    """Pull the feature and label columns out of ``df`` as NumPy arrays.

    ``features`` and ``labels`` are each used to index ``df`` directly, so a
    list of column names yields a 2-D array and a single column name yields a
    1-D array.

    Returns
    -------
    tuple
        ``(X, y)``: fresh arrays built from ``df[features]`` and ``df[labels]``.
    """
    X, y = (np.array(df[columns].values) for columns in (features, labels))
    return X, y

In [39]:
# Single-feature baseline: predict `outcome` from `Spread_Bid` alone.
labels = ['outcome']
features = ['Spread_Bid']

# Array views of both splits for the chosen feature/label columns.
X_train, y_train = GetData(train, features, labels)
X_test, y_test = GetData(test, features, labels)

# Unfitted linear-kernel SVM with C=1.
clf = svm.SVC(C=1, kernel='linear')

In [39]:
# Fit a linear-kernel SVM (library-default C) on the training split.
# GetData returns the labels as an (n, 1) column when given a list of label
# names, so they are flattened to 1-D before fitting.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, np.ravel(y_train))

SVC(kernel='linear')

In [40]:
# Score the fitted baseline on the held-out split.
labels = ['outcome']
features = ['Spread_Bid']

X_test, y_test = GetData(test, features, labels)

# Predicted classes, then accuracy expressed as a percentage.
Z = clf.predict(X_test)
accuracy_pct = 100 * accuracy_score(y_test, Z)
print(accuracy_pct)


83.22981366459628


## $\color{red}{\text{Use GridSearch to Tune Radial Kernel Parameters}}$

In [66]:
# Every non-empty subset of GlobalFeat, smallest subsets first. The upper
# bound follows len(GlobalFeat), so no subset is dropped if more than nine
# features are ever listed; the flat comprehension also avoids the repeated
# list copying of sum(..., []).
combofallfeat = [
    combo
    for size in range(1, len(GlobalFeat) + 1)
    for combo in combinations(GlobalFeat, size)
]

# RBF-kernel search space: 5 values of C x 5 values of gamma.
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001],
              'kernel': ['rbf']}
labels = ['outcome']

def train_all(comboList, param_grid, labels = ['outcome']):
    """Grid-search an SVC over every feature combination and keep the best.

    For each entry of ``comboList`` a ``GridSearchCV(SVC(), param_grid,
    refit=True)`` is fitted on the module-level ``train`` frame. The search
    with the highest ``best_score_`` wins; on ties the earliest combination
    is kept. A combination that raises while fitting is reported on stdout
    and skipped.

    Note: a later cell in this notebook defines another ``train_all`` with a
    different signature, which replaces this one once that cell is executed.

    Parameters
    ----------
    comboList : sequence of sequences of str
        Feature-name combinations to try.
    param_grid : dict or list of dict
        Search space handed unchanged to ``GridSearchCV``.
    labels : list of str
        Label column(s) passed to ``GetData``. The default list is only read,
        never mutated.

    Returns
    -------
    tuple
        ``(best_grid, best_params, best_score, best_feat)``: the winning
        fitted search, its ``best_params_``, its ``best_score_`` and the
        feature list it was fitted on.

    Raises
    ------
    ValueError
        If ``comboList`` is empty or every combination failed to fit.
        Previously this case surfaced as an ``UnboundLocalError`` at the
        ``return`` statement.
    """
    # Initialised up front so the return statement can never hit an unbound name.
    best_grid = None
    best_params = None
    best_score = 0
    best_feat = None

    for count, item in enumerate(comboList, start=1):
        try:
            features = list(item)

            print(f'ON FEATURE {count} OF OUT {len(comboList)}')

            grid = GridSearchCV(SVC(), param_grid, refit = True)
            X_train, y_train = GetData(train, features, labels)

            # GetData yields an (n, 1) label column; the estimator wants 1-D.
            grid.fit(X_train, y_train.ravel())
            print("For %s, the best parameters are %s with a score of %0.2f" % (features, grid.best_params_, grid.best_score_))

            # `best_grid is None` lets the first successful fit win even when
            # its score is 0; the strict `<` keeps the earliest on ties.
            if best_grid is None or best_score < grid.best_score_:
                best_grid = grid
                best_params = grid.best_params_
                best_score = grid.best_score_
                best_feat = features

        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(e)
            print(f'ISSUE FITTING FEATURES: {item}')

    if best_grid is None:
        raise ValueError('train_all: no feature combination could be fitted')

    return best_grid, best_params, best_score, best_feat
 
# Run the search over every feature subset and unpack the winner.
best_search = train_all(combofallfeat, param_grid)
grid, paras, score, features = best_search

# Evaluate the refitted best estimator on the held-out split.
X_test, y_test = GetData(test, features, labels)
grid_predictions = grid.predict(X_test)

print(f'The best features/parameters were {features} and {paras}, with a score of {score}')
report = classification_report(y_test, grid_predictions)
print(report)
    

ON FEATURE 1 OF OUT 7
For ['FA0'], the best parameters are {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'} with a score of 0.43
ON FEATURE 2 OF OUT 7
For ['FB0'], the best parameters are {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'} with a score of 0.40
ON FEATURE 3 OF OUT 7
For ['Spread'], the best parameters are {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'} with a score of 0.52
ON FEATURE 4 OF OUT 7
For ['FA0', 'FB0'], the best parameters are {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'} with a score of 0.41
ON FEATURE 5 OF OUT 7
For ['FA0', 'Spread'], the best parameters are {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'} with a score of 0.51
ON FEATURE 6 OF OUT 7
For ['FB0', 'Spread'], the best parameters are {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'} with a score of 0.51
ON FEATURE 7 OF OUT 7
For ['FA0', 'FB0', 'Spread'], the best parameters are {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'} with a score of 0.51
The best features/parameters were ['Spread'] and {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}, 

## $\color{red}{\text{Cross-Validate to Build the Linear Kernel}}$

In [None]:
# Every non-empty subset of GlobalFeat, smallest subsets first. The upper
# bound follows len(GlobalFeat) rather than a hard-coded 10, so no subset is
# dropped if the feature list grows.
combofallfeat = [
    combo
    for size in range(1, len(GlobalFeat) + 1)
    for combo in combinations(GlobalFeat, size)
]

def train_all(comboList, labels = ['outcome']):
    """Fit a linear SVC per feature combination and keep the best one.

    For each entry of ``comboList`` a ``SVC(kernel='linear', C=1)`` is fitted
    on the module-level ``train`` frame and scored on ``test``, and a 10-fold
    cross-validated accuracy is computed on ``train``. The combination with
    the highest test-set accuracy wins; on ties the earliest one is kept. A
    combination that raises is reported on stdout and skipped.

    Note: this definition replaces any earlier ``train_all`` in the notebook.

    NOTE(review): the winner is chosen by its score on the held-out test
    split, so the returned out-of-sample score is an optimistic estimate;
    consider selecting on the cross-validated score instead.

    Parameters
    ----------
    comboList : sequence of sequences of str
        Feature-name combinations to try.
    labels : list of str
        Label column(s) passed to ``GetData``. The default list is only read,
        never mutated.

    Returns
    -------
    tuple
        ``(best_score, best_feat, best_crossVal)``: test-set accuracy as a
        percentage (0-100), the winning feature list, and that combination's
        mean cross-validated accuracy as a fraction (0-1).

    Raises
    ------
    ValueError
        If ``comboList`` is empty or every combination failed. Previously this
        case surfaced as an ``UnboundLocalError`` at the ``return`` statement.
    """
    # Initialised up front so the return statement can never hit an unbound name.
    best_score = 0
    best_feat = None
    best_crossVal = None

    for count, item in enumerate(comboList, start=1):
        try:
            features = list(item)

            print(f'ON FEATURE {count} OF OUT {len(comboList)}')

            X_train, y_train = GetData(train, features, labels)
            X_test, y_test = GetData(test, features, labels)
            # GetData yields an (n, 1) label column; flatten it once so that
            # both fit() and cross_val_score() receive 1-D targets.
            y_train_flat = y_train.ravel()

            clf = svm.SVC(kernel='linear', C=1)
            clf.fit(X_train, y_train_flat)

            scores = cross_val_score(svm.SVC(kernel='linear', C=1), X_train, y_train_flat, scoring='accuracy', cv = 10, n_jobs = -1)
            avg_score = sum(scores) / len(scores)

            Z = clf.predict(X_test)
            OOS_score = accuracy_score(y_test, Z)*100

            # The two numbers are on different scales: CV is 0-1, OOS is 0-100.
            print("For %s, the cross validated score is %0.2f, OOS is %0.2f" % (features, avg_score, OOS_score))

            # `best_feat is None` lets the first successful fit win even when
            # its score is 0; the strict `<` keeps the earliest on ties.
            if best_feat is None or best_score < OOS_score:
                best_score = OOS_score
                best_crossVal = avg_score
                best_feat = features

        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(e)
            print(f'ISSUE FITTING FEATURES: {item}')

    if best_feat is None:
        raise ValueError('train_all: no feature combination could be fitted')

    return best_score, best_feat, best_crossVal

    
# Evaluate every feature subset with the linear kernel and report the winner.
linear_result = train_all(combofallfeat)
score, features, validated = linear_result

print(f'The best features were {features}, with a OOS score of {score} and Validated score of {validated}')