# Trading Dataset: S&P 500

In [5]:
import numpy as np
import pandas as pd
import sklearn as sklearn
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import grid_search
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Importing Dataset

In [6]:
mydataset = pd.read_csv('dataset1.csv')
spy = mydataset.iloc[:,:].values
print(spy)
print(type(spy))
spy.shape[0]

[['2000-02-08' 0.01577 0.00949 ... 0.0567 -0.001 0]
 ['2000-02-09' 0.01594 0.01207 ... 0.0576 -0.0013 -1]
 ['2000-02-10' -0.00292 -0.007109999999999999 ... 0.0579 0.0 0]
 ...
 ['2018-11-07' 0.01938 0.006809999999999999 ... 0.0219 0.0026 1]
 ['2018-11-08' 0.02278 0.01455 ... 0.0219 0.0026 0]
 ['2018-11-09' 0.01438 0.0094 ... 0.022000000000000002 0.0025 0]]
<class 'numpy.ndarray'>


4725

In [7]:
features=spy[:,1:-1]
print(features)
features.shape
type(features)

[[0.01577 0.00949 0.00074 ... 0.0225 0.0567 -0.001]
 [0.01594 0.01207 0.00027 ... 0.0225 0.0576 -0.0013]
 [-0.00292 -0.007109999999999999 -0.00064 ... 0.0225 0.0579 0.0]
 ...
 [0.01938 0.006809999999999999 0.0 ... -0.004 0.0219 0.0026]
 [0.02278 0.01455 9e-05 ... -0.004 0.0219 0.0026]
 [0.01438 0.0094 0.0 ... -0.004 0.022000000000000002 0.0025]]


numpy.ndarray

In [8]:
output=spy[:,13]
print(output)
output.shape
type(output)

[0 -1 0 ... 1 0 0]


numpy.ndarray

# Splitting Dataset

In [9]:
#4-FOLD FORWARD CHAINING
#70% train, 30% validation
#https://towardsdatascience.com/time-series-nested-cross-validation-76adba623eb9
#https://machinelearningmastery.com/backtest-machine-learning-models-time-series-forecasting/

from sklearn.model_selection import train_test_split

#FOLD 1
x1 = features[0:1725,:]
y1 = output[0:1725]
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.20, train_size=0.80)
x_train1 = x_train1.astype('float')
y_train1 = y_train1.astype('int')
x_test1 = x_train1.astype('float')
y_test1 = y_train1.astype('int')
print('FOLD 1 train feature shape:',x_train1.shape)
print('FOLD 1 train output shape:',y_train1.shape)
print('FOLD 1 test feature shape:',x_test1.shape)
print('FOLD 1 test output shape:',y_test1.shape)


#FOLD 2
x2 = features[0:2725,:]
y2 = output[0:2725]
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.20, train_size=0.80)
x_train2 = x_train2.astype('float')
y_train2 = y_train2.astype('int')
x_test2 = x_train2.astype('float')
y_test2 = y_train2.astype('int')
print('FOLD 2 train feature shape:',x_train2.shape)
print('FOLD 2 train output shape:',y_train2.shape)
print('FOLD 2 test feature shape:',x_test2.shape)
print('FOLD 2 test output shape:',y_test2.shape)


#FOLD 3
x3 = features[0:3725,:]
y3 = output[0:3725]
x_train3, x_test3, y_train3, y_test3 = train_test_split(x3, y3, test_size=0.20, train_size=0.80)
x_train3 = x_train3.astype('float')
y_train3 = y_train3.astype('int')
x_test3 = x_train3.astype('float')
y_test3 = y_train3.astype('int')
print('FOLD 3 train feature shape:',x_train3.shape)
print('FOLD 3 train output shape:',y_train3.shape)
print('FOLD 3 test feature shape:',x_test3.shape)
print('FOLD 3 test output shape:',y_test3.shape)


#FOLD 4
x4 = features[0:4725,:]
y4 = output[0:4725]
x_train4, x_test4, y_train4, y_test4 = train_test_split(x4, y4, test_size=0.20, train_size=0.80)
x_train4 = x_train4.astype('float')
y_train4 = y_train4.astype('int')
x_test4 = x_train4.astype('float')
y_test4 = y_train4.astype('int')
print('FOLD 4 train feature shape:',x_train4.shape)
print('FOLD 4 train output shape:',y_train4.shape)
print('FOLD 4 test feature shape:',x_test4.shape)
print('FOLD 4 test output shape:',y_test4.shape)




FOLD 1 train feature shape: (1380, 12)
FOLD 1 train output shape: (1380,)
FOLD 1 test feature shape: (1380, 12)
FOLD 1 test output shape: (1380,)
FOLD 2 train feature shape: (2180, 12)
FOLD 2 train output shape: (2180,)
FOLD 2 test feature shape: (2180, 12)
FOLD 2 test output shape: (2180,)
FOLD 3 train feature shape: (2980, 12)
FOLD 3 train output shape: (2980,)
FOLD 3 test feature shape: (2980, 12)
FOLD 3 test output shape: (2980,)
FOLD 4 train feature shape: (3780, 12)
FOLD 4 train output shape: (3780,)
FOLD 4 test feature shape: (3780, 12)
FOLD 4 test output shape: (3780,)


# RANDOM FOREST

In [39]:
rfc = RandomForestClassifier()

#FOLD1
parameter_rfc1 = {'n_estimators':[10, 200], 
                   'criterion':('gini', 'entropy'),'max_depth':[1,20], 
                   'min_samples_split':[2,20], 
                   'min_samples_leaf':[1,20], 
                   'max_features':('auto','sqrt','log2'),
                   'min_impurity_decrease':[0.0001,0.9999]}


CV_rfc1 = GridSearchCV(rfc, parameter_rfc1, cv = 5)
CV_rfc1.fit(x_train1, y_train1)
print("Best parameters for Random forest 1: ",CV_rfc1.best_params_)

rfc1 = RandomForestClassifier(criterion = 'gini',
                              max_depth = 20,
                              max_features = 'auto',
                              min_impurity_decrease = 0.0001,
                              min_samples_leaf = 20,
                              min_samples_split = 20,
                              n_estimators = 200)
rfc1.fit(x_train1, y_train1)
predrfc1 = rfc1.predict(x_test1)
print("Test accuracy for Random Forest on FOLD 1: ",accuracy_score(y_test1,predrfc1))



#FOLD2
parameter_rfc2 = {'n_estimators':[10, 200], 
                   'criterion':('gini', 'entropy'),'max_depth':[1,20], 
                   'min_samples_split':[2,20], 
                   'min_samples_leaf':[1,20], 
                   'max_features':('auto','sqrt','log2'),
                   'min_impurity_decrease':[0.0001,0.9999]}


CV_rfc2 = GridSearchCV(rfc, parameter_rfc2, cv = 5)
CV_rfc2.fit(x_train2, y_train2)
print("Best parameters for Random forest 2: ",CV_rfc2.best_params_)

rfc2 = RandomForestClassifier(criterion = 'gini',
                              max_depth = 20,
                              max_features = 'auto',
                              min_impurity_decrease = 0.0001,
                              min_samples_leaf = 20,
                              min_samples_split = 2,
                              n_estimators = 200)
rfc2.fit(x_train2, y_train2)
predrfc2 = rfc2.predict(x_test2)
print("Test accuracy for Random Forest on FOLD 2: ",accuracy_score(y_test2,predrfc2))



#FOLD3
parameter_rfc3 = {'n_estimators':[10, 200], 
                   'criterion':('gini', 'entropy'),'max_depth':[1,20], 
                   'min_samples_split':[2,20], 
                   'min_samples_leaf':[1,20], 
                   'max_features':('auto','sqrt','log2'),
                   'min_impurity_decrease':[0.0001,0.9999]}


CV_rfc3 = GridSearchCV(rfc, parameter_rfc3, cv = 5)
CV_rfc3.fit(x_train3, y_train3)
print("Best parameters for Random forest 3: ",CV_rfc3.best_params_)

rfc3 = RandomForestClassifier(criterion = 'gini',
                              max_depth = 20,
                              max_features = 'auto',
                              min_impurity_decrease = 0.0001,
                              min_samples_leaf = 20,
                              min_samples_split = 2,
                              n_estimators = 10)
rfc3.fit(x_train3, y_train3)
predrfc3 = rfc3.predict(x_test3)
print("Test accuracy for Random Forest on FOLD 3: ",accuracy_score(y_test3,predrfc3))



#FOLD4
parameter_rfc4 = {'n_estimators':[10, 200], 
                   'criterion':('gini', 'entropy'),'max_depth':[1,20], 
                   'min_samples_split':[2,20], 
                   'min_samples_leaf':[1,20], 
                   'max_features':('auto','sqrt','log2'),
                   'min_impurity_decrease':[0.0001,0.9999]}

CV_rfc4 = GridSearchCV(rfc, parameter_rfc4, cv = 5)
CV_rfc4.fit(x_train4, y_train4)
print("Best parameters for Random forest 4: ",CV_rfc4.best_params_)

rfc4 = RandomForestClassifier(criterion = 'entropy',
                              max_depth = 20,
                              max_features = 'auto',
                              min_impurity_decrease = 0.0001,
                              min_samples_leaf = 20,
                              min_samples_split = 2,
                              n_estimators = 200)
rfc4.fit(x_train4, y_train4)
predrfc4 = rfc4.predict(x_test4)
print("Accuracy for Random Forest on FOLD 4: ",accuracy_score(y_test4,predrfc4))


#AVERAGES
score_test_array = np.array([accuracy_score(y_test1,predrfc1),accuracy_score(y_test2,predrfc2), accuracy_score(y_test3,predrfc3), accuracy_score(y_test4,predrfc4)])
mean_score_test = np.mean(score_test_array) 
print('Average accuracy on test set:',mean_score_test)


Best parameters for Random forest 1:  {'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 20, 'min_samples_split': 2, 'n_estimators': 200}
Test accuracy for Random Forest on FOLD 1:  0.6224637681159421
Best parameters for Random forest 2:  {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 20, 'min_samples_split': 2, 'n_estimators': 200}
Test accuracy for Random Forest on FOLD 2:  0.631651376146789
Best parameters for Random forest 3:  {'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 20, 'min_samples_split': 2, 'n_estimators': 200}
Test accuracy for Random Forest on FOLD 3:  0.5956375838926175
Best parameters for Random forest 4:  {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_impurity_decrease': 0.0001, 'min_samples_leaf': 20, 'min_samples_split': 20, 'n_estimators': 2

# LOGISTIC REGRESSION

In [10]:
logistic = LogisticRegression()

#FOLD1
parameter_logistic1 = {'tol':[0.00001, 0.01], 
                       'C':[0.1, 10], 
                       'solver':('newton-cg', 'sag', 'lbfgs')}

CV_logistic1 = GridSearchCV(logistic, parameter_logistic1, cv = 5)
CV_logistic1.fit(x_train1, y_train1)
print("Best parameters for Logistic Regression 1: ", CV_logistic1.best_params_)

logistic1 = LogisticRegression(C = 10, solver = 'sag', tol= 0.01)
logistic1.fit(x_train1, y_train1)
predlog1 = logistic1.predict(x_test1)
print("Accuracy for Logistic Regression on FOLD 1: ",accuracy_score(y_test1,predlog1))


#FOLD2
parameter_logistic2 = {'tol':[0.00001, 0.01], 
                       'C':[0.1, 10], 
                       'solver':('newton-cg', 'sag', 'lbfgs')}

CV_logistic2 = GridSearchCV(logistic, parameter_logistic2, cv = 5)
CV_logistic2.fit(x_train2, y_train2)
print("Best parameters for Logistic Regression 2: ", CV_logistic2.best_params_)

logistic2 = LogisticRegression(C = 10, solver = 'sag', tol = 0.01)
logistic2.fit(x_train2, y_train2)
predlog2 = logistic2.predict(x_test2)
print("Accuracy for Logistic Regression on FOLD 2: ",accuracy_score(y_test2,predlog2))

#FOLD3
parameter_logistic3 = {'tol':[0.00001, 0.01], 
                       'C':[0.1, 10], 
                       'solver':('newton-cg', 'sag', 'lbfgs')}

CV_logistic3 = GridSearchCV(logistic, parameter_logistic3, cv = 5)
CV_logistic3.fit(x_train3, y_train3)
print("Best parameters for Logistic Regression 3: ", CV_logistic3.best_params_)

logistic3 = LogisticRegression(C = 10, solver = 'sag', tol= 0.01)
logistic3.fit(x_train3, y_train3)
predlog3 = logistic3.predict(x_test3)
print("Accuracy for Logistic Regression on FOLD 3: ",accuracy_score(y_test3,predlog3))


#FOLD4
parameter_logistic4 = {'tol':[0.00001, 0.01], 
                       'C':[0.1, 10], 
                       'solver':('newton-cg', 'sag', 'lbfgs')}
CV_logistic4 = GridSearchCV(logistic, parameter_logistic4, cv = 5)
CV_logistic4.fit(x_train4, y_train4)
print("Best parameters for Logistic Regression 4: ", CV_logistic4.best_params_)

logistic4 = LogisticRegression(C = 10, solver = 'sag', tol= 0.01)
logistic4.fit(x_train4, y_train4)
predlog4 = logistic4.predict(x_test4)
print("Accuracy for Logistic Regression on FOLD 4: ",accuracy_score(y_test4,predlog4))

#AVERAGES
score_test_array = np.array([accuracy_score(y_test1,predlog1),accuracy_score(y_test2,predlog2), accuracy_score(y_test3,predlog3), accuracy_score(y_test4,predlog4)])
mean_score_test = np.mean(score_test_array) 
print('Average accuracy on test set:',mean_score_test)



Best parameters for Logistic Regression 1:  {'C': 10, 'solver': 'sag', 'tol': 0.01}
Accuracy for Logistic Regression on FOLD 1:  0.4855072463768116
Best parameters for Logistic Regression 2:  {'C': 10, 'solver': 'newton-cg', 'tol': 0.01}
Accuracy for Logistic Regression on FOLD 2:  0.47706422018348627
Best parameters for Logistic Regression 3:  {'C': 10, 'solver': 'newton-cg', 'tol': 1e-05}
Accuracy for Logistic Regression on FOLD 3:  0.5191275167785235
Best parameters for Logistic Regression 4:  {'C': 10, 'solver': 'newton-cg', 'tol': 1e-05}
Accuracy for Logistic Regression on FOLD 4:  0.5574074074074075
Average accuracy on test set: 0.5097765976865571


# SVM

In [11]:
svm = SVC()

#FOLD1
parameter_svm1 = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 
                  'C':[0.1, 10],
                  'gamma':[0.01,0.9]}

CV_svm1 = GridSearchCV(svm, parameter_svm1, cv = 2)
CV_svm1.fit(x_train1, y_train1)
print("Best parameters for SVM1: ", CV_svm1.best_params_)
print("Best score for SVM1: ", CV_svm1.best_score_)

svm1 = SVC(C = 0.1, kernel = 'rbf', gamma = 0.9, decision_function_shape = 'ovo')
svm1.fit(x_train1, y_train1)
predsvm1 = svm1.predict(x_test1)
print("Accuracy for SVM on FOLD 1: ",accuracy_score(y_test1,predsvm1))



#FOLD2
parameter_svm2 = {'kernel':('linear', 'rbf', 'poly'), 
                  'C':[0.1, 10]}

CV_svm2 = GridSearchCV(svm, parameter_svm2, cv = 5)
CV_svm2.fit(x_train2, y_train2)
print("Best parameters for SVM2: ", CV_svm2.best_params_)

svm2 = SVC(C = 10, kernel = 'linear')
svm2.fit(x_train2, y_train2)
predsvm2 = svm2.predict(x_test2)
print("Accuracy for SVM on FOLD 2: ",accuracy_score(y_test2,predsvm2))

#FOLD3
parameter_svm3 = {'kernel':('linear', 'rbf', 'poly'), 
                  'C':[1, 10]}

CV_svm3 = GridSearchCV(svm, parameter_svm3, cv = 5)
CV_svm3.fit(x_train3, y_train3)
print("Best parameters for SVM3: ", CV_svm3.best_params_)

svm3 = SVC(C = 1, kernel = 'linear')
svm3.fit(x_train3, y_train3)
predsvm3 = svm2.predict(x_test3)
print("Accuracy for SVM on FOLD 3: ",accuracy_score(y_test3,predsvm3))

#FOLD4
parameter_svm4 = {'kernel':('linear', 'rbf', 'poly'), 
                  'C':[1, 10]}

CV_svm4 = GridSearchCV(svm, parameter_svm4, cv = 5)
CV_svm4.fit(x_train4, y_train4)
print("Best parameters for SVM4: ", CV_svm4.best_params_)

svm4 = SVC(C = 1, kernel = 'linear')
svm4.fit(x_train4, y_train4)
predsvm4 = svm4.predict(x_test4)
print("Accuracy for SVM on FOLD 4: ",accuracy_score(y_test4,predsvm4))


#AVERAGES
score_test_array = np.array([accuracy_score(y_test1,predsvm1),accuracy_score(y_test2,predsvm2), accuracy_score(y_test3,predsvm3), accuracy_score(y_test4,predsvm4)])
mean_score_test = np.mean(score_test_array) 
print('Average accuracy on test set:',mean_score_test)


Best parameters for SVM1:  {'C': 10, 'gamma': 0.9, 'kernel': 'rbf'}
Best score for SVM1:  0.46304347826086956
Accuracy for SVM on FOLD 1:  0.46159420289855074
Best parameters for SVM2:  {'C': 10, 'kernel': 'linear'}
Accuracy for SVM on FOLD 2:  0.4701834862385321
Best parameters for SVM3:  {'C': 10, 'kernel': 'linear'}
Accuracy for SVM on FOLD 3:  0.5130872483221477
Best parameters for SVM4:  {'C': 1, 'kernel': 'linear'}
Accuracy for SVM on FOLD 4:  0.5465608465608466
Average accuracy on test set: 0.4978564460050193


# NEURAL NETWORK

In [42]:
#from sklearn.neural_network import MLPClassifier

#FOLD1
#parameter_NN1 = {'hidden_layer_sizes':[1, 10],'activation':('identity', 'relu','logistic', 'tanh'),
                # 'solver':('lbfgs', 'sgd', 'adam'), 'alpha':[0.00001,0.01], 'batch_size':[10,100],
                #'learning_rate':('constant','invscaling','adaptive'), 'learning_rate_init':[0.00001,0.01],
                # 'tol':[0.000001,0.01]}
#NN1_train = MLPClassifier()
#NN1_train.fit(x_train1, y_train1)
#NN1_valid = GridSearchCV(NN1_train, parameter_NN1)
#NN1_valid.fit(x_valid1, y_valid1)
#SVM1_valid.fit(x_train1, y_train1)
#print(NN1_valid.best_params_)
#NN1_score_train1 = NN1_train.score(x_train1, y_train1)
#NN1_score_valid1 = NN1_valid.score(x_valid1, y_valid1)
#NN1_score_test1 = NN1_valid.score(x_test1, y_test1)
#print('Accuracy on the training set 1: {:.3f}'.format(NN1_score_train1))
#print('Accuracy on the validation set 1: {:.3f}'.format(NN1_score_valid1))
#print('Accuracy on the test set 1: {:.3f}'.format(NN1_score_test1))
#print(NN1.predict(x_test1))