In [1]:
# First we import all our required libraries for data manipulation and ML implementation
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
# In this new workbook we need to import our datasets again and get our target
reduced_megaset = pd.read_csv('reduced_megaset.csv')
SP500 = pd.read_csv('SP500_2.csv', parse_dates=True)
target_raw = (SP500['Adj Close'].shift(-1)/SP500['Adj Close'])-1

In [3]:
# Label encode our target variable, 1 for increase, 0 for decrease/no change
target = target_raw[21:]
target[target > 0] = 1
target[target <= 0] = 0
print("Our target has {} values".format(target.shape[0]))

Our target has 6232 values


In [4]:
# We split our megaset into training and cross-validation (test) subsets
X_train = reduced_megaset[:6001]
X_test = reduced_megaset[6001:-1]
y_train = target[:6001]
y_test = target[6001:-1]
print("Our training set has shape {}, {}".format(X_train.shape[0], X_train.shape[1]))
print("Our training set target is {} long".format(y_train.shape[0]))
print("Our testing set has shape {}, {}".format(X_test.shape[0], X_test.shape[1]))
print("Our testing set target is {} long".format(y_test.shape[0]))

Our training set has shape 6001, 10
Our training set target is 6001 long
Our testing set has shape 230, 10
Our testing set target is 230 long


In [5]:
# We train SVM classifiers with two different kernels
"""Support Vector Classifier with Linear Kernel"""
clf1 = svm.SVC(kernel = 'linear')
clf1.fit(X_train, y_train)
clf1_predictions = clf1.predict(X_test)
clf1_accuracy = accuracy_score(y_test, clf1_predictions)
clf1_f1 = f1_score(y_test, clf1_predictions)
print("SVM Linear: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf1_accuracy, clf1_f1))

"""Support Vector Classifier with RBF Kernel"""
clf2 = svm.SVC(kernel = 'rbf')
clf2.fit(X_train, y_train)
clf2_predictions = clf2.predict(X_test)
clf2_accuracy = accuracy_score(y_test, clf2_predictions)
clf2_f1 = f1_score(y_test, clf2_predictions)
print("SVM RBF: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf2_accuracy, clf2_f1))

SVM Linear: [Accuracy: 0.5826, f1-score: 0.7363]
SVM RBF: [Accuracy: 0.5826, f1-score: 0.7363]


In [6]:
# We 'train' a k-Nearest Neighbours model
clf3 = KNeighborsClassifier(n_neighbors = 3)
clf3.fit(X_train, y_train)
clf3_predictions = clf3.predict(X_test)
clf3_accuracy = accuracy_score(y_test, clf3_predictions)
clf3_f1 = f1_score(y_test, clf3_predictions)
print("kNN: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf3_accuracy, clf3_f1))

kNN: [Accuracy: 0.5391, f1-score: 0.5760]


In [7]:
# We train our decision tree and random forest classifiers
"""Decision Tree Classifier"""
clf4 = tree.DecisionTreeClassifier()
clf4.fit(X_train, y_train)
clf4_predictions = clf4.predict(X_test)
clf4_accuracy = accuracy_score(y_test, clf4_predictions)
clf4_f1 = f1_score(y_test, clf4_predictions)
print("Decision Tree: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf4_accuracy, clf4_f1))

"""Random Forest Classifier"""
clf5 = RandomForestClassifier(n_estimators=10)
clf5.fit(X_train, y_train)
clf5_predictions = clf4.predict(X_test)
clf5_accuracy = accuracy_score(y_test, clf5_predictions)
clf5_f1 = f1_score(y_test, clf5_predictions)
print("Random Forest Classifier: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf5_accuracy, clf5_f1))

Decision Tree: [Accuracy: 0.4870, f1-score: 0.5317]
Random Forest Classifier: [Accuracy: 0.4870, f1-score: 0.5317]


In [8]:
# We train our AdaBoost Classifier with two different base estimators
"""AdaBoost Classifier with DecisionTree base"""
clf6a = AdaBoostClassifier(n_estimators=100)
clf6a.fit(X_train, y_train)
clf6a_predictions = clf6a.predict(X_test)
clf6a_accuracy = accuracy_score(y_test, clf6a_predictions)
clf6a_f1 = f1_score(y_test, clf6a_predictions)
print("AdaBoost Classifier DT: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf6a_accuracy, clf6a_f1))

"""AdaBoost Classifier with NaiveBayes base"""
clf6b = AdaBoostClassifier(n_estimators=100, base_estimator=GaussianNB())
clf6b.fit(X_train, y_train)
clf6b_predictions = clf6b.predict(X_test)
clf6b_accuracy = accuracy_score(y_test, clf6b_predictions)
clf6b_f1 = f1_score(y_test, clf6b_predictions)
print("AdaBoost Classifier NB: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf6b_accuracy, clf6b_f1))

AdaBoost Classifier DT: [Accuracy: 0.5913, f1-score: 0.7152]
AdaBoost Classifier NB: [Accuracy: 0.5783, f1-score: 0.7298]


In [9]:
# We train our Gradient Boosting Classifier
"""Gradient Boosting Classifier"""
clf7 = GradientBoostingClassifier(n_estimators=100)
clf7.fit(X_train, y_train)
clf7_predictions = clf7.predict(X_test)
clf7_accuracy = accuracy_score(y_test, clf7_predictions)
clf7_f1 = f1_score(y_test, clf7_predictions)
print("Gradient Boosting Classifier: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf7_accuracy, clf7_f1))

Gradient Boosting Classifier: [Accuracy: 0.5348, f1-score: 0.6748]


In [10]:
# We optimize the above models with GridSearch using TimeSeriesSplit as cross-validation function
"""
For this next part, since it is very time consuming to actually do a GridSearch I have only put down the general codes used
to optimize the algorithms. We will not see improved scores.
"""
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score, make_scorer

In [None]:
"""Optimization of SVC"""
# Choose the time series cross-validator
tscv = TimeSeriesSplit(n_splits=10)
# Choose SVC as the algorithm for optimization with GridSearch
clf8 = svm.SVC(random_state = 0)
# Create a dictionary for the parameters
parameters_SVC = {'kernel':('linear', 'rbf', 'poly'), 'C':[0.001, 0.01, 0.1], 'degree':[200, 400, 600, 800]}
# Define a scoring function
scorer = make_scorer(f1_score)
# Create the GridSearch object"""
grid_obj_SVC = GridSearchCV(estimator=clf8, param_grid=parameters_SVC, scoring=scorer, cv=tscv)
# Fit the grid search object to the data to compute the optimal model
grid_fit_SVC = grid_obj_SVC.fit(X_train, y_train)
# Return the optimal model after fitting the data
best_clf_SVC = grid_fit_SVC.best_estimator_
# Make predictions with the optimal model
best_predictions_SVC = best_clf_SVC.predict(X_test)
# Get the accuracy and f1_score of the optimized model
clf8_optimized_accuracy = accuracy_score(y_test, best_predictions_SVC)
clf8_optimized_f1 = f1_score(y_test, best_predictions_SVC)
print("SVC Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf8_optimized_accuracy, clf8_optimized_f1))

"""Optimization of decision tree"""
# Choose DT as the algorithm for optimization with GridSearch
clf9 = tree.DecisionTreeClassifier()
# Create a dictionary for the parameters
parameters_DT = {'criterion':('gini', 'entropy')}
# Define a scoring function
scorer = make_scorer(f1_score)
# Create the GridSearch object"""
grid_obj_DT = GridSearchCV(estimator=clf9, param_grid=parameters_DT, scoring=scorer, cv=tscv)
# Fit the grid search object to the data to compute the optimal model
grid_fit_DT = grid_obj_DT.fit(X_train, y_train)
# Return the optimal model after fitting the data
best_clf_DT = grid_fit_DT.best_estimator_
# Make predictions with the optimal model
best_predictions_DT = best_clf_DT.predict(X_test)
# Get the accuracy and f1_score of the optimized model
clf9_optimized_accuracy = accuracy_score(y_test, best_predictions_DT)
clf9_optimized_f1 = f1_score(y_test, best_predictions_DT)
print("Decidion Tree Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf9_optimized_accuracy, clf9_optimized_f1))

"""Optimization of random forest"""
# Choose RF as the algorithm for optimization with GridSearch
clf10 = RandomForestClassifier()
# Create a dictionary for the parameters
parameters_RF = {'n_estimators':[5, 10, 20], 'criterion':('gini', 'entropy')}
# Define a scoring function
scorer = make_scorer(f1_score)
# Create the GridSearch object"""
grid_obj_RF = GridSearchCV(estimator=clf10, param_grid=parameters_RF, scoring=scorer, cv=tscv)
# Fit the grid search object to the data to compute the optimal model
grid_fit_RF = grid_obj_RF.fit(X_train, y_train)
# Return the optimal model after fitting the data
best_clf_RF = grid_fit_RF.best_estimator_
# Make predictions with the optimal model
best_predictions_RF = best_clf_RF.predict(X_test)
# Get the accuracy and f1_score of the optimized model
clf10_optimized_accuracy = accuracy_score(y_test, best_predictions_RF)
clf10_optimized_f1 = f1_score(y_test, best_predictions_RF)
print("Decidion Tree Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf10_optimized_accuracy, clf10_optimized_f1))

"""Optimization of kNN"""
# Choose the time series cross-validator
tscv = TimeSeriesSplit(n_splits=3)
# Choose kNN as the algorithm for optimization with GridSearch
clf11 = KNeighborsClassifier()
# Create a dictionary for the parameters
parameters_kNN = {'weights':('uniform', 'distance'), 'n_neighbors':[3, 5, 7, 10, 25, 50, 100]}
# Define a scoring function
scorer = make_scorer(f1_score)
# Create the GridSearch object"""
grid_obj_kNN = GridSearchCV(estimator=clf11, param_grid=parameters_kNN, scoring=scorer, cv=tscv)
# Fit the grid search object to the data to compute the optimal model
grid_fit_kNN = grid_obj_kNN.fit(X_train, y_train)
# Return the optimal model after fitting the data
best_clf_kNN = grid_fit_kNN.best_estimator_
# Make predictions with the optimal model
best_predictions_kNN = best_clf_kNN.predict(X_test)
# Get the accuracy and f1_score of the optimized model
clf11_optimized_accuracy = accuracy_score(y_test, best_predictions_kNN)
clf11_optimized_f1 = f1_score(y_test, best_predictions_kNN)
print("kNN Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf11_optimized_accuracy, clf11_optimized_f1))

"""Optimization of AdaBoost with DecisionTree"""
# Choose the time series cross-validator
tscv = TimeSeriesSplit(n_splits=3)
# Choose AdaBoost as the algorithm for optimization with GridSearch
clf12 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier())
# Create a dictionary for the parameters
parameters_AdaBoost1 = {'n_estimators':[3, 5, 7, 10, 25, 50, 75, 100], 'learning_rate':[0.01, 0.05, 0.1, 0.5, 1]}
# Define a scoring function
scorer = make_scorer(f1_score)
# Create the GridSearch object"""
grid_obj_AdaBoost1 = GridSearchCV(estimator=clf12, param_grid=parameters_AdaBoost1, scoring=scorer, cv=tscv)
# Fit the grid search object to the data to compute the optimal model
grid_fit_AdaBoost1 = grid_obj_AdaBoost1.fit(X_train, y_train)
# Return the optimal model after fitting the data
best_clf_AdaBoost1 = grid_fit_AdaBoost1.best_estimator_
# Make predictions with the optimal model
best_predictions_AdaBoost1 = best_clf_AdaBoost1.predict(X_test)
# Get the accuracy and f1_score of the optimized model
clf12_optimized_accuracy = accuracy_score(y_test, best_predictions_AdaBoost1)
clf12_optimized_f1 = f1_score(y_test, best_predictions_AdaBoost1)
print("AdaBoost DT Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf12_optimized_accuracy, clf12_optimized_f1))

"""Optimization of AdaBoost with GaussianNB"""
# Choose the time series cross-validator
tscv = TimeSeriesSplit(n_splits=3)
# Choose AdaBoost as the algorithm for optimization with GridSearch
clf13 = AdaBoostClassifier(base_estimator = GaussianNB())
# Create a dictionary for the parameters
parameters_AdaBoost2 = {'n_estimators':[3, 5, 7, 10, 25, 50, 75, 100], 'learning_rate':[0.01, 0.05, 0.1, 0.5, 1]}
# Define a scoring function
scorer = make_scorer(f1_score)
# Create the GridSearch object"""
grid_obj_AdaBoost2 = GridSearchCV(estimator=clf13, param_grid=parameters_AdaBoost2, scoring=scorer, cv=tscv)
# Fit the grid search object to the data to compute the optimal model
grid_fit_AdaBoost2 = grid_obj_AdaBoost2.fit(X_train, y_train)
# Return the optimal model after fitting the data
best_clf_AdaBoost2 = grid_fit_AdaBoost2.best_estimator_
# Make predictions with the optimal model
best_predictions_AdaBoost2 = best_clf_AdaBoost2.predict(X_test)
# Get the accuracy and f1_score of the optimized model
clf13_optimized_accuracy = accuracy_score(y_test, best_predictions_AdaBoost2)
clf13_optimized_f1 = f1_score(y_test, best_predictions_AdaBoost2)
print("AdaBoost DT Optimized [Accuracy score: {:.4f}, f1-score: {:.4f}]".format(clf13_optimized_accuracy, clf13_optimized_f1))