In [1]:
"""First we import the necessary libraries"""
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [3]:
"""Grab our csv files and import as DataFrames"""
SP500 = pd.read_csv('../Data/SP500_new.csv', parse_dates=True)
Nasdaq = pd.read_csv('../Data/Nasdaq_new.csv', parse_dates=True)
DJI = pd.read_csv('../Data/DJI_new.csv', parse_dates=True)
DAX = pd.read_csv('../Data/DAX_new.csv', parse_dates=True)
Paris = pd.read_csv('../Data/Paris_new.csv', parse_dates=True)
Tokyo = pd.read_csv('../Data/Tokyo_new.csv', parse_dates=True)
HongKong = pd.read_csv('../Data/HongKong_new.csv', parse_dates=True)
Aus = pd.read_csv('../Data/Aus_new.csv', parse_dates=True)

"""Our target variable is tomorrow's Adj Close"""
target_raw = (SP500['Adj Close'].shift(-1)/SP500['Adj Close'])-1

datasets = [SP500, Nasdaq, DJI, DAX, Paris, Tokyo, HongKong, Aus]
names = ['SP500', 'Nasdaq', 'DJI', 'DAX', 'Paris', 'Tokyo', 'HongKong', 'Aus']


"""
The generate_features function performs feature engineering using Adj Close,
the features generated are Daily Returns, Momentum (Daily Returns over 2 days),
Daily Return SMA and lagging Daily Returns
"""

def generate_features(datasets, DR, DR_SMA, Lagging):
    Max = max(DR, DR_SMA, Lagging+1)
    for i in range(len(datasets)):
        dataset = datasets[i]
        name = names[i]
        for j in range(1, DR+1):
            dataset[name+'_'+str(j)+'DailyReturn'] = (dataset['Adj Close']/dataset['Adj Close'].shift(j))-1
        for k in range(2, DR_SMA+1):
            dataset[name+'_'+str(k)+'DR_SMA'] = pd.rolling_mean(dataset[name+'_'+str(1)+'DailyReturn'], window=k)
        for l in range(1, Lagging+1):
            dataset[name+'_'+str(l)+'LaggingDays'] = dataset[name+'_'+str(1)+'DailyReturn'].shift(l)
        dataset.drop(dataset.index[:Max], inplace=True)
    return Max

"""After feature engineering, merge all datasets and drop the 'useless' features"""
def merge_datasets(datasets):
    drop_features = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Date']
    for i in range(len(datasets)):
        datasets[i] = datasets[i].drop(drop_features, axis=1)
    megaset = pd.concat(datasets, axis=1)
    return megaset

generate_features(datasets, 9, 9, 9)
megaset = merge_datasets(datasets)

"""Label encode our target variable, 1 for increase, 0 for decrease or no change"""
target = target_raw[Max:]
target[target > 0] = 1
target[target <= 0] = 0

"""Split our megaset into training and cross-validation (test) subsets"""
X_train = megaset[:-500]
X_test = megaset[-500:-1]
y_train = target[:-500]
y_test = target[-500:-1]

	Series.rolling(window=2,center=False).mean()
	Series.rolling(window=3,center=False).mean()
	Series.rolling(window=4,center=False).mean()
	Series.rolling(window=5,center=False).mean()
	Series.rolling(window=6,center=False).mean()
	Series.rolling(window=7,center=False).mean()
	Series.rolling(window=8,center=False).mean()
	Series.rolling(window=9,center=False).mean()


In [4]:
"""Support Vector Classifier with Linear Kernel"""
clf1 = svm.SVC(kernel = 'linear')
clf1.fit(X_train, y_train)
clf1_predictions = clf1.predict(X_test)
clf1_accuracy = accuracy_score(y_test, clf1_predictions)
clf1_f1 = f1_score(y_test, clf1_predictions)
print("SVM Linear: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf1_accuracy, clf1_f1))

SVM Linear: [Accuracy: 0.5351, f1-score: 0.6971]


In [5]:
"""Support Vector Classifier with RBF Kernel"""
clf2 = svm.SVC(kernel = 'rbf')
clf2.fit(X_train, y_train)
clf2_predictions = clf2.predict(X_test)
clf2_accuracy = accuracy_score(y_test, clf2_predictions)
clf2_f1 = f1_score(y_test, clf2_predictions)
print("SVM RBF: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf2_accuracy, clf2_f1))

SVM RBF: [Accuracy: 0.5351, f1-score: 0.6971]


In [6]:
"""k-Nearest Neighbours"""
clf3 = KNeighborsClassifier(n_neighbors = 3)
clf3.fit(X_train, y_train)
clf3_predictions = clf3.predict(X_test)
clf3_accuracy = accuracy_score(y_test, clf3_predictions)
clf3_f1 = f1_score(y_test, clf3_predictions)
print("kNN: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf3_accuracy, clf3_f1))

kNN: [Accuracy: 0.5210, f1-score: 0.5755]


In [7]:
"""Decision Tree Classifier"""
clf4 = tree.DecisionTreeClassifier()
clf4.fit(X_train, y_train)
clf4_predictions = clf4.predict(X_test)
clf4_accuracy = accuracy_score(y_test, clf4_predictions)
clf4_f1 = f1_score(y_test, clf4_predictions)
print("Decision Tree: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf4_accuracy, clf4_f1))

Decision Tree: [Accuracy: 0.4930, f1-score: 0.5289]


In [8]:
"""Random Forest Classifier"""
clf5 = RandomForestClassifier(n_estimators=10)
clf5.fit(X_train, y_train)
clf5_predictions = clf4.predict(X_test)
clf5_accuracy = accuracy_score(y_test, clf5_predictions)
clf5_f1 = f1_score(y_test, clf5_predictions)
print("Random Forest Classifier: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf5_accuracy, clf5_f1))

Random Forest Classifier: [Accuracy: 0.4930, f1-score: 0.5289]


In [9]:
"""AdaBoost Classifier"""
clf6 = AdaBoostClassifier(n_estimators=100)
clf6.fit(X_train, y_train)
clf6_predictions = clf6.predict(X_test)
clf6_accuracy = accuracy_score(y_test, clf6_predictions)
clf6_f1 = f1_score(y_test, clf6_predictions)
print("AdaBoost Classifier: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf6_accuracy, clf6_f1))

AdaBoost Classifier: [Accuracy: 0.4990, f1-score: 0.5819]


In [10]:
"""Gradient Boosting Classifier"""
clf7 = GradientBoostingClassifier(n_estimators=100)
clf7.fit(X_train, y_train)
clf7_predictions = clf7.predict(X_test)
clf7_accuracy = accuracy_score(y_test, clf7_predictions)
clf7_f1 = f1_score(y_test, clf7_predictions)
print("Gradient Boosting Classifier: [Accuracy: {:.4f}, f1-score: {:.4f}]".format(clf7_accuracy, clf7_f1))

Gradient Boosting Classifier: [Accuracy: 0.5210, f1-score: 0.6470]
