In [1]:
"""First we import the necessary libraries"""
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
"""Grab our csv files and import as DataFrames"""
SP500 = pd.read_csv('../Data/SP500_new.csv', parse_dates=True)
Nasdaq = pd.read_csv('../Data/Nasdaq_new.csv', parse_dates=True)
DJI = pd.read_csv('../Data/DJI_new.csv', parse_dates=True)
DAX = pd.read_csv('../Data/DAX_new.csv', parse_dates=True)
Paris = pd.read_csv('../Data/Paris_new.csv', parse_dates=True)
Tokyo = pd.read_csv('../Data/Tokyo_new.csv', parse_dates=True)
HongKong = pd.read_csv('../Data/HongKong_new.csv', parse_dates=True)
Aus = pd.read_csv('../Data/Aus_new.csv', parse_dates=True)

In [3]:
"""Our target variable is tomorrow's Adj Close"""
target_raw = (SP500['Adj Close'].shift(-1)/SP500['Adj Close'])-1

In [4]:
"""
The generate_features function performs feature engineering using Adj Close, the features generated are Daily Returns, 
Momentum (Daily Returns over 2 days), Daily Return SMA and lagging Daily Returns
"""
datasets = [SP500, Nasdaq, DJI, DAX, Paris, Tokyo, HongKong, Aus]
names = ['SP500', 'Nasdaq', 'DJI', 'DAX', 'Paris', 'Tokyo', 'HongKong', 'Aus']

def generate_features(datasets, DR, DR_SMA, Lagging):
    Max = max(DR, DR_SMA, Lagging+1)
    for i in range(len(datasets)):
        dataset = datasets[i]
        name = names[i]
        for j in range(1, DR+1):
            dataset[name+'_'+str(j)+'DailyReturn'] = (dataset['Adj Close']/dataset['Adj Close'].shift(j))-1
        for k in range(2, DR_SMA+1):
            dataset[name+'_'+str(k)+'DR_SMA'] = pd.rolling_mean(dataset[name+'_'+str(1)+'DailyReturn'], window=k)
        for l in range(1, Lagging+1):
            dataset[name+'_'+str(l)+'LaggingDays'] = dataset[name+'_'+str(1)+'DailyReturn'].shift(l)
        dataset.drop(dataset.index[:Max], inplace=True)
    return Max

In [5]:
"""After feature engineering, merge all datasets and drop the 'useless' features"""
def merge_datasets(datasets):
    drop_features = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Date']
    for i in range(len(datasets)):
        datasets[i] = datasets[i].drop(drop_features, axis=1)
    megaset = pd.concat(datasets, axis=1)
    return megaset

In [6]:
generate_features(datasets, 9, 9, 9)
megaset = merge_datasets(datasets)

	Series.rolling(window=2,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=3,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=4,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=5,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=6,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=7,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=8,center=False).mean()
  app.launch_new_instance()
	Series.rolling(window=9,center=False).mean()
  app.launch_new_instance()


In [8]:
"""Label encode our target variable, 1 for increase, 0 for decrease or no change"""
target = target_raw[Max:]
target[target > 0] = 1
target[target <= 0] = 0

In [9]:
"""Split our megaset into training and cross-validation (test) subsets"""
X_train = megaset[:-500]
X_test = megaset[-500:-1]
y_train = target[:-500]
y_test = target[-500:-1]

In [11]:
"""Let's have a look at the dimensions of our training and testing sets"""
print("The size of our training features is: {}".format(X_train.shape))
print("The size of our testing features is: {}".format(X_test.shape))
print("The size of our training target is: {}".format(y_train.shape))
print("The size of our testing target is: {}".format(y_test.shape))

The size of our training features is: (5743, 208)
The size of our testing features is: (499, 208)
The size of our training target is: (5743,)
The size of our testing target is: (499,)
