# III. Methodology: Code

### Setup

In [1]:
# Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Data Preprocessing

In [2]:
header_names = ['Symbol',
 'Date',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'Ex-Dividend',
 'Split Ratio',
 'Adj. Open',
 'Adj. High',
 'Adj. Low',
 'Adj. Close',
 'Adj. Volume']

In [3]:
# Read HUGE csv that has all the daily LSE data from 1977
# Data Preprocessing: adding header to CSV
df = pd.read_csv('~/lse-data/lse/WIKI_20160909.csv', header=None, names=header_names)

### 1.1 Examining Abnormalities

Need to investigate previous observation that Opening, High, Low, Close prices have minimum of 0.

In [9]:
df[df['Open'] == 0]
#['Symbol'].unique()

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
1047193,ARWR,2002-10-11,0.0,0.00,0.0,0.00,65000.0,0.0,1.0,0.0,0.00,0.0,0.000000,100.000000
1047194,ARWR,2002-10-14,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047195,ARWR,2002-10-15,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047196,ARWR,2002-10-16,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047197,ARWR,2002-10-17,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047198,ARWR,2002-10-18,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047199,ARWR,2002-10-21,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047200,ARWR,2002-10-22,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
7608936,LFVN,2003-02-21,0.0,0.01,0.0,0.01,27200.0,0.0,1.0,0.0,4.76,0.0,4.760000,57.142857
7608983,LFVN,2003-04-30,0.0,0.00,0.0,0.00,6800.0,0.0,1.0,0.0,0.00,0.0,0.000000,14.285714


### 1.2 Feature Engineering

#### 1.2.1 Measures of variation

In [10]:
# Create additional features
# These features are not used in the current model
df.loc[:,'Daily Variation'] = df.loc[:,'High'] - df.loc[:,'Low']
df.loc[:,'Percentage Variation'] = df.loc[:,'Daily Variation'] / df.loc[:,'Open'] * 100
df.loc[:,'Adj. Daily Variation'] = df.loc[:,'Adj. High'] - df.loc[:,'Adj. Low']
df.loc[:,'Adj. Percentage Variation'] = df.loc[:,'Adj. Daily Variation'] / df.loc[:,'Adj. Open'] * 100

#### 1.2.2 Extracting specific stocks
#### 1.2.2.1 BP

In [19]:
# Extract BP data
bp = df[df['Symbol'] == 'BP']
bp.head()

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
1923099,BP,1977-01-03,76.5,77.62,76.5,77.62,12400.0,0.0,1.0,1.990787,2.019933,1.990787,2.019933,198400.0,1.12,1.464052,0.029146,1.464052
1923100,BP,1977-01-04,77.62,78.0,76.75,77.0,19300.0,0.0,1.0,2.019933,2.029822,1.997292,2.003798,308800.0,1.25,1.61041,0.032529,1.61041
1923101,BP,1977-01-05,77.0,77.0,74.5,74.5,17900.0,0.0,1.0,2.003798,2.003798,1.93874,1.93874,286400.0,2.5,3.246753,0.065058,3.246753
1923102,BP,1977-01-06,74.5,75.5,74.5,75.12,23900.0,0.0,1.0,1.93874,1.964763,1.93874,1.954874,382400.0,1.0,1.342282,0.026023,1.342282
1923103,BP,1977-01-07,75.12,75.38,74.62,75.12,41700.0,0.0,1.0,1.954874,1.96164,1.941863,1.954874,667200.0,0.76,1.011715,0.019778,1.011715


#### 1.2.2.2 Stocks that are in the same group as BP:

Found using the LSE stocks list (supplementary data source).

#### 1.2.2.3 FTSE 100:

Source: Scraped from Google Finance.


In [17]:
ftse100_csv = pd.read_csv("ftse100-figures.csv")
ftse100_csv.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2016-09-09,6858.7,6862.38,6762.3,6776.95
1,2016-09-08,6846.58,6889.64,6819.82,6858.7
2,2016-09-07,6826.05,6856.12,6814.87,6846.58
3,2016-09-06,6879.42,6887.92,6818.96,6826.05
4,2016-09-05,6894.6,6910.66,6867.08,6879.42


#### 1.2.3 N-day moving averages

Only applying this to specific stocks because this takes much computational power.

In [20]:
# N-day moving averages of adjusted close prices

def n_day_moving_average(df, moving_average):
    # Create a column `N-day moving Average`.
    df['%s-day Moving Average' % str(moving_average)] = 0

    for i in range(moving_average, len(bp)):
        m_average = sum(df.iloc[i-moving_average:i]['Adj. Close'])/moving_average
        df.iloc[i].loc['%s-day Moving Average' % str(moving_average)] = m_average

# n_day_moving_average(bp, 30)

## 2. Implementation

### 2.1 Build training and test sets

In [21]:
# Initialise variables
# Number of days prior that we consider
days = 7
# Number of train and test examples combined
periods = 9000

# Columns
columns = []
for j in range(1,days+1):
    columns.append('i-%s' % str(j))
columns.append('Adj. High')
columns.append('Adj. Low')
print(columns)

# Index
start_date = bp.iloc[days]["Date"]
print("Start date: ", start_date)
index = pd.date_range(start_date, periods=periods, freq='D')

# Create empty dataframes for features and prices
features = pd.DataFrame(index=index, columns=columns)
prices = pd.DataFrame(index=index, columns=["Target"])

# Prepare test and training sets
for i in range(periods):
    prices.iloc[i]['Target'] = bp.iloc[i+days]['Adj. Close']
    for j in range(days):
        features.iloc[i]['i-%s' % str(7-j)] = bp.iloc[i+j]['Adj. Close']
    features.iloc[i]['Adj. High'] = max(bp[i:i+days]['Adj. High'])
    features.iloc[i]['Adj. Low'] = min(bp[i:i+days]['Adj. Low'])
print(features.head())
print(prices.head())

['i-1', 'i-2', 'i-3', 'i-4', 'i-5', 'i-6', 'i-7', 'Adj. High', 'Adj. Low']
Start date:  1977-01-12
                i-1      i-2      i-3      i-4      i-5      i-6      i-7  \
1977-01-12  1.95175  1.96789  1.95487  1.95487  1.93874   2.0038  2.01993   
1977-01-13  1.93223  1.95175  1.96789  1.95487  1.95487  1.93874   2.0038   
1977-01-14  1.97777  1.93223  1.95175  1.96789  1.95487  1.95487  1.93874   
1977-01-15  1.95175  1.97777  1.93223  1.95175  1.96789  1.95487  1.95487   
1977-01-16  1.95826  1.95175  1.97777  1.93223  1.95175  1.96789  1.95487   

           Adj. High Adj. Low  
1977-01-12   2.02982  1.93874  
1977-01-13   2.02982  1.91272  
1977-01-14    2.0038  1.91272  
1977-01-15   1.98766  1.91272  
1977-01-16   1.98766  1.91272  
             Target
1977-01-12  1.93223
1977-01-13  1.97777
1977-01-14  1.95175
1977-01-15  1.95826
1977-01-16  1.94863


In [22]:
# N-day prices target

# Initialise variables
target_days = 7

# Create target dataframe
nday_columns = []
for j in range(1,target_days+1):
    nday_columns.append('Day %s' % str(j-1))
nday_prices = pd.DataFrame(index=index, columns=nday_columns)

# Fill target dataframe
for i in range(periods):
    for j in range(target_days):
        nday_prices.iloc[i]['Day %s' % str(j)] = bp.iloc[i+days+j]['Adj. Close']
nday_prices

Unnamed: 0,Day 0,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6
1977-01-12,1.93223,1.97777,1.95175,1.95826,1.94863,2.0103,1.97777
1977-01-13,1.97777,1.95175,1.95826,1.94863,2.0103,1.97777,1.96789
1977-01-14,1.95175,1.95826,1.94863,2.0103,1.97777,1.96789,1.99079
1977-01-15,1.95826,1.94863,2.0103,1.97777,1.96789,1.99079,1.99391
1977-01-16,1.94863,2.0103,1.97777,1.96789,1.99079,1.99391,2.0038
1977-01-17,2.0103,1.97777,1.96789,1.99079,1.99391,2.0038,2.00068
1977-01-18,1.97777,1.96789,1.99079,1.99391,2.0038,2.00068,2.0038
1977-01-19,1.96789,1.99079,1.99391,2.0038,2.00068,2.0038,1.99079
1977-01-20,1.99079,1.99391,2.0038,2.00068,2.0038,1.99079,1.99729
1977-01-21,1.99391,2.0038,2.00068,2.0038,1.99079,1.99729,1.99729


In [32]:
# Train-test split (predict prices one day ahead)
def train_test_split_noshuffle(X, y, test_size=0.2):
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split_noshuffle(features, prices, test_size=0.2)

print("Train shapes (X,y): ", X_train.shape, y_train.shape)
print("Test shapes (X,y): ", X_test.shape, y_test.shape)

Train shapes (X,y):  (7200, 9) (7200, 1)
Test shapes (X,y):  (1800, 9) (1800, 1)


In [36]:
# Train-test split (predict prices `target_days` days ahead)

Xnd_train, Xnd_test, ynd_train, ynd_test = train_test_split_noshuffle(features, nday_prices, test_size=0.2)

print("Train shapes (Xnd,ynd): ", Xnd_train.shape, ynd_train.shape)
print("Test shapes (Xnd,ynd): ", Xnd_test.shape, ynd_test.shape)

Train shapes (Xnd,ynd):  (7200, 9) (7200, 7)
Test shapes (Xnd,ynd):  (1800, 9) (1800, 7)


### 2.2 Classifier

In [None]:
# Classifier

from sklearn import svm
# clf = svm.SVR()

from sklearn.multioutput import MultiOutputRegressor
clf = MultiOutputRegressor(svm.SVR(random_state=0))

clf.fit(Xnd_train, ynd_train)
pred = clf.predict(Xnd_test)