# III. Methodology: Code

### Setup

In [292]:
# Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Data Preprocessing

In [293]:
header_names = ['Symbol',
 'Date',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'Ex-Dividend',
 'Split Ratio',
 'Adj. Open',
 'Adj. High',
 'Adj. Low',
 'Adj. Close',
 'Adj. Volume']

In [294]:
# Read HUGE csv that has all the daily LSE data from 1977
# Data Preprocessing: adding header to CSV
df = pd.read_csv('~/lse-data/lse/WIKI_20160909.csv', header=None, names=header_names)

### 1.1 Examining Abnormalities

Need to investigate previous observation that Opening, High, Low, Close prices have minimum of 0.

In [295]:
df[df['Open'] == 0]
#['Symbol'].unique()

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
1047193,ARWR,2002-10-11,0.0,0.00,0.0,0.00,65000.0,0.0,1.0,0.0,0.00,0.0,0.000000,100.000000
1047194,ARWR,2002-10-14,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047195,ARWR,2002-10-15,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047196,ARWR,2002-10-16,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047197,ARWR,2002-10-17,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047198,ARWR,2002-10-18,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047199,ARWR,2002-10-21,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047200,ARWR,2002-10-22,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
7608936,LFVN,2003-02-21,0.0,0.01,0.0,0.01,27200.0,0.0,1.0,0.0,4.76,0.0,4.760000,57.142857
7608983,LFVN,2003-04-30,0.0,0.00,0.0,0.00,6800.0,0.0,1.0,0.0,0.00,0.0,0.000000,14.285714


### 1.2 Feature Engineering

#### 1.2.1 Measures of variation

In [296]:
# Create additional features
# These features are not used in the current model
df.loc[:,'Daily Variation'] = df.loc[:,'High'] - df.loc[:,'Low']
df.loc[:,'Percentage Variation'] = df.loc[:,'Daily Variation'] / df.loc[:,'Open'] * 100
df.loc[:,'Adj. Daily Variation'] = df.loc[:,'Adj. High'] - df.loc[:,'Adj. Low']
df.loc[:,'Adj. Percentage Variation'] = df.loc[:,'Adj. Daily Variation'] / df.loc[:,'Adj. Open'] * 100

#### 1.2.2 Extracting specific stocks
#### 1.2.2.1 BP

In [297]:
# Extract BP data
bp = df[df['Symbol'] == 'BP']
bp.head()

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
1923099,BP,1977-01-03,76.5,77.62,76.5,77.62,12400.0,0.0,1.0,1.990787,2.019933,1.990787,2.019933,198400.0,1.12,1.464052,0.029146,1.464052
1923100,BP,1977-01-04,77.62,78.0,76.75,77.0,19300.0,0.0,1.0,2.019933,2.029822,1.997292,2.003798,308800.0,1.25,1.61041,0.032529,1.61041
1923101,BP,1977-01-05,77.0,77.0,74.5,74.5,17900.0,0.0,1.0,2.003798,2.003798,1.93874,1.93874,286400.0,2.5,3.246753,0.065058,3.246753
1923102,BP,1977-01-06,74.5,75.5,74.5,75.12,23900.0,0.0,1.0,1.93874,1.964763,1.93874,1.954874,382400.0,1.0,1.342282,0.026023,1.342282
1923103,BP,1977-01-07,75.12,75.38,74.62,75.12,41700.0,0.0,1.0,1.954874,1.96164,1.941863,1.954874,667200.0,0.76,1.011715,0.019778,1.011715


#### 1.2.2.2 Stocks that are in the same group as BP:

Found using the LSE stocks list (supplementary data source).

In [298]:
# See which stocks are in our dataset:
oil_stocks = ["SNP", "GAIA", "GAID", "GAZ", "81JK", "OGZD", "GDG", "98LQ", "HLPD", 
              "LKOE", "LKOD", "LKOH", "MOLD", "MNMD", "05IS", "40XT", "ROSN",
             "RDSA", "RDSB", "SAC", "SGGD", "ATAD"]
oil_stocks_in_df = []
for stock in oil_stocks:
    in_df = False
    if not df[df['Symbol'] == stock].empty:
        in_df = True
        oil_stocks_in_df.append(stock)
    # print(stock, in_df)
print("Oil stocks in DF: ", oil_stocks_in_df)

Oil stocks in DF:  ['GAIA']


In [299]:
gaia = df[df['Symbol'] == 'GAIA']
gaia.head()
# GAIA data is available from 1999-10-29 to 2016-09-09.

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
5391755,GAIA,1999-10-29,5.5,8.62,5.38,6.38,895000.0,0.0,1.0,5.303154,8.311489,5.187449,6.151659,895000.0,3.24,58.909091,3.12404,58.909091
5391756,GAIA,1999-11-01,6.62,6.94,6.5,6.88,144900.0,0.0,1.0,6.383069,6.691617,6.267364,6.633764,144900.0,0.44,6.646526,0.424252,6.646526
5391757,GAIA,1999-11-02,6.91,6.94,6.5,6.62,158000.0,0.0,1.0,6.66269,6.691617,6.267364,6.383069,158000.0,0.44,6.367583,0.424252,6.367583
5391758,GAIA,1999-11-03,6.56,6.75,6.56,6.62,54500.0,0.0,1.0,6.325217,6.508417,6.325217,6.383069,54500.0,0.19,2.896341,0.1832,2.896341
5391759,GAIA,1999-11-04,6.62,6.69,6.56,6.56,21000.0,0.0,1.0,6.383069,6.450564,6.325217,6.325217,21000.0,0.13,1.963746,0.125347,1.963746


In [300]:
bp.loc[bp['Date'] == '1999-10-29']

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
1928868,BP,1999-10-29,57.5,58.12,57.38,57.75,2688800.0,0.0,1.0,28.106849,28.409914,28.048192,28.229053,2688800.0,0.74,1.286957,0.361723,1.286957


In [301]:
# Add GAIA figures to BP dataframe

# GAIA data starts on 1999-10-29

# Label for the BP row with date 1999-10-29
bp_gaia_start = 1928868
# Label for the GAIA row with date 1999-10-29
gaia_start = 5391755

data_to_copy = ['Date', 'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close']

bp_gaia_intersect_length = 3753

for i in range(bp_gaia_intersect_length):
    for col in data_to_copy:
        bp.loc[bp_gaia_start+i,'GAIA %s' % str(col)] = gaia.loc[gaia_start+i,'%s' % str(col)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [302]:
# OPTIONAL, MAY DELETE
# Showing that `bp_gaia_intersect_length` is correct
print(bp.loc[bp_gaia_start+bp_gaia_intersect_length-1])
print(bp.loc[bp_gaia_start+bp_gaia_intersect_length])

Symbol                               BP
Date                         2014-09-30
Open                              44.04
High                              44.22
Low                                43.8
Close                             43.95
Volume                       6.8345e+06
Ex-Dividend                           0
Split Ratio                           1
Adj. Open                       39.0862
Adj. High                        39.246
Adj. Low                        38.8732
Adj. Close                      39.0064
Adj. Volume                  6.8345e+06
Daily Variation                    0.42
Percentage Variation           0.953678
Adj. Daily Variation           0.372757
Adj. Percentage Variation      0.953678
GAIA Date                    2014-09-30
GAIA Adj. Open                     6.61
GAIA Adj. High                     7.41
GAIA Adj. Low                      6.61
GAIA Adj. Close                    7.34
Name: 1932620, dtype: object
Symbol                               BP
Date       

#### 1.2.2.3 FTSE 100:

Source: Scraped from Google Finance.

In [303]:
ftse100_csv = pd.read_csv("ftse100-figures.csv")
ftse100_csv.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2016-09-09,6858.7,6862.38,6762.3,6776.95
1,2016-09-08,6846.58,6889.64,6819.82,6858.7
2,2016-09-07,6826.05,6856.12,6814.87,6846.58
3,2016-09-06,6879.42,6887.92,6818.96,6826.05
4,2016-09-05,6894.6,6910.66,6867.08,6879.42


In [304]:
# Sorted FTSE100 by date (ascending) to fit with LSE stock data

# Date range from 1984-04-02 to 2016-09-09
sorted_ftse100 = ftse100_csv.sort_values(by='Date')
sorted_ftse100.head()

Unnamed: 0,Date,Open,High,Low,Close
8187,1984-04-02,1108.1,1108.1,1108.1,1108.1
8186,1984-04-03,1095.4,1095.4,1095.4,1095.4
8185,1984-04-04,1095.4,1095.4,1095.4,1095.4
8184,1984-04-05,1102.2,1102.2,1102.2,1102.2
8183,1984-04-06,1096.3,1096.3,1096.3,1096.3


In [305]:
bp[bp['Date'] == '1984-04-02']

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,...,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation,GAIA Date,GAIA Adj. Open,GAIA Adj. High,GAIA Adj. Low,GAIA Adj. Close
1924931,BP,1984-04-02,45.62,46.38,45.5,46.0,209700.0,0.0,1.0,4.748742,...,838800.0,0.88,1.928979,0.091602,1.928979,,,,,


In [306]:
sorted_ftse100.head()

Unnamed: 0,Date,Open,High,Low,Close
8187,1984-04-02,1108.1,1108.1,1108.1,1108.1
8186,1984-04-03,1095.4,1095.4,1095.4,1095.4
8185,1984-04-04,1095.4,1095.4,1095.4,1095.4
8184,1984-04-05,1102.2,1102.2,1102.2,1102.2
8183,1984-04-06,1096.3,1096.3,1096.3,1096.3


In [307]:
# Add FTSE data to BP dataframe

# FTSE data starts on 1984-04-02

# Label for the BP row with date 1984-04-02
bp_ftse_start = 1924931
# Label for the GAIA row with date 1984-04-02
ftse_start = 8187

ftse_data_to_copy = ['Date', 'Open', 'High', 'Low', 'Close']

ftse_gaia_intersect_length = len(ftse100_csv)

for i in range(ftse_gaia_intersect_length):
    for col in ftse_data_to_copy:
        bp.loc[bp_ftse_start+i,'FTSE %s' % str(col)] = sorted_ftse100.loc[ftse_start-i,'%s' % str(col)]

bp.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,...,GAIA Date,GAIA Adj. Open,GAIA Adj. High,GAIA Adj. Low,GAIA Adj. Close,FTSE Date,FTSE Open,FTSE High,FTSE Low,FTSE Close
1933114,,,,,,,,,,,...,,,,,,2016-09-05,6894.6,6910.66,6867.08,6879.42
1933115,,,,,,,,,,,...,,,,,,2016-09-06,6879.42,6887.92,6818.96,6826.05
1933116,,,,,,,,,,,...,,,,,,2016-09-07,6826.05,6856.12,6814.87,6846.58
1933117,,,,,,,,,,,...,,,,,,2016-09-08,6846.58,6889.64,6819.82,6858.7
1933118,,,,,,,,,,,...,,,,,,2016-09-09,6858.7,6862.38,6762.3,6776.95


In [308]:
for i in range(ftse_gaia_intersect_length):
    bp_date = bp.loc[bp_ftse_start+i, 'Date']
    ftse_date = bp.loc[bp_ftse_start+i, 'FTSE Date']
    if bp_date != ftse_date:
        print("BP: ", bp_date, " FTSE: ", ftse_date)

BP:  1984-04-23  FTSE:  1984-04-20
BP:  1984-04-24  FTSE:  1984-04-23
BP:  1984-04-25  FTSE:  1984-04-24
BP:  1984-04-26  FTSE:  1984-04-25
BP:  1984-04-27  FTSE:  1984-04-26
BP:  1984-05-02  FTSE:  1984-05-03
BP:  1984-05-03  FTSE:  1984-05-04
BP:  1984-05-04  FTSE:  1984-05-08
BP:  1984-05-07  FTSE:  1984-05-09
BP:  1984-05-08  FTSE:  1984-05-10
BP:  1984-05-09  FTSE:  1984-05-11
BP:  1984-05-10  FTSE:  1984-05-14
BP:  1984-05-11  FTSE:  1984-05-15
BP:  1984-05-14  FTSE:  1984-05-16
BP:  1984-05-15  FTSE:  1984-05-17
BP:  1984-05-16  FTSE:  1984-05-18
BP:  1984-05-17  FTSE:  1984-05-21
BP:  1984-05-18  FTSE:  1984-05-22
BP:  1984-05-21  FTSE:  1984-05-23
BP:  1984-05-22  FTSE:  1984-05-24
BP:  1984-05-23  FTSE:  1984-05-25
BP:  1984-05-24  FTSE:  1984-05-30
BP:  1984-05-25  FTSE:  1984-05-31
BP:  1984-05-29  FTSE:  1984-06-01
BP:  1984-05-30  FTSE:  1984-06-04
BP:  1984-05-31  FTSE:  1984-06-05
BP:  1984-06-01  FTSE:  1984-06-06
BP:  1984-06-04  FTSE:  1984-06-07
BP:  1984-06-05  FTS

In [309]:
bp.loc[bp_ftse_start+2350]

Symbol                               BP
Date                         1993-07-20
Open                              52.25
High                                 53
Low                               52.12
Close                                53
Volume                           961600
Ex-Dividend                           0
Split Ratio                           1
Adj. Open                       5.96843
Adj. High                        6.0541
Adj. Low                        5.95358
Adj. Close                       6.0541
Adj. Volume                  3.8464e+06
Daily Variation                    0.88
Percentage Variation            1.68421
Adj. Daily Variation           0.100521
Adj. Percentage Variation       1.68421
GAIA Date                           NaN
GAIA Adj. Open                      NaN
GAIA Adj. High                      NaN
GAIA Adj. Low                       NaN
GAIA Adj. Close                     NaN
FTSE Date                    1993-07-21
FTSE Open                        2827.2


In [310]:
# Add FTSE data to BP dataframe
# bp.iloc[1832] has date 1984-04-02.
# BP is of row 1923099 to 1933108 in df

bp_with_ftse = bp.loc[1832+1923099:]
bp_with_ftse.loc[:,'FTSE Open'] = sorted_ftse100.loc[:,'Open']
bp_with_ftse.loc[:,'FTSE Close'] = sorted_ftse100.loc[:,'Close']

bp_with_ftse.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,...,GAIA Date,GAIA Adj. Open,GAIA Adj. High,GAIA Adj. Low,GAIA Adj. Close,FTSE Date,FTSE Open,FTSE High,FTSE Low,FTSE Close
1924931,BP,1984-04-02,45.62,46.38,45.5,46.0,209700.0,0.0,1.0,4.748742,...,,,,,,1984-04-02,,1108.1,1108.1,
1924932,BP,1984-04-03,46.12,46.5,45.88,46.38,148900.0,0.0,1.0,4.800788,...,,,,,,1984-04-03,,1095.4,1095.4,
1924933,BP,1984-04-04,46.62,48.0,46.62,48.0,283800.0,0.0,1.0,4.852835,...,,,,,,1984-04-04,,1095.4,1095.4,
1924934,BP,1984-04-05,48.38,48.38,47.0,47.5,166400.0,0.0,1.0,5.03604,...,,,,,,1984-04-05,,1102.2,1102.2,
1924935,BP,1984-04-06,47.12,47.5,47.0,47.5,81500.0,0.0,1.0,4.904882,...,,,,,,1984-04-06,,1096.3,1096.3,


#### 1.2.3 N-day moving averages

Only applying this to specific stocks because this takes much computational power.

In [311]:
# N-day moving averages of adjusted close prices

def n_day_moving_average(df, moving_average):
    # Create a column `N-day moving Average`.
    df['%s-day Moving Average' % str(moving_average)] = 0

    for i in range(moving_average, len(bp)):
        m_average = sum(df.iloc[i-moving_average:i]['Adj. Close'])/moving_average
        df.iloc[i].loc['%s-day Moving Average' % str(moving_average)] = m_average
    
    return df

n_day_moving_average(bp, 7)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,...,GAIA Adj. Open,GAIA Adj. High,GAIA Adj. Low,GAIA Adj. Close,FTSE Date,FTSE Open,FTSE High,FTSE Low,FTSE Close,7-day Moving Average
1923099,BP,1977-01-03,76.50,77.6200,76.5000,77.62,12400.0,0.0,1.0,1.990787,...,,,,,,,,,,0
1923100,BP,1977-01-04,77.62,78.0000,76.7500,77.00,19300.0,0.0,1.0,2.019933,...,,,,,,,,,,0
1923101,BP,1977-01-05,77.00,77.0000,74.5000,74.50,17900.0,0.0,1.0,2.003798,...,,,,,,,,,,0
1923102,BP,1977-01-06,74.50,75.5000,74.5000,75.12,23900.0,0.0,1.0,1.938740,...,,,,,,,,,,0
1923103,BP,1977-01-07,75.12,75.3800,74.6200,75.12,41700.0,0.0,1.0,1.954874,...,,,,,,,,,,0
1923104,BP,1977-01-10,75.12,75.7500,74.5000,75.62,13000.0,0.0,1.0,1.954874,...,,,,,,,,,,0
1923105,BP,1977-01-11,75.62,76.3800,74.7500,75.00,13300.0,0.0,1.0,1.967886,...,,,,,,,,,,0
1923106,BP,1977-01-12,74.75,74.7500,73.5000,74.25,21000.0,0.0,1.0,1.945246,...,,,,,,,,,,0
1923107,BP,1977-01-13,74.25,76.0000,74.1200,76.00,27300.0,0.0,1.0,1.932234,...,,,,,,,,,,0
1923108,BP,1977-01-14,76.00,76.0000,75.0000,75.00,10400.0,0.0,1.0,1.977775,...,,,,,,,,,,0


## 2. Implementation

### 2.1 Build training and test sets

In [312]:
# Not tried yet
def prepare_train_test(days, periods, target='Adj. Close', test_size=0.2, buffer=0, target_days=7):  
    """Returns X_train, X_test, y_train, y_test for parameters.
    Predicts prices `target_days` ahead."""
    # Columns
    columns = []
    for j in range(1,days+1):
        columns.append('i-%s' % str(j))
    columns.append('Adj. High')
    columns.append('Adj. Low')

    # Columns: Prices (predict multiple day)
    nday_columns = []
    for j in range(1,target_days+1):
        nday_columns.append('Day %s' % str(j-1))

    # Index
    start_date = bp.iloc[days+buffer]["Date"]
    index = pd.date_range(start_date, periods=periods, freq='D')

    # Create empty dataframes for features and prices
    features = pd.DataFrame(index=index, columns=columns)
    prices = pd.DataFrame(index=index, columns=["Target"])
    nday_prices = pd.DataFrame(index=index, columns=nday_columns)

    # Prepare test and training sets
    for i in range(periods):
        # Fill in Target df
#        prices.iloc[i]['Target'] = bp.iloc[i+days][target]
        for j in range(target_days):
            nday_prices.iloc[i]['Day %s' % str(j)] = bp.iloc[i+days+j][target]
        # Fill in Features df
        for j in range(days):
            features.iloc[i]['i-%s' % str(days-j)] = bp.iloc[i+j][target]
        features.iloc[i]['Adj. High'] = max(bp[i:i+days]['Adj. High'])
        features.iloc[i]['Adj. Low'] = min(bp[i:i+days]['Adj. Low'])
    print("Features", features.head())
    # print("Prices", prices.head())
                
    X = features
    y = nday_prices
    
    # Train-test split
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, X_test, y_train, y_test

In [315]:
# Initialise variables
# Number of days prior that we consider
days = 100
# Number of train and test examples combined
periods = 1000
# Entries that we exclude from consideration completely
buffer = 0 

X_train, X_test, y_train, y_test = prepare_train_test(days, periods, buffer=buffer)

print("Train shapes (X,y): ", X_train.shape, y_train.shape)
print("Test shapes (X,y): ", X_test.shape, y_test.shape)

Features                 i-1      i-2      i-3      i-4      i-5      i-6      i-7  \
1977-05-25  2.31608  2.34522  2.35199  2.36813  2.35511  2.32909   2.3421   
1977-05-26  2.26715  2.31608  2.34522  2.35199  2.36813  2.35511  2.32909   
1977-05-27  2.27054  2.26715  2.31608  2.34522  2.35199  2.36813  2.35511   
1977-05-28  2.26091  2.27054  2.26715  2.31608  2.34522  2.35199  2.36813   
1977-05-29  2.26403  2.26091  2.27054  2.26715  2.31608  2.34522  2.35199   

                i-8      i-9     i-10   ...        i-93     i-94     i-95  \
1977-05-25  2.32258  2.31608  2.32258   ...     1.93223  1.95175  1.96789   
1977-05-26   2.3421  2.32258  2.31608   ...     1.97777  1.93223  1.95175   
1977-05-27  2.32909   2.3421  2.32258   ...     1.95175  1.97777  1.93223   
1977-05-28  2.35511  2.32909   2.3421   ...     1.95826  1.95175  1.97777   
1977-05-29  2.36813  2.35511  2.32909   ...     1.94863  1.95826  1.95175   

               i-96     i-97     i-98     i-99    i-100 Adj. High

### 2.2 Classifier

In [None]:
# Import MultiOutputRegressor to handle predicting multiple outputs
from sklearn.multioutput import MultiOutputRegressor

# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error

In [None]:
# Helper functions for metrics
def rmsp(test, pred):
    return np.sqrt(np.mean(((test - pred)/test)**2))

def print_metrics(test, pred):
    print("Root Mean Squared Percentage Error", rmsp(test, pred))
    print("Mean Absolute Error: ", mean_absolute_error(test, pred))
    print("Explained Variance Score: ", explained_variance_score(test, pred))
    print("Mean Squared Error: ", mean_squared_error(test, pred))
    print("R2 score: ", r2_score(test, pred))
    print("Median Absolute Error: ", median_absolute_error(test, pred))

In [None]:
# Import Classifiers
from sklearn import svm
from sklearn.linear_model import LinearRegression

In [None]:
# Apply Classifier and Print Metrics
def classify_and_metrics(clf=LinearRegression(), X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """Trains and tests classifier on training and test datasets.
    Prints performance metrics.
    Returns classifier and predictions."""
    clf = MultiOutputRegressor(clf)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    
    # Print metrics
    print("# Days used to predict: %s" % str(days))
    print("\n%s-day predictions" % str(target_days)) 
    print_metrics(y_test, pred)
    
    return clf, pred

In [None]:
# Do multiple train-test cycles on different train-test sets and see
# if they all produce reliable results
for segment in range(10):
    buffer = segment*1000
    X_train, X_test, y_train, y_test = prepare_train_test(days, periods, buffer=buffer)
    classify_and_metrics()

### Metrics

## 3. Refinement



### 3.1 Tuning model parameters


In [None]:
columns

## Deprecated

In [136]:
# Initialise variables
# Number of days prior that we consider
days = 100
# Number of train and test examples combined
periods = 9000

# Columns
columns = []
for j in range(1,days+1):
    columns.append('i-%s' % str(j))
columns.append('Adj. High')
columns.append('Adj. Low')
print(columns)

# Index
start_date = bp.iloc[days]["Date"]
print("Start date: ", start_date)
index = pd.date_range(start_date, periods=periods, freq='D')

# Create empty dataframes for features and prices
features = pd.DataFrame(index=index, columns=columns)
prices = pd.DataFrame(index=index, columns=["Target"])

# Prepare test and training sets
for i in range(periods):
    prices.iloc[i]['Target'] = bp.iloc[i+days]['Adj. Close']
    for j in range(days):
        features.iloc[i]['i-%s' % str(days-j)] = bp.iloc[i+j]['Adj. Close']
    features.iloc[i]['Adj. High'] = max(bp[i:i+days]['Adj. High'])
    features.iloc[i]['Adj. Low'] = min(bp[i:i+days]['Adj. Low'])
print(features.head())
print(prices.head())

['i-1', 'i-2', 'i-3', 'i-4', 'i-5', 'i-6', 'i-7', 'i-8', 'i-9', 'i-10', 'i-11', 'i-12', 'i-13', 'i-14', 'i-15', 'i-16', 'i-17', 'i-18', 'i-19', 'i-20', 'i-21', 'i-22', 'i-23', 'i-24', 'i-25', 'i-26', 'i-27', 'i-28', 'i-29', 'i-30', 'i-31', 'i-32', 'i-33', 'i-34', 'i-35', 'i-36', 'i-37', 'i-38', 'i-39', 'i-40', 'i-41', 'i-42', 'i-43', 'i-44', 'i-45', 'i-46', 'i-47', 'i-48', 'i-49', 'i-50', 'i-51', 'i-52', 'i-53', 'i-54', 'i-55', 'i-56', 'i-57', 'i-58', 'i-59', 'i-60', 'i-61', 'i-62', 'i-63', 'i-64', 'i-65', 'i-66', 'i-67', 'i-68', 'i-69', 'i-70', 'i-71', 'i-72', 'i-73', 'i-74', 'i-75', 'i-76', 'i-77', 'i-78', 'i-79', 'i-80', 'i-81', 'i-82', 'i-83', 'i-84', 'i-85', 'i-86', 'i-87', 'i-88', 'i-89', 'i-90', 'i-91', 'i-92', 'i-93', 'i-94', 'i-95', 'i-96', 'i-97', 'i-98', 'i-99', 'i-100', 'Adj. High', 'Adj. Low']
Start date:  1977-05-25
                i-1      i-2      i-3      i-4      i-5      i-6      i-7  \
1977-05-25  2.31608  2.34522  2.35199  2.36813  2.35511  2.32909   2.3421   
1977

In [None]:
# N-day prices target

# Initialise variables
target_days = 7

# Create target dataframe
nday_columns = []
for j in range(1,target_days+1):
    nday_columns.append('Day %s' % str(j-1))
nday_prices = pd.DataFrame(index=index, columns=nday_columns)

# Fill target dataframe
for i in range(periods):
    for j in range(target_days):
        nday_prices.iloc[i]['Day %s' % str(j)] = bp.iloc[i+days+j]['Adj. Close']
nday_prices

In [137]:
# Train-test split (predict prices one day ahead)
def train_test_split_noshuffle(X, y, test_size=0.2):
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split_noshuffle(features, prices, test_size=0.2)

print("Train shapes (X,y): ", X_train.shape, y_train.shape)
print("Test shapes (X,y): ", X_test.shape, y_test.shape)

Train shapes (X,y):  (7200, 102) (7200, 1)
Test shapes (X,y):  (1800, 102) (1800, 1)


In [138]:
# Train-test split (predict prices `target_days` days ahead)

Xnd_train, Xnd_test, ynd_train, ynd_test = train_test_split_noshuffle(features, nday_prices, test_size=0.2)

print("Train shapes (Xnd,ynd): ", Xnd_train.shape, ynd_train.shape)
print("Test shapes (Xnd,ynd): ", Xnd_test.shape, ynd_test.shape)

Train shapes (Xnd,ynd):  (7200, 102) (7200, 7)
Test shapes (Xnd,ynd):  (1800, 102) (1800, 7)
