# III. Methodology: Code

### Setup

In [2]:
# Import modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Data Preprocessing

In [3]:
header_names = ['Symbol',
 'Date',
 'Open',
 'High',
 'Low',
 'Close',
 'Volume',
 'Ex-Dividend',
 'Split Ratio',
 'Adj. Open',
 'Adj. High',
 'Adj. Low',
 'Adj. Close',
 'Adj. Volume']

In [4]:
# Read HUGE csv that has all the daily LSE data from 1977
# Data Preprocessing: adding header to CSV
df = pd.read_csv('~/lse-data/lse/WIKI_20160909.csv', header=None, names=header_names)

### 1.1 Examining Abnormalities

Need to investigate previous observation that Opening, High, Low, Close prices have minimum of 0.

In [5]:
df[df['Open'] == 0]

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
1047193,ARWR,2002-10-11,0.0,0.00,0.0,0.00,65000.0,0.0,1.0,0.0,0.00,0.0,0.000000,100.000000
1047194,ARWR,2002-10-14,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047195,ARWR,2002-10-15,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047196,ARWR,2002-10-16,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047197,ARWR,2002-10-17,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047198,ARWR,2002-10-18,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047199,ARWR,2002-10-21,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
1047200,ARWR,2002-10-22,0.0,0.00,0.0,0.00,0.0,0.0,1.0,0.0,0.00,0.0,0.000000,0.000000
7608936,LFVN,2003-02-21,0.0,0.01,0.0,0.01,27200.0,0.0,1.0,0.0,4.76,0.0,4.760000,57.142857
7608983,LFVN,2003-04-30,0.0,0.00,0.0,0.00,6800.0,0.0,1.0,0.0,0.00,0.0,0.000000,14.285714


### 1.2 Feature Engineering

#### 1.2.1 Measures of variation

In [6]:
# Create additional features
# These features are not used in the current model but are nice for visualisations
df.loc[:,'Daily Variation'] = df.loc[:,'High'] - df.loc[:,'Low']
df.loc[:,'Percentage Variation'] = df.loc[:,'Daily Variation'] / df.loc[:,'Open'] * 100
df.loc[:,'Adj. Daily Variation'] = df.loc[:,'Adj. High'] - df.loc[:,'Adj. Low']
df.loc[:,'Adj. Percentage Variation'] = df.loc[:,'Adj. Daily Variation'] / df.loc[:,'Adj. Open'] * 100

#### 1.2.2 Extracting specific stocks
#### 1.2.2.1 BP

In [7]:
# Extract BP data
bp = df[df['Symbol'] == 'BP']
bp.head()

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
1923099,BP,1977-01-03,76.5,77.62,76.5,77.62,12400.0,0.0,1.0,1.990787,2.019933,1.990787,2.019933,198400.0,1.12,1.464052,0.029146,1.464052
1923100,BP,1977-01-04,77.62,78.0,76.75,77.0,19300.0,0.0,1.0,2.019933,2.029822,1.997292,2.003798,308800.0,1.25,1.61041,0.032529,1.61041
1923101,BP,1977-01-05,77.0,77.0,74.5,74.5,17900.0,0.0,1.0,2.003798,2.003798,1.93874,1.93874,286400.0,2.5,3.246753,0.065058,3.246753
1923102,BP,1977-01-06,74.5,75.5,74.5,75.12,23900.0,0.0,1.0,1.93874,1.964763,1.93874,1.954874,382400.0,1.0,1.342282,0.026023,1.342282
1923103,BP,1977-01-07,75.12,75.38,74.62,75.12,41700.0,0.0,1.0,1.954874,1.96164,1.941863,1.954874,667200.0,0.76,1.011715,0.019778,1.011715


#### 1.2.2.2 Oil Stocks

Found using the LSE stocks list (supplementary data source).

In [8]:
# See which stocks are in our dataset:
oil_stocks = ["SNP", "GAIA", "GAID", "GAZ", "81JK", "OGZD", "GDG", "98LQ", "HLPD", 
              "LKOE", "LKOD", "LKOH", "MOLD", "MNMD", "05IS", "40XT", "ROSN",
             "RDSA", "RDSB", "SAC", "SGGD", "ATAD"]
oil_stocks_in_df = []
for stock in oil_stocks:
    in_df = False
    if not df[df['Symbol'] == stock].empty:
        in_df = True
        oil_stocks_in_df.append(stock)
print "Oil stocks in DF: ", oil_stocks_in_df

Oil stocks in DF:  ['GAIA']


In [9]:
# Extract GAIA data
gaia = df[df['Symbol'] == 'GAIA']
gaia.head()
# GAIA data is available from 1999-10-29 to 2016-09-09.

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
5391755,GAIA,1999-10-29,5.5,8.62,5.38,6.38,895000.0,0.0,1.0,5.303154,8.311489,5.187449,6.151659,895000.0,3.24,58.909091,3.12404,58.909091
5391756,GAIA,1999-11-01,6.62,6.94,6.5,6.88,144900.0,0.0,1.0,6.383069,6.691617,6.267364,6.633764,144900.0,0.44,6.646526,0.424252,6.646526
5391757,GAIA,1999-11-02,6.91,6.94,6.5,6.62,158000.0,0.0,1.0,6.66269,6.691617,6.267364,6.383069,158000.0,0.44,6.367583,0.424252,6.367583
5391758,GAIA,1999-11-03,6.56,6.75,6.56,6.62,54500.0,0.0,1.0,6.325217,6.508417,6.325217,6.383069,54500.0,0.19,2.896341,0.1832,2.896341
5391759,GAIA,1999-11-04,6.62,6.69,6.56,6.56,21000.0,0.0,1.0,6.383069,6.450564,6.325217,6.325217,21000.0,0.13,1.963746,0.125347,1.963746


In [10]:
# Check index of row where BP and GAIA data start intersecting 
# i.e. date = 1999-10-29
bp.loc[bp['Date'] == '1999-10-29']

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation
1928868,BP,1999-10-29,57.5,58.12,57.38,57.75,2688800.0,0.0,1.0,28.106849,28.409914,28.048192,28.229053,2688800.0,0.74,1.286957,0.361723,1.286957


In [11]:
# Add GAIA figures to BP dataframe

# GAIA data starts on 1999-10-29

# Label for the BP row with date 1999-10-29
bp_gaia_start = 1928868
# Label for the GAIA row with date 1999-10-29
gaia_start = 5391755

data_to_copy = ['Date', 'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close']

bp_gaia_intersect_length = 3753

for i in range(bp_gaia_intersect_length):
    for col in data_to_copy:
        bp.loc[bp_gaia_start+i,'GAIA %s' % str(col)] = gaia.loc[gaia_start+i,'%s' % str(col)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


#### 1.2.2.3 FTSE 100:

Source: Scraped from Google Finance.

In [12]:
# Read in FTSE100 data
ftse100_csv = pd.read_csv("ftse100-figures.csv")

# Preview data
ftse100_csv.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2016-09-09,6858.7,6862.38,6762.3,6776.95
1,2016-09-08,6846.58,6889.64,6819.82,6858.7
2,2016-09-07,6826.05,6856.12,6814.87,6846.58
3,2016-09-06,6879.42,6887.92,6818.96,6826.05
4,2016-09-05,6894.6,6910.66,6867.08,6879.42


In [13]:
# Sort FTSE100 data by date (ascending) to fit with LSE stock data

# Date range from 1984-04-02 to 2016-09-09
sorted_ftse100 = ftse100_csv.sort_values(by='Date')
sorted_ftse100.head()

Unnamed: 0,Date,Open,High,Low,Close
8187,1984-04-02,1108.1,1108.1,1108.1,1108.1
8186,1984-04-03,1095.4,1095.4,1095.4,1095.4
8185,1984-04-04,1095.4,1095.4,1095.4,1095.4
8184,1984-04-05,1102.2,1102.2,1102.2,1102.2
8183,1984-04-06,1096.3,1096.3,1096.3,1096.3


In [14]:
# Check index of row where BP and FTSE data start intersecting 
# i.e. date = 1984-04-02
bp[bp['Date'] == '1984-04-02']

Unnamed: 0,Symbol,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,...,Adj. Volume,Daily Variation,Percentage Variation,Adj. Daily Variation,Adj. Percentage Variation,GAIA Date,GAIA Adj. Open,GAIA Adj. High,GAIA Adj. Low,GAIA Adj. Close
1924931,BP,1984-04-02,45.62,46.38,45.5,46.0,209700.0,0.0,1.0,4.748742,...,838800.0,0.88,1.928979,0.091602,1.928979,,,,,


In [15]:
# Adds FTSE data to BP dataframe, joining at dates

# FTSE columns we want to copy to BP dataframe
ftse_data_to_copy = ['Date', 'Open', 'High', 'Low', 'Close']    

# FTSE data starts on 1984-04-02

# Label for the BP row with date 1984-04-02
bp_ftse_start = 1924931
# Label for the FTSE row with date 1984-04-02
ftse_start = 8187

bp_counter = 0
ftse_counter = 0
while ftse_counter < len(sorted_ftse100):
    bp_date = bp.loc[bp_ftse_start + bp_counter, 'Date']
    ftse_date = sorted_ftse100.loc[ftse_start - ftse_counter, 'Date']
    if bp_date == ftse_date:
        # Add FTSE data to BP row
        for col in ftse_data_to_copy:
            bp.loc[bp_ftse_start + bp_counter, 'FTSE %s' % str(col)] = sorted_ftse100.loc[ftse_start - ftse_counter,'%s' % str(col)]
        # FTSE counter + 1, BP counter + 1
        bp_counter += 1
        ftse_counter += 1
    elif bp_date < ftse_date:
        # Move to next BP row, same FTSE row and repeat
        bp_counter += 1
    elif bp_date > ftse_date:
        # Move to next FTSE row, same BP row and repeat
        ftse_counter += 1
    else:
        print "Error: BP date is ", bp_date, "; FTSE date is ", ftse_date
        # FTSE row + 1, BP row + 1
        bp_counter += 1
        ftse_counter += 1

In [16]:
# Count and display NaNs in FTSE data 
# i.e. dates where we have BP but not FTSE data
nan_counter = 0
for row in range(len(bp.loc[bp_ftse_start:])):
    if pd.isnull(bp.loc[bp_ftse_start+row, 'FTSE Date']):
        print bp.loc[bp_ftse_start+row, 'Date']
        nan_counter += 1
print "NaNs: ", nan_counter

1984-04-27
1984-05-02
1984-05-07
1984-05-29
1984-08-27
1984-12-26
1985-04-08
1985-05-06
1985-08-26
1985-12-26
1986-03-31
1986-05-05
1986-08-25
1986-12-26
1987-04-20
1987-05-04
1987-08-31
1987-12-28
1988-04-04
1988-05-02
1988-08-29
1988-12-27
1989-03-27
1989-05-01
1989-08-28
1989-12-26
1990-04-16
1990-05-07
1990-08-27
1990-12-26
1991-04-01
1991-05-06
1991-08-26
1991-12-26
1992-04-20
1992-05-04
1992-08-31
1992-12-28
1993-04-12
1993-05-03
1993-08-30
1993-12-27
1993-12-28
1994-01-03
1994-04-04
1994-05-02
1994-08-29
1994-12-27
1995-04-17
1995-05-08
1995-08-28
1995-12-26
1996-04-08
1996-05-06
1996-08-26
1996-12-26
1997-03-31
1997-05-05
1997-08-25
1997-12-26
1998-04-13
1998-05-04
1998-08-31
1998-12-28
1998-12-31
1999-04-05
1999-05-03
1999-08-30
1999-12-27
1999-12-28
1999-12-31
2000-01-03
2000-04-24
2000-05-01
2000-08-28
2000-12-26
2001-04-16
2001-05-07
2001-08-27
2001-12-26
2002-04-01
2002-05-06
2002-06-03
2002-06-04
2002-08-26
2002-12-26
2003-04-21
2003-05-05
2003-08-25
2003-12-26
2004-04-12

In [17]:
# Proxy remaining FTSE NaNs by taking the mean of the prices in the 
# two closest trading days where data is available 
# (one before, one after the day)
ftse_data_to_average = ['Open', 'High', 'Low', 'Close']    
for row in range(len(bp.loc[bp_ftse_start:])):
    if pd.isnull(bp.loc[bp_ftse_start+row, 'FTSE Date']):
        if not (pd.isnull(bp.loc[bp_ftse_start+row-1, 'FTSE Date']) or pd.isnull(bp.loc[bp_ftse_start+row+1, 'FTSE Date'])):
            for col in ftse_data_to_average:
                bp.loc[bp_ftse_start+row,'FTSE %s' % str(col)] = np.mean([float(bp.loc[bp_ftse_start+row-1,'FTSE %s' % str(col)]), float(bp.loc[bp_ftse_start+row+1,'FTSE %s' % str(col)])])
            bp.loc[bp_ftse_start+row,'FTSE Date'] = bp.loc[bp_ftse_start+row, 'Date']
        else:
            go_back = 0
            go_forward = 0
            while pd.isnull(bp.loc[bp_ftse_start+row-1-go_back, 'FTSE Date']):
                go_back += 1
            while pd.isnull(bp.loc[bp_ftse_start+row+1+go_forward, 'FTSE Date']):
                go_forward += 1
            for col in ftse_data_to_average:
                    bp.loc[bp_ftse_start+row,'FTSE %s' % str(col)] = np.mean([float(bp.loc[bp_ftse_start+row-1-go_back,'FTSE %s' % str(col)]), float(bp.loc[bp_ftse_start+row+1+go_forward,'FTSE %s' % str(col)])])
            bp.loc[bp_ftse_start+row,'FTSE Date'] = bp.loc[bp_ftse_start+row, 'Date']

In [19]:
# Check there are no more NaNs
nan_counter = 0
for row in range(len(bp.loc[bp_ftse_start:])):
    if pd.isnull(bp.loc[bp_ftse_start+row, 'FTSE Date']):
        print bp.loc[bp_ftse_start+row, 'Date']
        nan_counter += 1
print "NaNs: ", nan_counter

NaNs:  0


## 2. Implementation

### 2.1 Build training and test sets

In [20]:
def prepare_train_test(days, periods, target='Adj. Close', test_size=0.2, buffer=0, target_days=7):  
    """Returns X_train, X_test, y_train, y_test for parameters.
    Predicts prices `target_days` ahead.
    `days` = number of days prior we consider"""
    # Columns
    columns = []
    for j in range(1,days+1):
        columns.append('i-%s' % str(j))
    columns.append('Adj. High')
    columns.append('Adj. Low')

    # Columns: Prices (predict multiple day)
    nday_columns = []
    for j in range(1,target_days+1):
        nday_columns.append('Day %s' % str(j-1))

    # Index
    start_date = bp.iloc[days+buffer]["Date"]
    index = pd.date_range(start_date, periods=periods, freq='D')

    # Create empty dataframes for features and prices
    features = pd.DataFrame(index=index, columns=columns)
    prices = pd.DataFrame(index=index, columns=["Target"])
    nday_prices = pd.DataFrame(index=index, columns=nday_columns)

    # Prepare test and training sets
    for i in range(periods):
        # Fill in Target df
        for j in range(target_days):
            nday_prices.iloc[i]['Day %s' % str(j)] = bp.iloc[buffer+i+days+j][target]
        # Fill in Features df
        for j in range(days):
            features.iloc[i]['i-%s' % str(days-j)] = bp.iloc[buffer+i+j][target]
        features.iloc[i]['Adj. High'] = max(bp[buffer+i:buffer+i+days]['Adj. High'])
        features.iloc[i]['Adj. Low'] = min(bp[buffer+i:buffer+i+days]['Adj. Low'])
                
    X = features
    y = nday_prices
    print "X.tail: ", X.tail()

    # Train-test split
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, X_test, y_train, y_test

In [21]:
# Initialise variables to prevent errors
X_train = []
X_test = []
y_train = []
y_test = []

### 2.2 Classifier

In [22]:
# Import MultiOutputRegressor to handle predicting multiple outputs
from sklearn.multioutput import MultiOutputRegressor

# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error

ImportError: No module named multioutput

In [None]:
# Helper functions for metrics
def rmsp(test, pred):
    return np.sqrt(np.mean(((test - pred)/test)**2)) * 100

def print_metrics(test, pred):
    print "Root Mean Squared Percentage Error", rmsp(test, pred)
    print "Mean Absolute Error: ", mean_absolute_error(test, pred)
    print "Explained Variance Score: ", explained_variance_score(test, pred)
    print "Mean Squared Error: ", mean_squared_error(test, pred)
    print "R2 score: ", r2_score(test, pred))

In [None]:
# Import Classifiers
from sklearn import svm
from sklearn.linear_model import LinearRegression

In [None]:
# Initialise variables to prevent errors
days = 7

In [None]:
# Apply Classifier and Print Metrics
def classify_and_metrics(clf=LinearRegression(), target_days=7, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, days=days):
    """Trains and tests classifier on training and test datasets.
    Prints performance metrics.
    """
    # Classify and predict
    clf = MultiOutputRegressor(clf)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # Lines below for debugging purposes
#    print "X_train.head(): ", X_train.head()
#    print "X_train.tail(): ", X_train.tail()
#    print "Pred: ", pred[:5]
#    print "Test: ", y_test[:5]
    
    # Print metrics
    print "# Days used to predict: %s" % str(days)
    print "\n%s-day predictions" % str(target_days) 
    print_metrics(y_test, pred)
    return rmsp(y_test, pred)

In [None]:
# Do multiple train-test cycles on different train-test sets and see
# if they all produce reliable results
def execute(steps=8, buffer_step=1000, days=7, periods=1000, model=LinearRegression(), predict_days=7):
    """Performs `steps` train-test cycles and prints evaluation metrics for BP data.
    `steps`: number of train-test cycles.
    `periods`: the total number of datapoints used in each cycle (training + test)
    `buffer_step`: number of datapoints between the starting points of each
    consecutive train-test cycle
    """
    errors=[]
    r2=[]
    for segment in range(steps):
        buffer = segment*buffer_step
        print "Buffer: ", buffer
        X_train, X_test, y_train, y_test = prepare_train_test(days=days, periods=periods, buffer=buffer)
        errors.append(classify_and_metrics(clf=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, days=days))
    print "Errors: ", errors
    
    daily_error = []
    for target_day in range(predict_days):
        daily_error.append([])
    for segment in range(steps):
        for target_day in range(predict_days):
            daily_error[target_day].append(errors[segment][target_day])
    print "Daily error: ", daily_error
    average_daily_error = []
    for day in daily_error:
        average_daily_error.append(np.mean(day))
    print "Mean daily error: ", average_daily_error

In [None]:
# svm.SVR() trial
execute(model=svm.SVR(), steps=8)

In [None]:
# Linear Regression trial
execute(steps=8)

# R2 scores: [0.859, 0.791, 0.606, 0.936, 0.835, 0.871, 0.623, 0.936]

## 3. Refinement



### 3.1 Tuning model parameters

No change in performance.

### 3.2 Feature Selection

### 3.2.1 Adding more of the same type of features

In [None]:
# Considering more than 7 days' worth of prior data
# 10 days' worth of prior data
execute(steps=10, days=10, buffer_step = 700)

# Mean daily error:  [1.7321477061307597, 2.5432152188018913, 3.1383346165356416, 3.5793927574194155, 3.9394427230724309, 4.2692644737508925, 4.5432050435026108]

In [None]:
# Consider 14 days' worth of prior data
execute(steps=15, days=14, buffer_step = 500)

# Mean daily error:  [1.7285404855953252, 2.5255007498628097, 3.1026280963920607, 3.5862999911658147, 4.0020669863612239, 4.3722863441980762, 4.701971393685997]

In [None]:
# Consider 21 days' worth of prior data
execute(steps=15, days=21, buffer_step = 500)

# Mean daily error:  [1.7458324393865607, 2.5550697635040556, 3.1130306876040765, 3.5859111257648624, 3.9906346379964006, 4.3416348748811986, 4.6578080578960108]

In [None]:
# Consider 30 days' worth of prior data

execute(steps=15, days=30, buffer_step = 500)

# Mean daily error:  [1.7839163888017815, 2.593162562286222, 3.1521417303676622, 3.6325948299484372, 4.0479378120671301, 4.3916975345657692, 4.7046907424412074]

In [None]:
# Consider 100 days' worth of prior data

execute(steps=15, days=100, buffer_step = 500)

# Mean daily error:  [1.9238550915564432, 2.7676076433106056, 3.3695076303415705, 3.8902423145616098, 4.3550552824867319, 4.7687380251335467, 5.1629268283684322]

### 3.2.2 Adding Oil Stock Prices (GAIA)

In [None]:
# Create dataframe with BP and GAIA data in overlapping date range
# Date range: 1999-10-29 to 2014-09-30
# `bp_gaia_start` etc defined in Feature Engineering section 1.2.2.2
bp_gaia = bp.loc[bp_gaia_start:bp_gaia_start+bp_gaia_intersect_length-1]

# Check it ends at the right date
bp_gaia.tail()

In [None]:
len(bp_gaia)

In [None]:
# Modify `prepare_train_test` function to add GAIA data.

# Potential improvement: Generalise `prepare_train_test` function instead
# of copy and pasting it and making a new function.
def prepare_train_test_with_gaia(days, periods, target='Adj. Close', test_size=0.2, buffer=0, target_days=7, df=bp_gaia):  
    """Returns X_train, X_test, y_train, y_test for parameters.
    Predicts prices `target_days` ahead.
    `days`: the number of days prior we consider (the prices of)
    `periods`: the total number of datapoints used (training + test)
    """
    # Columns
    # BP cols
    columns = []
    for j in range(1,days+1):
        columns.append('i-%s' % str(j))
    columns.append('Adj. High')
    columns.append('Adj. Low')
    # GAIA cols
    for j in range(1,days+1):
        columns.append('GAIA i-%s' % str(j))
    columns.append('GAIA Adj. High')
    columns.append('GAIA Adj. Low')

    # Columns: Prices (predict multiple day)
    nday_columns = []
    for j in range(1,target_days+1):
        nday_columns.append('Day %s' % str(j-1))

    # Index
    start_date = df.iloc[days+buffer]["Date"]
    index = pd.date_range(start_date, periods=periods, freq='D')

    # Create empty dataframes for features and prices
    features = pd.DataFrame(index=index, columns=columns)
    prices = pd.DataFrame(index=index, columns=["Target"])
    nday_prices = pd.DataFrame(index=index, columns=nday_columns)

    # Prepare test and training sets
    for i in range(periods):
        # Fill in Target df
        for j in range(target_days):
            nday_prices.iloc[i]['Day %s' % str(j)] = df.iloc[buffer+i+days+j][target]
        # Fill in Features df
        for j in range(days):
            features.iloc[i]['i-%s' % str(days-j)] = df.iloc[buffer+i+j][target]
        features.iloc[i]['Adj. High'] = max(df[buffer+i:buffer+i+days]['Adj. High'])
        features.iloc[i]['Adj. Low'] = min(df[buffer+i:buffer+i+days]['Adj. Low'])
        for j in range(days):
            features.iloc[i]['GAIA i-%s' % str(days-j)] = df.iloc[buffer+i+j]['GAIA %s' % str(target)]
        features.iloc[i]['GAIA Adj. High'] = max(df[buffer+i:buffer+i+days]['GAIA Adj. High'])
        features.iloc[i]['GAIA Adj. Low'] = min(df[buffer+i:buffer+i+days]['GAIA Adj. Low'])
                
    X = features
    y = nday_prices

    # Train-test split
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, X_test, y_train, y_test

In [None]:
def execute_with_gaia(steps=8, buffer_step=200, days=7, periods=1000, model=LinearRegression(), predict_days=7):
    """Performs `steps` train-test cycles and prints evaluation metrics for BP + GAIA data.
    `steps`: number of train-test cycles.
    `periods`: the total number of datapoints used in each cycle (training + test)
    `buffer_step`: number of datapoints between the starting points of each
    consecutive train-test cycle
    """
    errors=[]
    r2=[]
    for segment in range(steps):
        buffer = segment*buffer_step
        print "Buffer: ", buffer
        X_train, X_test, y_train, y_test = prepare_train_test_with_gaia(days=days, periods=periods, buffer=buffer, df=bp_gaia)
        errors.append(classify_and_metrics(clf=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, days=days))
    print "Errors: ", errors
    
    daily_error = []
    for target_day in range(predict_days):
        daily_error.append([])
    for segment in range(steps):
        for target_day in range(predict_days):
            daily_error[target_day].append(errors[segment][target_day])
    print "Daily error: ", daily_error
    average_daily_error = []
    for day in daily_error:
        average_daily_error.append(np.mean(day))
    print "Mean daily error: ", average_daily_error

In [None]:
# Consider 7 days' worth of BP and GAIA data
execute_with_gaia(steps=13)

In [None]:
# Consider 10 days' worth of BP and GAIA data
execute_with_gaia(days=10, steps=13)

### 3.2.3 TODO: Adding FTSE100

In [None]:
# Create df with BP and FTSE data
bp_ftse = bp.loc[bp_ftse_start:]
bp_ftse.head()

In [None]:
# Modify `prepare_train_test` function to add FTSE data.
def prepare_train_test_with_ftse(days, periods, target='Adj. Close', test_size=0.2, buffer=0, target_days=7, df=bp_ftse, name='FTSE'):  
    """Returns X_train, X_test, y_train, y_test for parameters.
    Predicts prices `target_days` ahead.
    `days` = number of days prior we consider"""
    # Columns
    # BP cols
    columns = []
    for j in range(1,days+1):
        columns.append('i-%s' % str(j))
    columns.append('Adj. High')
    columns.append('Adj. Low')
    # FTSE cols
    for j in range(1,days+1):
        columns.append('%s i-%s' % (name, str(j)))
    columns.append('%s High' % name)
    columns.append('%s Low' % name)

    # Columns: Prices (predict multiple day)
    nday_columns = []
    for j in range(1,target_days+1):
        nday_columns.append('Day %s' % str(j-1))

    # Index
    start_date = df.iloc[days+buffer]["Date"]
    index = pd.date_range(start_date, periods=periods, freq='D')

    # Create empty dataframes for features and prices
    features = pd.DataFrame(index=index, columns=columns)
    prices = pd.DataFrame(index=index, columns=["Target"])
    nday_prices = pd.DataFrame(index=index, columns=nday_columns)

    # Prepare test and training sets
    for i in range(periods):
        # Fill in Target df
        for j in range(target_days):
            nday_prices.iloc[i]['Day %s' % str(j)] = df.iloc[buffer+i+days+j][target]
        # Fill in Features df
        for j in range(days):
            features.iloc[i]['i-%s' % str(days-j)] = df.iloc[buffer+i+j][target]
        features.iloc[i]['Adj. High'] = max(df[buffer+i:buffer+i+days]['Adj. High'])
        features.iloc[i]['Adj. Low'] = min(df[buffer+i:buffer+i+days]['Adj. Low'])
        for j in range(days):
            features.iloc[i]['%s i-%s' % (name, str(days-j))] = df.iloc[buffer+i+j]['%s %s' % (name, 'Close')]
        features.iloc[i]['%s High' % name] = max(df[buffer+i:buffer+i+days]['%s High' % name])
        features.iloc[i]['%s Low' % name] = min(df[buffer+i:buffer+i+days]['%s Low' % name])
                
    X = features
    y = nday_prices

    # Train-test split
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, X_test, y_train, y_test

In [None]:
def execute_with_ftse(steps=8, buffer_step=200, days=7, periods=1000, model=LinearRegression(), predict_days=7):
    """Performs `steps` train-test cycles and prints evaluation metrics for BP + FTSE data.
    `steps`: number of train-test cycles.
    `periods`: the total number of datapoints used in each cycle (training + test)
    `buffer_step`: number of datapoints between the starting points of each
    consecutive train-test cycle
    """
    errors=[]
    r2=[]
    for segment in range(steps):
        buffer = segment*buffer_step
        print "Buffer: ", buffer
        X_train, X_test, y_train, y_test = prepare_train_test_with_ftse(days=days, periods=periods, buffer=buffer, df=bp_ftse)
        errors.append(classify_and_metrics(clf=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, days=days))
    print "Errors: ", errors
    
    daily_error = []
    for target_day in range(predict_days):
        daily_error.append([])
    for segment in range(steps):
        for target_day in range(predict_days):
            daily_error[target_day].append(errors[segment][target_day])
    print "Daily error: ", daily_error
    average_daily_error = []
    for day in daily_error:
        average_daily_error.append(np.mean(day))
    print "Mean daily error: ", average_daily_error

In [None]:
# Consider 7 days' worth of prior BP and FTSE data
execute_with_ftse(days=7, steps=15, buffer_step=450)

In [None]:
# Consider 10 days' worth of prior BP and FTSE data
execute_with_ftse(days=10, steps=15, buffer_step=450)

# Conclusion: Free-Form Visualisation

In [None]:
# We want an array with predictions for our model in a long date range.
# We will consider the max error predictions, that is,
# predictions of adjusted close prices 7 days ahead.

# Initialise variable
predictions_800_off = []

In [None]:
def predict(clf=LinearRegression(), target_days=7, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, days=days):
    """Trains and tests classifier on training and test datasets.
    Append predictions to `predictions_800_off`.
    """
    # Classify and predict
    clf = MultiOutputRegressor(clf)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print "Pred: ", pred
    predictions_800_off.append(pred)

In [None]:
# Pared-down execute function that runs train-test cycles and 
# appends the predictions to `predictions_800_off` via the function `predict()`.
def execute_viz(steps=8, buffer_step=200, days=7, periods=1000, model=LinearRegression(), predict_days=7):
    """Performs `steps` train-test cycles and prints evaluation metrics for BP + FTSE data.
    `steps`: number of train-test cycles.
    `periods`: the total number of datapoints used in each cycle (training + test)
    `buffer_step`: number of datapoints between the starting points of each
    consecutive train-test cycle
    """
    for segment in range(steps):
        buffer = segment*buffer_step
        print "Buffer: ", buffer
        X_train, X_test, y_train, y_test = prepare_train_test_with_ftse(days=days, periods=periods, buffer=buffer, df=bp_ftse)
        predict(clf=model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, days=days)

In [None]:
# Extract predictions. 
# `execute_viz` function appends predictions to `predictions_800_off`.
execute_viz(steps=35)
predictions_800_off

In [None]:
# Put all 7-days-ahead predictions into an array
predictions_800_7thday = []
for array in predictions_800_off:
    for week_prediction in array:
        predictions_800_7thday.append(week_prediction[6]) 
print len(predictions_800_7thday)
predictions_800_7thday

In [None]:
# Prepare dataframe for visualisation
# There are 7000 predictions
bp_final_predictions = bp_ftse[800+6:806+7000]
bp_final_predictions.loc[:,'7d Ahead Pred'] = predictions_800_7thday

In [None]:
# Plotting predictions compared with actual adjusted close prices
bp_final_predictions.plot(y=['Adj. Close','7d Ahead Pred'], x='Date').set_title("Model Predictions against BP Actual Adjusted Close Prices")

In [None]:
# Plotting predictions compared with actual prices
# Only first 200 predictions
bp_preds_200 = bp_final_predictions[:200]
bp_preds_200.plot(y=['Adj. Close','7d Ahead Pred'], x='Date', y='Price (£)').set_title("Model Predictions against BP Actual Adjusted Close Prices")