In [71]:
import pandas as pd 
import numpy as np
import keras
import math 
import random 

from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from lib.Trend import TrendData, extract_trend

In [72]:
# PREPROCESSING
def scale_col_values(
    df: pd.DataFrame,
    col_name: str,
    min_value: float = 0,
    max_value: float = 1,
):
    """Min-max scale a single column of ``df`` in place.

    The column is fitted and transformed with a ``MinMaxScaler`` configured
    with ``feature_range=(min_value, max_value)``.

    Args:
        df: frame holding the column; it is modified in place.
        col_name: name of the column to rescale.
        min_value: lower end of the target range.
        max_value: upper end of the target range.

    Returns:
        The same ``df`` object, with ``col_name`` overwritten by the scaled values.
    """
    range_scaler = MinMaxScaler(feature_range=(min_value, max_value))

    # The scaler is given a 2-D array, so lift the column to shape (n, 1) ...
    as_single_feature = df[col_name].values.reshape(-1, 1)
    rescaled = range_scaler.fit_transform(as_single_feature)

    # ... and flatten the single output column back to 1-D before storing it.
    df[col_name] = rescaled[:, 0]
    return df

def squash_col_outliers(
    df: pd.DataFrame,
    col_name: str,
    min_quantile: float = 0.01,
    max_quantile: float = 0.99,
):
    """Clamp the extreme values of one column to its quantile bounds, in place.

    Values at or above the ``max_quantile`` quantile are replaced by that
    quantile, and values at or below the ``min_quantile`` quantile are replaced
    by that quantile. Both quantiles are computed before anything is modified.

    Args:
        df: frame holding the column; it is modified in place.
        col_name: name of the column to clamp.
        min_quantile: quantile (0..1) used as the lower bound.
        max_quantile: quantile (0..1) used as the upper bound.

    Returns:
        The same ``df`` object.
    """
    lower_bound, upper_bound = (
        df[col_name].quantile(q) for q in (min_quantile, max_quantile)
    )

    # Upper tail first, then the lower tail (evaluated on the updated column).
    too_high = df[col_name].ge(upper_bound)
    df.loc[too_high, col_name] = upper_bound

    too_low = df[col_name].le(lower_bound)
    df.loc[too_low, col_name] = lower_bound
    return df

# Load the price history; the first CSV column becomes the index.
df = pd.read_csv("data/prices-d.csv", index_col=0)

# Engineered features: the high-low range relative to the open, and the
# fractional change in adjusted close from the previous row.
df["Range"] = (df["High"] - df["Low"]) / df["Open"]
df["Change"] = df["Adj Close"].pct_change()

# Discard the raw columns that are no longer needed and skip the first row,
# whose Change is NaN because it has no predecessor.
df = df.drop(columns=["Volume", "Open", "High", "Low", "Close"]).iloc[1:]

# Rebuild the frame from its raw values so it gets a fresh 0-based index.
# NOTE(review): labels are assigned by position, so this assumes exactly
# 'Adj Close', 'Range', 'Change' remain at this point, in that order.
df = pd.DataFrame(df.to_numpy(), columns=['Adj Close', 'Range', 'Change'])

# Clamp outliers first, then rescale both features to [0, 1].
df = squash_col_outliers(df, 'Change')
df = squash_col_outliers(df, "Range", min_quantile=0.0, max_quantile=0.97)
df = scale_col_values(df, 'Change')
df = scale_col_values(df, 'Range')

# extract_trend / as_boolean come from lib.Trend, which is not shown here.
# TODO confirm the meaning of the 100 argument against that module.
trend = extract_trend(df['Adj Close'], 100)
df['Trend'] = trend.as_boolean(df['Adj Close'][0])

# The adjusted close was only needed to derive Change and Trend.
df = df.drop(columns="Adj Close")
df.head()

Unnamed: 0,Range,Change,Trend
0,1.0,0.437608,0.0
1,1.0,0.057881,0.5
2,1.0,0.0,0.5
3,1.0,0.0,0.5
4,0.989125,0.353968,1.0


In [None]:
# VI: Shaping the Data for LSTM Input

In [73]:
'''
Finally, we have 3 columns (or features): Range, Change, and Trend. 
Let's pretend that Change is what we want the model to predict. 

The input for a keras LSTM requires a three dimensional array with the shape: 
(s, t, f) 

s = samples: the number of samples in the data set (i.e. the number of rows of data) 
t = timesteps: the number of timesteps to be input for each sample (also sometimes called the 'lag')
f = features: the number of distinct features to be considered; in this case, 3 (Range, Change, Trend)

An LSTM can predict multiple output features, and can do so with a variable offset and width. But just to 
keep things simple, we'll assume for this example that the output offset is 1, the LSTM will predict only 
one output feature (Change), and it will predict for only one timestep: the next day's Change. 

Note also that the output feature need not be one of the input features as well. In this case, Change is 
present in both the input and the output. 

X represents the input values. 
y represents the predicted or expected values. 

X: Range(t[-10:0]), Change(t[-10:0]), Trend(t[-10:0])
y: Change (t+1)

Steps:
1. Extract the 'y' values, or the values to be predicted. This is supervised learning, so these are all 
the 'correct' answers for training. 

2. Window the appropriate number of timesteps for each input 

3. Add one example of each feature, to each window 

Because the LSTM keeps a memory of more recent inputs, data is fed into it in a forward walking window the size
of a predetermined number of timesteps. Each discrete input contains multiple overlapping windows, and each 
window contains one example of each feature. It's easier to explain with an example: 

The raw input data has 9 rows of 2 features each: f1, f2. It looks like this: 
'''
# Toy frame for the walkthrough: 9 rows (r0..r8) by 2 features (f1, f2),
# where every cell is labelled 'r<row><feature>' so it can be traced later.
_df1 = pd.DataFrame({
    feature: [f'r{row}{feature}' for row in range(9)]
    for feature in ('f1', 'f2')
})
_df1.head(9)

Unnamed: 0,f1,f2
0,r0f1,r0f2
1,r1f1,r1f2
2,r2f1,r2f2
3,r3f1,r3f2
4,r4f1,r4f2
5,r5f1,r5f2
6,r6f1,r6f2
7,r7f1,r7f2
8,r8f1,r8f2


In [74]:
'''
So the outermost dimension of the 3-dimensional input array will have 9 elements, one per row. Each of those 
elements will be an array, so let's create this to begin with: 

[ [] [] [] [] [] [] [] [] [] ]

It's an array containing 9 empty arrays. 

How many timesteps? Let's say 3. So each of those empty arrays will have 3 arrays inside of them. Each of 
those innermost arrays will contain the 2 features. 

To simplify, first create an array of 3-element arrays, where each element of the inner array represents 
one row. Since this is daily data, we'll call row 0 d0, row 1 is d1, and so on. 

'''
# One 3-day window per row of data; blank cells mark days before d0,
# for which no history exists.
_x2 = [
    ['  ', '  ', 'd0'],
    ['  ', 'd0', 'd1'],
    ['d0', 'd1', 'd2'],
    ['d1', 'd2', 'd3'],
    ['d2', 'd3', 'd4'],
    ['d3', 'd4', 'd5'],
    ['d4', 'd5', 'd6'],
    ['d5', 'd6', 'd7'],
    ['d6', 'd7', 'd8'],
]
# Target for each window: the day after it. Left blank for the two incomplete
# windows and for the last window, which has no following day.
_y = [
    '', '',
    'd3', 'd4', 'd5', 'd6', 'd7', 'd8',
    '',
]
def printxy(X, y): 
    """Print the input windows ``X`` side by side with their targets ``y``.

    Each row of ``X`` is rendered as its elements separated by single spaces
    and wrapped in square brackets, followed by the matching entry of ``y``.
    Rows may hold any number of elements, not just three.

    Args:
        X: sequence of windows; each window is itself a sequence.
        y: sequence of targets indexed in step with ``X``; it must have at
            least ``len(X)`` entries.
    """
    print('X:[            y:[')
    for i, window in enumerate(X): 
        # Join every element of the window so the width is not fixed.
        cells = ' '.join(str(cell) for cell in window)
        print(f'   [{cells}]     {y[i]}')
    print(']                ]')
# Show each window next to its target, including the rows with blanks.
printxy(_x2, _y)

X:[            y:[
   [      d0]     
   [   d0 d1]     
   [d0 d1 d2]     d3
   [d1 d2 d3]     d4
   [d2 d3 d4]     d5
   [d3 d4 d5]     d6
   [d4 d5 d6]     d7
   [d5 d6 d7]     d8
   [d6 d7 d8]     
]                ]


In [75]:
'''
There are missing values: the first two records lack the previous data for t-2 and t-1, and for the last 
record it's impossible to make a prediction because we don't have the future y value; this is expected. 
If we remove those missing-data rows then we are left with: 
'''
# Keep only the complete pairs: drop the first two rows (incomplete windows)
# and the last row (no target). Both lists are sliced identically so that
# every window stays aligned with its target.
# Note: this rebinds _x2 and _y, so re-running the cell trims them again;
# re-run the cell that defines them first.
_x2 = _x2[2:-1]
_y = _y[2:-1]
printxy(_x2, _y)


X:[            y:[
   [d0 d1 d2]     d3
   [d1 d2 d3]     d4
   [d2 d3 d4]     d5
   [d3 d4 d5]     d6
   [d4 d5 d6]     d7
   [d5 d6 d7]     d8
]                ]


In [76]:
def extract_X(df: pd.DataFrame, ntimesteps: int): 
    """Build the 3-D (samples, timesteps, features) input array from ``df``.

    Each sample holds ``ntimesteps`` consecutive rows of ``df``, oldest first,
    with every column of ``df`` as a feature. Sample ``i`` covers rows
    ``i .. i + ntimesteps - 1``.

    The first ``ntimesteps`` rows have no complete window behind them and are
    dropped; the final row is dropped as well. The result therefore has
    ``len(df) - ntimesteps - 1`` samples (never fewer than zero), which is the
    same number of entries ``extract_y`` returns for the same arguments.

    Args:
        df: the DataFrame to window; it is not modified.
        ntimesteps: number of timesteps per sample; must be at least 1.

    Returns:
        numpy array of shape ``(samples, ntimesteps, len(df.columns))``. The
        array stays 3-dimensional even when there are zero samples.

    Raises:
        ValueError: if ``ntimesteps`` is less than 1.
    """
    if ntimesteps < 1:
        raise ValueError("ntimesteps must be at least 1")

    nfeatures = len(df.columns)

    # One shifted copy of the frame per timestep, oldest lag first, so that a
    # single row of `combined` reads: [t-ntimesteps features, ..., t-1 features].
    lagged = [df.shift(lag) for lag in range(ntimesteps, 0, -1)]
    combined = pd.concat(lagged, axis=1)

    # Positional slice: skip the leading rows whose windows contain NaNs from
    # the shift, and the trailing row.
    combined = combined.iloc[ntimesteps:-1]

    # Each flat row is already laid out timestep-by-timestep, so one reshape
    # splits it into (timesteps, features) for every sample at once.
    return combined.to_numpy().reshape(len(combined), ntimesteps, nfeatures)

In [77]:
def extract_y(df: pd.DataFrame, col_name: str, ntimesteps: int): 
    """Extract the target values (the column to be predicted) from ``df``.

    The result is ``col_name`` from row ``ntimesteps`` through the
    second-to-last row, i.e. ``len(df) - ntimesteps - 1`` values. Entry ``i``
    is the value of the row that immediately follows sample ``i`` of
    ``extract_X(df, ntimesteps)``.

    Args:
        df: the DataFrame; it is not modified.
        col_name: the name of the column to be predicted.
        ntimesteps: number of timesteps per input sample; must be at least 1.

    Returns:
        1-D numpy array of target values.

    Raises:
        ValueError: if ``ntimesteps`` is less than 1.
        KeyError: if ``col_name`` is not a column of ``df``.
    """
    if ntimesteps < 1:
        raise ValueError("ntimesteps must be at least 1")

    # Row k now holds the value of row k + 1 (the "next day").
    next_values = df[col_name].shift(-1)

    # Positional slice rather than tail(-(ntimesteps - 1)): for ntimesteps == 1
    # that call is tail(0), which returns nothing instead of skipping nothing.
    # The upper bound -2 drops the NaN produced by the shift plus one more row.
    return next_values.iloc[ntimesteps - 1:-2].to_numpy()


In [78]:
'''

Replace each day (row) with an array containing the two features of that day (row). So d0 becomes 
the two-element array [r0f1, r0f2]. 
'''
# Shape the toy frame with 3 timesteps per sample; f2 is the column to predict.
_x1 = extract_X(_df1, ntimesteps=3)
_y1 = extract_y(_df1, col_name='f2', ntimesteps=3)
print(_x1)

[[['r0f1' 'r0f2']
  ['r1f1' 'r1f2']
  ['r2f1' 'r2f2']]

 [['r1f1' 'r1f2']
  ['r2f1' 'r2f2']
  ['r3f1' 'r3f2']]

 [['r2f1' 'r2f2']
  ['r3f1' 'r3f2']
  ['r4f1' 'r4f2']]

 [['r3f1' 'r3f2']
  ['r4f1' 'r4f2']
  ['r5f1' 'r5f2']]

 [['r4f1' 'r4f2']
  ['r5f1' 'r5f2']
  ['r6f1' 'r6f2']]]


In [79]:
'''
The row numbers are ordinal in each column going from top to bottom, and ordinal from left to right. 
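That's the input format. Since the leading rows contain nulls (their windows reach back before the first 
row of data), we'd remove them. In the d0..d8 sketch above that is the first timesteps - 1 = 2 rows, leaving 
rows_out = rows_in - (timesteps - 1); extract_X as written drops the first `ntimesteps` rows instead, because 
each of its windows holds only the days before the row it belongs to. 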

Now the y values are just a scalar array of feature 2 from each row, but shifted back 1. 
'''
# Each printed row is one sample: its 3 timesteps of [f1 f2] pairs, then its target.
printxy(_x1, _y1)


X:[            y:[
   [['r0f1' 'r0f2'] ['r1f1' 'r1f2'] ['r2f1' 'r2f2']]     r3f2
   [['r1f1' 'r1f2'] ['r2f1' 'r2f2'] ['r3f1' 'r3f2']]     r4f2
   [['r2f1' 'r2f2'] ['r3f1' 'r3f2'] ['r4f1' 'r4f2']]     r5f2
   [['r3f1' 'r3f2'] ['r4f1' 'r4f2'] ['r5f1' 'r5f2']]     r6f2
   [['r4f1' 'r4f2'] ['r5f1' 'r5f2'] ['r6f1' 'r6f2']]     r7f2
]                ]


In [80]:

'''
Because the y values are shifted by one, we have to lose one more row from the training data - we don't 
have tomorrow's value, so we need to remove the last row this time. With 9 input rows and 3 timesteps, 
extract_X therefore returns 9 - 3 - 1 = 5 samples: 
'''
# Shape is (samples, timesteps, features).
print(_x1.shape)

(5, 3, 2)


In [69]:
class DataSet:
    """Input windows ``X`` paired with their targets ``y``.

    ``X`` must be 3-dimensional, ``y`` must be 1-dimensional, and both must
    have the same length.
    """

    def __init__(self, X, y): 
        """Validate and store the two arrays.

        Raises:
            ValueError: if ``X`` is not 3-dimensional, ``y`` is not
                1-dimensional, or their lengths differ.
        """
        if X.ndim != 3: 
            raise ValueError("Expected a 3-dimensional array for X")
        if y.ndim != 1: 
            raise ValueError("Expected a 1-dimensional array for y")
        if len(X) != len(y): 
            raise ValueError("Length of X and y must be the same")
        
        self.X = X
        self.y = y
    
    def split(self, pct: float): 
        """Split the trailing ``pct`` fraction off into a new DataSet.

        The last ``int(size * pct)`` samples are removed from this dataset and
        returned as a new one, so the two never share a sample. Taking them
        from the end keeps the samples that remain here ahead of the ones
        split off, in their original order.

        Args:
            pct: fraction of the samples to split off, between 0 and 1.

        Returns:
            A new DataSet holding the split-off samples (possibly empty).

        Raises:
            ValueError: if ``pct`` is outside the range 0..1.
        """
        if not 0 <= pct <= 1:
            raise ValueError("pct must be between 0 and 1")

        count = int(self.size * pct)
        # Slice at an explicit position: `[:-count]` would empty this dataset
        # when count is 0.
        keep = self.size - count

        new_dataset = DataSet(self.X[keep:], self.y[keep:])
        self.X = self.X[:keep]
        self.y = self.y[:keep]
        return new_dataset
        
    @property
    def size(self): 
        """Number of samples currently held."""
        return len(self.X)


In [81]:
'''
Finally we can take all of the scaled, processed, shaped data as a whole and split it into training, 
evaluation, and testing sets with an approximately 70-20-10 split: 
'''
# Shape the preprocessed frame for the LSTM: 10 timesteps per sample, with
# Change as the value to predict. X and y are built here so the cell also
# works on a fresh kernel.
X = extract_X(df, 10)
y = extract_y(df, 'Change', 10)

train = DataSet(X, y)
val = train.split(0.3)   # 30% of all samples leave the training set
test = val.split(0.3)    # 30% of those held-out samples become the test set

In [82]:
# Report how many samples ended up in each split.
for split_name, split_data in (('train', train), ('eval', val), ('test', test)):
    print(f'{split_name} set has {split_data.size} samples')

train set has 2240 samples
eval set has 672 samples
test set has 288 samples


In [83]:

# Every split keeps the (samples, timesteps, features) layout.
print(f'train X shape: {train.X.shape}')
print(f'eval X shape {val.X.shape}')
print(f'test X shape: {test.X.shape}')


train X shape: (2240, 10, 3)
eval X shape (672, 10, 3)
test X shape: (288, 10, 3)


In [None]:
'''
And that's the input shape for a tensorflow LSTM. 
'''