In [65]:
import pandas as pd 
import numpy as np
import keras
import math 
import random 

from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from lib.Trend import TrendData, extract_trend

In [66]:
def scale_col_values(
    df: pd.DataFrame,
    col_name: str,
    min_value: float = 0,
    max_value: float = 1,
):
    """Min-max scale one column of ``df`` in place and return ``df``.

    A fresh ``MinMaxScaler`` with ``feature_range=(min_value, max_value)`` is
    fitted on the column and the transformed values overwrite the original
    column. The same DataFrame object that was passed in is returned.
    """
    # the scaler works on 2-D input, so present the column as (n_rows, 1)
    column_2d = df[col_name].values.reshape(-1, 1)
    rescaled = MinMaxScaler(feature_range=(min_value, max_value)).fit_transform(column_2d)

    # back to 1-D: keep the single feature column
    df[col_name] = rescaled[:, 0]
    return df

In [67]:
def squash_col_outliers(
    df: pd.DataFrame,
    col_name: str,
    min_quantile: float = 0.01,
    max_quantile: float = 0.99,
):
    """Cap the values of one column at its own quantiles, in place.

    Values at or above the ``max_quantile`` quantile are set to that quantile,
    then values at or below the ``min_quantile`` quantile are set to that
    quantile. ``df`` is modified and the same object is returned.
    """
    # both bounds are taken from the column before anything is overwritten
    lower_bound = df[col_name].quantile(min_quantile)
    upper_bound = df[col_name].quantile(max_quantile)

    # cap the upper tail first ...
    too_high = df[col_name].ge(upper_bound)
    df.loc[too_high, col_name] = upper_bound

    # ... then the lower tail, evaluated on the already-capped column
    too_low = df[col_name].le(lower_bound)
    df.loc[too_low, col_name] = lower_bound
    return df

In [68]:
# extract y column (the col to be predicted)
# df: the DataFrame
# col_name: the name of the column to be predicted 
# ntimesteps: number of timesteps (must be >= 1)
#
def extract_y(df: pd.DataFrame, col_name: str, ntimesteps: int): 
    """Return the prediction targets that line up with the rows of extract_X.

    For a frame of N rows the result has N - ntimesteps - 1 entries; entry k
    is the value of ``col_name`` at row ``ntimesteps + k``, i.e. the row that
    immediately follows the k-th window of ``ntimesteps`` rows. The final row
    of ``df`` is never used as a target.

    Raises:
        ValueError: if ``ntimesteps`` is smaller than 1.
    """
    if ntimesteps < 1:
        raise ValueError("ntimesteps must be at least 1")

    # shift(-1) puts the *next* row's value on each row (and keeps the
    # float/NaN upcast the whole-frame shift used to produce)
    next_values = df[col_name].shift(-1)

    # positional slice instead of head()/tail(): tail(-(ntimesteps-1)) turned
    # into tail(0) -- an empty result -- when ntimesteps == 1
    return next_values.iloc[ntimesteps - 1:-2].values


In [69]:
# extract X with the given number of timesteps
# df: the DataFrame
# ntimesteps: number of timesteps (must be >= 1)
#
def extract_X(df: pd.DataFrame, ntimesteps: int): 
    """Build sliding windows over the rows of ``df``.

    Returns a numpy array of shape (N - ntimesteps - 1, ntimesteps, features)
    for a frame of N rows, where ``features`` is the number of columns.
    Window k holds rows k .. k + ntimesteps - 1 of ``df``, oldest first. The
    window that would end on the second-to-last row is left out, so the last
    row of ``df`` never follows a window. The array is always 3-dimensional,
    including when there are too few rows to form any window.

    Raises:
        ValueError: if ``ntimesteps`` is smaller than 1.
    """
    if ntimesteps < 1:
        raise ValueError("ntimesteps must be at least 1")

    features = len(df.columns)

    # offset for timesteps: largest lag first, so each combined row reads
    # [row t-ntimesteps | ... | row t-1] from left to right
    lagged = [df.shift(lag) for lag in range(ntimesteps, 0, -1)]

    # combine timestep columns into rows
    combined = pd.concat(lagged, axis=1)

    # slice by position: skip the first ntimesteps rows (their windows are
    # padded with NaN by shift) and the final row. Positional slicing also
    # stays correct when the index labels are not unique.
    combined = combined.iloc[ntimesteps:-1]

    # each flat row is ntimesteps consecutive chunks of `features` values,
    # so one C-order reshape yields (rows, timesteps, features)
    flat = combined.to_numpy(copy=True)
    return flat.reshape(len(combined), ntimesteps, features)

In [70]:
class DataSet:
    """A pair of aligned arrays: 3-D inputs ``X`` and 1-D targets ``y``."""

    def __init__(self, X, y): 
        """Store ``X`` and ``y`` after checking their shapes agree.

        Raises:
            ValueError: if ``X`` is not 3-dimensional, ``y`` is not
                1-dimensional, or their lengths differ.
        """
        # ValueError is an Exception subclass, so `except Exception` callers
        # keep working
        if X.ndim != 3: 
            raise ValueError("Expected a 3-dimensional array for X")
        if y.ndim != 1: 
            raise ValueError("Expected a 1-dimensional array for y")
        if len(X) != len(y): 
            raise ValueError("Length of X and y must be the same")
        
        self.X = X
        self.y = y
        
    def split(self, pct): 
        """Move the trailing ``pct`` fraction of rows into a new DataSet.

        ``int(size * pct)`` rows are taken from the end of this dataset and
        returned as a new DataSet; this dataset keeps the rows before them.
        The two datasets never share a row.

        Raises:
            ValueError: if ``pct`` is not between 0 and 1 inclusive.
        """
        if not 0 <= pct <= 1:
            raise ValueError("pct must be between 0 and 1")

        count = int(self.size * pct)
        # one cut point for both halves. Taking [:count] for the new set and
        # [:-count] for this one made them overlap, and [:-0] emptied this
        # set whenever count was 0.
        cut = self.size - count
        new_dataset = DataSet(self.X[cut:], self.y[cut:])
        self.X = self.X[:cut]
        self.y = self.y[:cut]
        return new_dataset
        
    @property
    def size(self): 
        """Number of rows (length of ``X``)."""
        return len(self.X)

In [71]:
def shape_dataset(df, ntimesteps, y_col_name): 
    """Turn ``df`` into a DataSet of input windows and next-row targets.

    df: the DataFrame holding the feature columns (including the target)
    ntimesteps: number of rows in each input window
    y_col_name: the name of the column to be predicted

    X comes from extract_X and y from extract_y, both built with the same
    ``ntimesteps``.
    """
    X = extract_X(df, ntimesteps)
    # honour the caller's target column; it used to be hard-coded to 'Change'
    y = extract_y(df, y_col_name, ntimesteps)
    return DataSet(X, y)

In [None]:
# begin 

In [72]:
# read in the data; the first CSV column becomes the row index
df = pd.read_csv(filepath_or_buffer="data/prices-d.csv", index_col=0)

In [73]:
# extract range: the High-Low spread expressed as a fraction of Open
df["Range"] = df["High"].sub(df["Low"]).div(df["Open"])

In [74]:
# detrend: store the row-over-row fractional change of Adj Close,
# then skip the first row, which has no previous row to compare against
df = df.assign(Change=df["Adj Close"].pct_change()).iloc[1:]

In [75]:
# remove extra columns
# Keep the three working columns by *name* and renumber the rows from 0.
# (Relabelling df.values positionally would silently mislabel columns if the
# CSV column order ever differed, and dropping in place mutates a sliced frame.)
df = df[['Adj Close', 'Range', 'Change']].reset_index(drop=True)
df.head()

Unnamed: 0,Adj Close,Range,Change
0,1.588667,0.276076,-0.002512
1,1.464,0.226,-0.078473
2,1.28,0.19087,-0.125683
3,1.074,0.2085,-0.160937
4,1.053333,0.10061,-0.019243


In [76]:
# remove outliers: cap "Change" at the function's default quantiles and
# cap only the upper tail of "Range" (lower quantile 0.0, upper 0.97)
df = (
    df
    .pipe(squash_col_outliers, "Change")
    .pipe(squash_col_outliers, "Range", min_quantile=0.0, max_quantile=0.97)
)

In [77]:
# scale the data: both columns go to scale_col_values' default range
df = df.pipe(scale_col_values, "Change").pipe(scale_col_values, "Range")

In [78]:
# add the trend data
# (the 100 is handed straight to extract_trend; see lib.Trend for its meaning)
trend = extract_trend(df['Adj Close'], 100)
# as_boolean is given the Adj Close value stored under row label 0
df['Trend'] = trend.as_boolean(df.loc[0, 'Adj Close'])

# remove Adj Close from the frame
del df["Adj Close"]
df.head()

Unnamed: 0,Range,Change,Trend
0,1.0,0.437608,0.0
1,1.0,0.057881,0.5
2,1.0,0.0,0.5
3,1.0,0.0,0.5
4,0.989125,0.353968,1.0


In [89]:
# shape for input: windows of 10 timesteps, with 'Change' named as the target
train = shape_dataset(df, ntimesteps=10, y_col_name='Change')

In [90]:
# split into training, eval, and testing sets:
# val is split off from train, then test is split off from val
val = train.split(0.3)
test = val.split(0.3)

# one size per line: train, val, test
print(train.size, val.size, test.size, sep="\n")

2240
672
288


3211
