In [None]:
#default_exp finance.data

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# export
from lib.data.lists import *
from pathlib import Path

import pandas as pd
import numpy as np

#import matplotlib.pyplot as plt

# Import the main functionality from the SimFin Python API.
import simfin as sf

# Import names used for easy access to SimFin's data-columns.
from simfin.names import *

In [None]:
# Version of the SimFin Python API.
sf.__version__

In [None]:
# export
def initiateSimFin(key='free'):
    """Configure the SimFin API and return the local data directory.

    Points SimFin at ``~/simfin_data/`` and loads the API key from
    ``~/simfin_api_key.txt``, passing ``key`` as the ``default_key``.

    Args:
        key: value handed to ``sf.load_api_key`` as ``default_key``.

    Returns:
        pathlib.Path: ``Path.home() / "simfin_data"``.
    """
    data_dir = '~/simfin_data/'
    key_file = '~/simfin_api_key.txt'

    sf.set_data_dir(data_dir)
    sf.load_api_key(path=key_file, default_key=key)

    return Path.home() / "simfin_data"


In [None]:
# export

#   extrem_increase,extrem__decrease = [], 2.5, -.73
#   extrem_increase,extrem__decrease = [], 1.5, -.5
EXTREM_HL = "extrem_hl"
def flagstocks_extrem_hl(df_prices, extrem_increase, extrem__decrease):
    """Flag price rows with an extreme high or an extreme low.

    A row is flagged when its HIGH value is greater than ``extrem_increase``
    or its LOW value is less than ``extrem__decrease``. The boolean result is
    stored in the EXTREM_HL column, placed directly after the SIMFIN_ID
    column; a previously existing EXTREM_HL column is replaced.

    The frame is modified in place and also returned.
    """
    # Start from a clean slate so the function can be re-run on the same frame.
    if EXTREM_HL in df_prices.columns:
        df_prices.drop(columns=EXTREM_HL, inplace=True)

    too_high = df_prices[HIGH].gt(extrem_increase)
    too_low = df_prices[LOW].lt(extrem__decrease)

    position = df_prices.columns.get_loc(SIMFIN_ID) + 1
    df_prices.insert(position, EXTREM_HL, too_low | too_high)
    return df_prices

TOO_FEW_DAYS = "too_few_days"
def flagstocks_too_few_trading_days(df_prices, minimum_tradingdays):
    """Flag every row of a stock that has fewer than ``minimum_tradingdays`` rows.

    The number of rows per ticker is counted and compared against
    ``minimum_tradingdays``; the per-ticker result is broadcast back to every
    row of that ticker and stored in the TOO_FEW_DAYS column, placed directly
    after the SIMFIN_ID column. A previously existing TOO_FEW_DAYS column is
    replaced.

    The frame is modified in place and also returned.
    """
    # One boolean per ticker; size() counts rows without a per-group Python call
    # and also yields a well-formed (empty) Series for an empty frame.
    too_few = (
        df_prices.groupby(TICKER)
        .size()
        .lt(minimum_tradingdays)
        .rename(TOO_FEW_DAYS)
    )
    # Broadcast the per-ticker flag to the rows (left join keeps the row order).
    per_row = df_prices[[SIMFIN_ID]].join(too_few, on=TICKER)[TOO_FEW_DAYS].to_numpy()

    if TOO_FEW_DAYS in df_prices.columns:
        df_prices.drop(columns=TOO_FEW_DAYS, inplace=True)
    df_prices.insert(df_prices.columns.get_loc(SIMFIN_ID) + 1, TOO_FEW_DAYS, per_row)
    return df_prices

HAS_NAN_DAYS = "has_nan_days"
def flagstocks_with_nan_days(df_prices):
    """Flag every row of a stock that has a missing OPEN/LOW/HIGH/CLOSE value.

    A ticker is flagged when any of its rows has a null in one of the four
    price columns; the per-ticker result is broadcast to all rows of that
    ticker and stored in the HAS_NAN_DAYS column, placed directly after the
    SIMFIN_ID column. A previously existing HAS_NAN_DAYS column is replaced.

    The frame is modified in place and also returned.
    """
    def _any_missing(stock_rows):
        # True when at least one price cell of this stock is null.
        return stock_rows[[OPEN, LOW, HIGH, CLOSE]].isnull().values.any()

    nan_by_ticker = df_prices.groupby(["Ticker"]).apply(_any_missing)
    nan_by_ticker.name = HAS_NAN_DAYS

    # Broadcast the per-ticker flag back onto the individual rows.
    per_row = df_prices[[SIMFIN_ID]].join(nan_by_ticker, on=TICKER)

    if HAS_NAN_DAYS in df_prices.columns:
        df_prices.drop(labels=HAS_NAN_DAYS, axis=1, inplace=True)

    target_pos = df_prices.columns.get_loc(SIMFIN_ID) + 1
    df_prices.insert(target_pos, HAS_NAN_DAYS, per_row[HAS_NAN_DAYS].values)
    return df_prices

def findInValidStocks(df_prices):
    """Return the tickers that have at least one flag set on any of their rows.

    The HAS_NAN_DAYS, TOO_FEW_DAYS and EXTREM_HL columns must already be
    present in ``df_prices``.

    Returns:
        numpy.ndarray: 1-D array of ticker names, in sorted group order.
    """
    flags = [HAS_NAN_DAYS, TOO_FEW_DAYS, EXTREM_HL]
    # Group by a scalar key: iterating a one-element *list* key makes
    # pandas >= 2.0 yield 1-tuples as group names, which turned the result
    # into a 2-D array. The reduction is also done by pandas, not in Python.
    flagged = df_prices.groupby("Ticker")[flags].any().any(axis=1)
    mask = flagged.to_numpy(dtype=bool)
    return np.array(flagged.index[mask].tolist())

def findValidStocks(df_prices):
    """Return the tickers that have no flag set on any of their rows.

    The HAS_NAN_DAYS, TOO_FEW_DAYS and EXTREM_HL columns must already be
    present in ``df_prices``.

    Returns:
        numpy.ndarray: 1-D array of ticker names, in sorted group order.
    """
    flags = [HAS_NAN_DAYS, TOO_FEW_DAYS, EXTREM_HL]
    # Group by a scalar key: iterating a one-element *list* key makes
    # pandas >= 2.0 yield 1-tuples as group names, which turned the result
    # into a 2-D array. The reduction is also done by pandas, not in Python.
    flagged = df_prices.groupby("Ticker")[flags].any().any(axis=1)
    mask = ~flagged.to_numpy(dtype=bool)
    return np.array(flagged.index[mask].tolist())


In [None]:
# export

from lib.data.lists import *
import torch
from torch.tensor import *

torch.set_default_dtype(torch.float64)
class Data():
    """Price store shared by the datasets built on top of one price frame.

    Keeps a reference to ``df_prices`` (which is modified in place: the
    columns ``idx`` and ``ix_predict`` are added) and a numpy copy of the
    selected training columns for fast window slicing.
    """

    def __init__(self, df_prices):
        self.df_prices = df_prices
        self.np_prices = None          # set by changeTraining
        self.seq_len   = None          # set by changeTraining
        self.columns   = None          # training and test columns
        self.n_predict_columns = None  # set by changeTraining
        # positional row counter
        self.df_prices["idx"] = np.arange(len(self.df_prices), dtype=int)

    def changeTraining(self, seq_len, columns, n_predict_columns):
        """Set the sequence length and the columns used for training and test.

        Adds/overwrites the ``ix_predict`` column: it holds the positional row
        number for every row that can be predicted, and -1 for the first
        ``seq_len`` rows of each ticker (they have no complete input window).
        ``np_prices`` is rebuilt from ``columns``.
        """
        self.seq_len = seq_len
        self.columns = columns
        self.n_predict_columns = n_predict_columns

        ix_predict = np.arange(len(self.df_prices), dtype=int)
        # cumcount() numbers the rows inside each ticker 0, 1, 2, ...; the first
        # seq_len of them cannot be predicted from a full window.
        warmup = self.df_prices.groupby("Ticker").cumcount().to_numpy() < seq_len
        ix_predict[warmup] = -1
        self.df_prices["ix_predict"] = ix_predict
        self.np_prices = self.df_prices[self.columns].to_numpy()

    def getTensor(self, ix, r_mixin=0.0, r_ix=None):
        """Return ``(input, target)`` tensors for the sample at row ``ix``.

        The input is ``np_prices[ix - seq_len : ix]`` and the target is the
        first ``n_predict_columns`` values of row ``ix``. When ``r_ix`` is
        given, both are blended with the sample at ``r_ix``:
        ``(1 - r_mixin) * sample(ix) + r_mixin * sample(r_ix)``.
        """
        prices = self.np_prices  # local alias, used several times
        pi     = prices[ix - self.seq_len : ix]          # price input
        pt     = prices[ix, :self.n_predict_columns]     # price target
        # Compare against None: a plain truth test would skip mixing for index 0.
        if r_ix is not None:
            ro  = 1 - r_mixin
            pi  = pi*ro + r_mixin*prices[r_ix - self.seq_len : r_ix]
            pt  = pt*ro + r_mixin*prices[r_ix, :self.n_predict_columns]
        return torch.tensor( pi ), torch.tensor( pt )

    def getSequenceIndex(self, stocks):
        """Return the predictable row numbers (``ix_predict >= 0``) of ``stocks``.

        ``stocks`` is used as a ``.loc`` selector on the ``ix_predict`` column.
        """
        idx_seq = self.df_prices.ix_predict.loc[stocks].to_numpy()
        return idx_seq[idx_seq >= 0]
    

In [None]:
import random
#Create a dataset that uses all the stocks and daily prices.
#In order to save memory, df_prices is the same for both training and validation.
#np_prices contains the interleaved prices of the columns used for training/validation,
#e.g. if the required columns are "open" and "close", then np_prices contains the row/column values from
#df_prices in the following order: open_row1, close_row1, open_row2, close_row2, ...

#For now self.np_prices is duplicated, thus using more memory than needed.
#Test and validation contain different stocks taken from the same df_prices.

#stocks    : the stock tickers
#stock_days: the number of days per stock. The number of days is different for each stock.
#            The total number of samples is stock_days.sum()
#batch_size: the number of stocks in a batch
#seq_length: the number of stock days in a batch.
#The number of samples in the batch is seq_length*len(column_names)

#For now the dataset progresses sequentially through the data during an epoch from a random offset.
#Notice that there is no end-of-sequence token to reset the model when the sequence wraps around.
TRAIN_TEST   = "train_test"


class OHLCDataset(torch.utils.data.Dataset):
    """Torch dataset over the predictable rows of a set of stocks.

    ``data`` is a shared price store that provides ``getSequenceIndex``,
    ``getTensor``, ``df_prices``, ``np_prices`` and ``seq_len``. Each item is
    the ``(input, target)`` pair returned by ``data.getTensor`` for one
    predictable row; with augmentation enabled the pair is blended with a
    randomly chosen second sample.
    """

    def __init__(self, data, stocks, train_test="invalid"):
        self.data         = data  # shared, not copied
        self.stocks       = stocks
        self.train_test   = train_test

        # Row numbers of all predictable samples of the selected stocks.
        self.idx_seq      = data.getSequenceIndex(stocks)

        # Augmentation is off until changeAugmentation() is called. These
        # attributes are initialised here so __getitem__ works on datasets
        # that never enable mixing.
        self.mixin        = 0.0   # upper bound of the mixing ratio
        self.do_mix       = False
        self.ix_mix_queue = -1

    def changeAugmentation( self, mixing=0.5 ):
        """Enable mixing when ``mixing > 0``; ``mixing`` caps the random ratio."""
        self.mixin = mixing
        self.do_mix = self.mixin>0.0
        self.ix_mix_queue = -1

    def __len__(self):
        return len(self.idx_seq)

    def __getitem__(self, index):
        """Return the ``(price input, price target)`` tensors of sample ``index``."""
        ix = self.idx_seq[index]

        if self.do_mix:
            # blend with another, randomly chosen sequence of this dataset
            r_ix    = self.idx_seq[random.randint(0, len(self)-1)]
            r_mixin = random.uniform(0, self.mixin )
        else:
            r_ix, r_mixin = None, 0.0

        return self.data.getTensor(ix=ix, r_mixin=r_mixin, r_ix=r_ix)

    def getBatch(self, stocks):
        """Return all input windows and target rows for ``stocks``.

        Returns:
            tuple: ``(price_inputs, price_targets)`` where ``price_inputs`` is
            an array of shape ``(n_samples, seq_len, n_columns)`` taken from
            ``data.np_prices`` and ``price_targets`` is a copy of the matching
            rows of ``data.df_prices``.
        """
        data = self.data
        idx = np.asarray(data.getSequenceIndex(stocks), dtype=int)
        # Row offsets of one window relative to its target row: -seq_len .. -1.
        window = np.arange(-data.seq_len, 0)
        price_inputs = data.np_prices[idx[:, None] + window]
        price_targets = data.df_prices.iloc[idx].copy()
        return price_inputs, price_targets

    def dataloader(self, batch_size:int, shuffle:bool, num_workers:int=0, drop_last=False):
        """Wrap this dataset in a ``torch.utils.data.DataLoader``."""
        return torch.utils.data.DataLoader(self, batch_size=batch_size, shuffle=shuffle,
                                           num_workers=num_workers, drop_last=drop_last)

    def split2train_test(self, test_percentage):
        """Randomly split the stocks into a train and a test dataset.

        ``test_percentage`` is the fraction (0..1) of stocks that goes to the
        test set. The TRAIN_TEST column of the shared price frame is reset to
        "invalid" and then labelled "train"/"test" per stock.

        Returns:
            tuple: ``(train_dataset, test_dataset)`` sharing ``self.data``.
        """
        # The split is per stock; a split in historical vs. new prices might be better.
        ix_all   = np.arange(len(self.stocks),dtype=int)
        np.random.shuffle(ix_all)

        nb_test      = int(round(test_percentage*len(self.stocks)))
        train_stocks = self.stocks[ix_all[nb_test:]]
        test_stocks  = self.stocks[ix_all[:nb_test]]

        df_prices = self.data.df_prices
        df_prices[TRAIN_TEST] = "invalid"
        df_prices.loc[train_stocks,TRAIN_TEST] = "train"
        df_prices.loc[test_stocks,TRAIN_TEST ] = "test"
        return self.__class__(self.data, train_stocks, "train"),\
               self.__class__(self.data, test_stocks,  "test")


In [None]:
# export

def loadShareprices():
    """Load the daily US share prices from SimFin (for testing and development).

    Initialises SimFin with the free key, prints the data directory and
    whether it exists, and returns the result of ``sf.load_shareprices``.
    """
    data_dir = initiateSimFin(key='free')
    print(f"dataPath:{data_dir} exists:{data_dir.exists()}")

    # Daily share prices for the US market.
    return sf.load_shareprices(variant='daily', market='us')


PREV_CLOSE = "previous_close"
def alignPreviousClose(df_prices):
    """Add the previous day's close to every price row and drop each stock's first row.

    A PREV_CLOSE column is inserted just before the OPEN column, holding the
    CLOSE value shifted down by one row. All stocks live in the same table, so
    after the shift the first row of every stock carries the close of the
    preceding stock (NaN for the very first row); those rows are removed.

    The function is idempotent: when PREV_CLOSE is already present the frame
    is returned untouched, so a repeated call no longer drops one more row
    per stock. The frame is modified in place and also returned.
    """
    if PREV_CLOSE in df_prices.columns:
        return df_prices

    df_prices.insert(df_prices.columns.get_loc(OPEN), PREV_CLOSE, df_prices[CLOSE].shift(), allow_duplicates=False)

    # A row starts a new stock when its level-0 index value differs from the
    # previous row's; the very first row always does.
    stock_name         = np.asarray(df_prices.index.get_level_values(0))
    first_of_stock     = np.ones(len(df_prices), dtype=bool)
    first_of_stock[1:] = stock_name[:-1] != stock_name[1:]
    df_prices.drop(df_prices.index[first_of_stock], axis=0, inplace=True)
    return df_prices

#extrem_increase,extrem__decrease = 2.5, -.73
#extrem_increase,extrem__decrease = 1.5, -.5
def procesSharePrices(df_prices, minimum_tradingdays=180, extrem_increase = 0.5, extrem__decrease = -0.5):
    """Prepare raw share prices and classify the stocks as valid or invalid.

    Steps: align the previous close, convert prices to log changes relative
    to the previous close, then flag extreme highs/lows, stocks with fewer
    than ``minimum_tradingdays`` rows and stocks with missing prices. A short
    summary is printed.

    Returns:
        tuple: ``(df_prices, stocks, validStocks, inValidStocks)`` where the
        last three are arrays of ticker names.
    """
    # load and identify valid data
    df_prices = alignPreviousClose(df_prices)
    df_prices = logMinusPreviousClose(df_prices)

    flagstocks_extrem_hl(df_prices, extrem_increase, extrem__decrease)
    flagstocks_too_few_trading_days(df_prices,minimum_tradingdays)
    flagstocks_with_nan_days(df_prices)

    validStocks, inValidStocks = findValidStocks(df_prices), findInValidStocks(df_prices)

    # Scalar group key keeps the group labels plain ticker strings
    # (one-element list keys are moving to 1-tuple labels in newer pandas).
    sizes  = df_prices.groupby("Ticker").size()
    stocks = np.array(list(sizes.index))
    print(f"number of stocks:         {len(stocks)}")
    print(f"number of valid stocks:   {len(validStocks)}")
    print(f"number of invalid stocks: {len(inValidStocks)}")

    sizes_sorted = sizes.sort_values()  # sort once, show both ends
    print(f"smallest pricelines pr stock: {sizes_sorted.head(5)}")
    print(f"longest pricelines pr stock:  {sizes_sorted.tail(5)}")

    return df_prices, stocks, validStocks, inValidStocks

# Names of the columns that hold model predictions: prefix + price column name.
predict_prefix = "predict_"
PREDICT_OPEN, PREDICT_HIGH, PREDICT_LOW, PREDICT_CLOSE = (
    predict_prefix + price_column for price_column in (OPEN, HIGH, LOW, CLOSE)
)

def predict_stocks(dataset, modelmanager, stocks, tfm_input ):
    """Predict prices for ``stocks`` and append the predictions to the target rows.

    ``dataset.getBatch(stocks=...)`` must return ``(price_sequences,
    price_targets)`` with ``price_targets`` being a DataFrame;
    ``modelmanager.predict`` must return a 2-D tensor with one column per
    predicted price column.

    Returns:
        tuple: ``(price_targets, prediction_columns)``; one column named
        ``predict_prefix + <column>`` is appended per predicted column.
    """
    price_sequences, price_targets = dataset.getBatch(stocks=stocks)
    predictions    = modelmanager.predict(price_sequences, tfm_input)

    # Column names come from the dataset itself when it exposes them, otherwise
    # from the shared data object (dataset.data.columns).
    column_names = getattr(dataset, "column_names", None)
    if column_names is None:
        column_names = dataset.data.columns

    # insert the predictions in price_targets
    prediction_columns = [ predict_prefix + name for name in column_names ]
    for idx,name in enumerate(prediction_columns):
        # Replace a stale prediction column instead of only dropping it.
        if name in price_targets.columns :
            price_targets.drop(columns=[name], inplace=True)
        # Column idx of the predictions belongs to the idx-th name.
        price_targets.insert(len(price_targets.columns), name, predictions[:,idx].numpy())
    return price_targets, prediction_columns

In [None]:
# export

#Convert OHLC to percentages of the previous day's closing price:
#take the log of all prices and subtract the log of the previous close from each price
#to arrive at the log percentage change relative to the previous close
def logMinusPreviousClose(df_prices):
    """Express OPEN/CLOSE/LOW/HIGH as log changes relative to the previous close.

    PREV_CLOSE and the four price columns are replaced by their natural log;
    the log of PREV_CLOSE is then subtracted from the four price columns.
    The frame is modified in place and also returned.
    """
    price_cols = [OPEN, CLOSE, LOW, HIGH]
    log_cols = [PREV_CLOSE] + price_cols

    df_prices[log_cols] = np.log(df_prices[log_cols])
    df_prices[price_cols] = df_prices[price_cols].sub(df_prices[PREV_CLOSE], axis=0)
    return df_prices

normalized_suffix="_normalized"
OPEN_NORM  = OPEN +normalized_suffix
HIGH_NORM  = HIGH +normalized_suffix
LOW_NORM   = LOW  +normalized_suffix
CLOSE_NORM = CLOSE+normalized_suffix

def normalizeData( df_prices, stats, normalized_suffix=normalized_suffix):
    """Add std-scaled copies of the OPEN/HIGH/LOW/CLOSE columns.

    For each price column ``c`` a column ``c + normalized_suffix`` is written
    holding ``df_prices[c] / stats.loc["std", c]``. The mean is deliberately
    not subtracted; only the "std" row of ``stats`` is read.

    Args:
        df_prices: price frame; modified in place.
        stats: frame indexed by statistic name (must contain "std") with one
            column per price column.
        normalized_suffix: suffix appended to the source column names.

    Returns:
        tuple: ``(df_prices, normalized_columns)`` with the new column names
        in OPEN, HIGH, LOW, CLOSE order.
    """
    normalized_columns = []
    for c in [OPEN,HIGH,LOW,CLOSE]:
        target = c + normalized_suffix
        df_prices[target] = df_prices[c].div(stats.loc["std", c])
        normalized_columns.append(target)
    return df_prices,normalized_columns

def truncateExtremes(df_prices,training_columns,stats, percent_min, percent_max):
    """Clamp the values of ``training_columns`` to bounds taken from ``stats``.

    For every column, values below ``stats.loc[percent_min, column]`` are set
    to that bound and values above ``stats.loc[percent_max, column]`` are set
    to that bound. The frame is modified in place and also returned.
    """
    for column in training_columns:
        floor = stats.loc[percent_min, column]
        ceiling = stats.loc[percent_max, column]

        # lower bound first, then upper bound
        df_prices.loc[df_prices[column] < floor, column] = floor
        df_prices.loc[df_prices[column] > ceiling, column] = ceiling
    return df_prices

# Test

# Statistics on the mature stocks
min, max, mean, std, percentiles
calculate the normalization numbers

# Create dataset
The dataset must return a batch with number of stocks = batch_size.
Each sequence of stock prices (OHLC) returned from the dataset must be of the same sequence_length. During the training the network will process the sequence day by day.

At the beginning of each epoch it must be possible to shuffle the stocks. It must also be possible to shuffle the start date of the stock prices for each stock.

The dataframe will remain fixed during the training, using indirect indexing. This will be faster and use less memory for large dataframes.

In [None]:
%time df_prices = loadShareprices()

In [None]:
%time df_prices, stocks, validStocks, inValidStocks = procesSharePrices(df_prices)

In [None]:
#statistics on the mature stocks
stats = df_prices.loc[validStocks,[PREV_CLOSE,OPEN,HIGH,LOW,CLOSE]].describe(percentiles=\
                                                                             [0.0002, 0.01, 0.25, 0.75, 0.99, 0.9998])        
stats

In [None]:
%%time
# Sign balance of the CLOSE column over the rows of the valid stocks:
# share of rows (in percent) that are >= 0, == 0 and < 0.
df_valid =df_prices.loc[validStocks]
print( f"percentage =>0: {df_valid[CLOSE].ge(0).sum()/len(df_valid)*100}")
print( f"percentage ==0: {df_valid[CLOSE].eq(0).sum()/len(df_valid)*100}")
print( f"percentage  <0: {df_valid[CLOSE].lt(0).sum()/len(df_valid)*100}")

# Same three shares after subtracting the mean of CLOSE taken from `stats`.
print( f"mean=0 percentage =>0: {(df_valid[CLOSE]-stats.loc['mean',CLOSE]).ge(0).sum()/len(df_valid)*100}")
print( f"mean=0 percentage ==0: {(df_valid[CLOSE]-stats.loc['mean',CLOSE]).eq(0).sum()/len(df_valid)*100}")
print( f"mean=0 percentage  <0: {(df_valid[CLOSE]-stats.loc['mean',CLOSE]).lt(0).sum()/len(df_valid)*100}")
# release the temporary selection
del df_valid

In [None]:
%%time 

from lib.learner.learner import*
from lib.learner.optimizers import*
from lib.model.model import*
from lib.model.modelmanager import*
import torch.nn as nn

# NOTE(review): this value is not read in this cell and is reassigned in the
# next cell before use - it looks like a leftover; confirm before removing.
training_columns = [CLOSE,OPEN]
# The mean is so close to zero that we only divide by std.
# Statistics are computed on the valid stocks only, but normalizeData is
# applied to the whole of df_prices.
stats = df_prices.loc[validStocks].describe(percentiles=[0.0002, 0.01, 0.25, 0.75, 0.99, 0.9998])
df_prices, normalized_columns = normalizeData(df_prices,stats)

# Truncate extremes: recompute the statistics on the normalized columns and
# clamp every normalized column to its 1% / 99% percentile.
stats = df_prices.loc[validStocks,normalized_columns].describe(percentiles=[0.0002, 0.01, 0.25, 0.75, 0.99, 0.9998])
df_prices = truncateExtremes(df_prices,normalized_columns, stats, "1%", "99%")

In [None]:
# Training configuration: each sample uses the seq_length preceding rows as input.
seq_length = 360
# Columns fed to the model; the first n_predict_columns of them are the targets
# (see Data.getTensor), i.e. only CLOSE_NORM is predicted here.
training_columns = [CLOSE_NORM,OPEN_NORM]
n_predict_columns=1
print(f"seq_length: {seq_length} training columns = {training_columns} n_predict_columns:{n_predict_columns}")

# Data adds the "idx" column to df_prices; changeTraining adds "ix_predict"
# and builds the numpy copy of the training columns.
data = Data(df_prices)
data.changeTraining(seq_length,training_columns,n_predict_columns=n_predict_columns)


In [None]:
%%time

# Build the dataset on the valid stocks and split 75% / 25% by stock.
ohlc_ds = OHLCDataset(data, stocks=validStocks)
train_ds, test_ds = ohlc_ds.split2train_test(0.25)
# Mixing augmentation is enabled for the training set only.
train_ds.changeAugmentation( mixing=0.5 )

print(f"number of stocks, train stocks, test stocks: {len(ohlc_ds.stocks)}, {len(train_ds.stocks)}, {len(test_ds.stocks)}")

# NOTE(review): c_out is set to the number of input columns, while the targets
# produced by Data.getTensor have n_predict_columns values - confirm which one
# DataBunch expects.
databunch = DataBunch(train_ds.dataloader(batch_size=2048, shuffle=True,  drop_last=True), \
                      test_ds.dataloader( batch_size=4096, shuffle=False, drop_last=False), \
                      c_in=len(ohlc_ds.data.columns), c_out=len(ohlc_ds.data.columns))

#print("the following lengths must be the same")
#%time stock_days = [len(ohlc_ds.stock_grps.get_group(stock)) for stock in ohlc_ds.stocks]
#print(len(ohlc_ds), sum(stock_days)-len(stock_days)*(seq_length+1))

In [None]:
%time l = [len(b[0]) for b in databunch.train_dl]
#CPU times: user 1min 52s, sys: 2.09 s, total: 1min 54s
#Wall time: 1min 54s

In [None]:
df_prices[["idx","ix_predict"]].tail()

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_test.ipynb.
Converted 01_data.external.ipynb.
Converted 02_lists.ipynb.
Converted 03_images.ipynb.
Converted 05_Learner.ipynb.
Converted 05_model.ipynb.
Converted 06_modelmanger.ipynb.
Converted 07_optimizers.ipynb.
Converted app_image_01_mnist_optimizers.ipynb.
Converted app_image_02_imagenette_optimizers.ipynb.
Converted fin_01_candlestick.ipynb.
Converted fin_02_simfin_data-Copy1.ipynb.
Converted fin_02_simfin_data.ipynb.
Converted fin_02_simfin_generated_data.ipynb.
Converted fin_02_simfin_training.ipynb.
Converted fin_03_graphs.ipynb.
Converted index.ipynb.
Converted parallel.ipynb.
Converted parallel_extern.ipynb.
