In [1]:
# !pip install -q transformers datasets evaluate accelerate gluonts ujson
# !pip install -q scikit-learn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch, torchvision
from gluonts.dataset.multivariate_grouper import MultivariateGrouper
from gluonts.time_feature import time_features_from_frequency_str
from torch.utils.data import Dataset, DataLoader

## Prepping the Data

start with specific stock. Need to adjust other stock values with splits later

In [3]:
df_WFC = pd.read_csv('../../prebuilt_rob_data/WFC_prerob.csv')
df_WFC = df_WFC.drop(columns= ['fin_sentiment', 'finvader_tot', 'rob_score', 'Ticker', 'Sector', 'Headline', 'Text', 'URL'])
df_WFC.head()

Unnamed: 0,Publishing Time,Source,rob_sentiment,stock_time,open,high,low,close,volume,numtrades,vwap
0,2019-03-15 10:46:42+00:00,The Motley Fool,positive,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300,2,50.44
1,,,,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315,2,50.76
2,,,,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300,1,50.3501
3,,,,2019-03-15 12:30:00+00:00,50.56,50.56,50.42,50.42,200,2,50.49
4,,,,2019-03-15 13:00:00+00:00,50.44,50.5,50.3,50.3,3204,17,50.362482


Ideally, we would embed the sentiment scores as n-dimensional vectors, where each dimension corresponds to a source site. This would allow us to appropriately evalute sentiment from sources as having different weights, although may make the training slower. Not sure if effectiveness vs performance cost is worth, but may as well try. 

In [4]:
source_list = df_WFC['Source'].unique()
print(source_list, len(source_list))

['The Motley Fool' nan '24/7 Wall Street' 'Zacks Investment Research'
 'CNBC' 'Market Watch' 'Forbes' 'Investors Business Daily' 'Benzinga'
 'Seeking Alpha' 'Investopedia' 'Reuters' 'New York Post' 'InvestorPlace'
 'GuruFocus' 'CNN Business' 'Fast Company' 'Huffington Post' 'Invezz'
 'PRNewsWire' 'Newsfile Corp' 'Business Wire' 'Schaeffers Research'
 'PYMNTS' 'Business Insider' 'Pulse2' 'Insider Monkey' 'GlobeNewsWire'
 'Barrons' 'Fox Business' 'CNET' 'Proactive Investors' 'Kiplinger'
 'The Dog of Wall Street' 'Finbold' 'The Guardian' 'WSJ' 'FreightWaves'
 'MarketBeat'] 39


In [5]:
one_hot_df = pd.get_dummies(df_WFC['Source'])
one_hot_df.columns
#order here determines order in the tensor

Index(['24/7 Wall Street', 'Barrons', 'Benzinga', 'Business Insider',
       'Business Wire', 'CNBC', 'CNET', 'CNN Business', 'Fast Company',
       'Finbold', 'Forbes', 'Fox Business', 'FreightWaves', 'GlobeNewsWire',
       'GuruFocus', 'Huffington Post', 'Insider Monkey', 'Investopedia',
       'InvestorPlace', 'Investors Business Daily', 'Invezz', 'Kiplinger',
       'Market Watch', 'MarketBeat', 'New York Post', 'Newsfile Corp',
       'PRNewsWire', 'PYMNTS', 'Proactive Investors', 'Pulse2', 'Reuters',
       'Schaeffers Research', 'Seeking Alpha', 'The Dog of Wall Street',
       'The Guardian', 'The Motley Fool', 'WSJ', 'Zacks Investment Research'],
      dtype='object')

In [6]:
ohd_tensor = one_hot_df.apply(lambda row: torch.tensor(row.values, dtype=torch.float), axis=1)
ohd_tensor.iloc[0]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0.])

In [7]:
df_WFC['encoded_sentiment'] = ohd_tensor
df_WFC.head()

Unnamed: 0,Publishing Time,Source,rob_sentiment,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment
0,2019-03-15 10:46:42+00:00,The Motley Fool,positive,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300,2,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
1,,,,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315,2,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2,,,,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300,1,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
3,,,,2019-03-15 12:30:00+00:00,50.56,50.56,50.42,50.42,200,2,50.49,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
4,,,,2019-03-15 13:00:00+00:00,50.44,50.5,50.3,50.3,3204,17,50.362482,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."


In [8]:
def sentiment_to_value(txt:str):
    if txt == 'positive':
        return 1
    elif txt == 'negative':
        return -1
    else:
        return 0

In [9]:
df_WFC['rob_sentiment'] = df_WFC['rob_sentiment'].apply(sentiment_to_value)
df_WFC.head()

Unnamed: 0,Publishing Time,Source,rob_sentiment,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment
0,2019-03-15 10:46:42+00:00,The Motley Fool,1,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300,2,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
1,,,0,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315,2,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
2,,,0,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300,1,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
3,,,0,2019-03-15 12:30:00+00:00,50.56,50.56,50.42,50.42,200,2,50.49,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
4,,,0,2019-03-15 13:00:00+00:00,50.44,50.5,50.3,50.3,3204,17,50.362482,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."


In [10]:
df_WFC['total_articles'] = df_WFC['Publishing Time'].notna().astype(int)
df_WFC.head()

Unnamed: 0,Publishing Time,Source,rob_sentiment,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles
0,2019-03-15 10:46:42+00:00,The Motley Fool,1,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300,2,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1
1,,,0,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315,2,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
2,,,0,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300,1,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
3,,,0,2019-03-15 12:30:00+00:00,50.56,50.56,50.42,50.42,200,2,50.49,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
4,,,0,2019-03-15 13:00:00+00:00,50.44,50.5,50.3,50.3,3204,17,50.362482,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0


In [11]:
df_WFC['total_articles'] = df_WFC.groupby('stock_time')['total_articles'].transform(sum)
df_WFC[df_WFC['total_articles'] > 1].tail(6)

  df_WFC['total_articles'] = df_WFC.groupby('stock_time')['total_articles'].transform(sum)


Unnamed: 0,Publishing Time,Source,rob_sentiment,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles
57702,2024-01-12 15:32:15+00:00,Zacks Investment Research,0,2024-01-12 15:45:00+00:00,48.11,48.24,47.79,47.905,914492,7283,48.00006,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",2
57703,2024-01-12 15:32:35+00:00,Zacks Investment Research,1,2024-01-12 15:45:00+00:00,48.11,48.24,47.79,47.905,914492,7283,48.00006,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",2
57715,2024-01-12 18:30:39+00:00,Investopedia,-1,2024-01-12 18:45:00+00:00,47.425,47.61,47.395,47.585,589289,4505,47.479842,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",2
57716,2024-01-12 18:39:09+00:00,Seeking Alpha,0,2024-01-12 18:45:00+00:00,47.425,47.61,47.395,47.585,589289,4505,47.479842,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",2
58326,2024-02-03 16:40:00+00:00,Seeking Alpha,-1,2024-02-05 12:15:00+00:00,48.94,48.94,48.94,48.94,874,4,48.940057,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",2
58327,2024-02-05 09:31:00+00:00,The Motley Fool,1,2024-02-05 12:15:00+00:00,48.94,48.94,48.94,48.94,874,4,48.940057,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",2


In [12]:
df_WFC['encoded_sentiment'] = df_WFC['rob_sentiment'] * df_WFC['encoded_sentiment']
df_WFC['encoded_sentiment'].iloc[58326]


tensor([-0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
        -0., -0., -0., -0., -1., -0., -0., -0., -0., -0.])

In [13]:
df_WFC['encoded_sentiment'] = df_WFC.groupby('stock_time')['encoded_sentiment'].transform(lambda x: [torch.stack(x.to_list()).sum(dim=0)]*len(x))
df_WFC['encoded_sentiment'].iloc[58326]


tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0., -1.,  0.,  0.,  1.,  0.,  0.])

In [14]:
df_WFC.head()

Unnamed: 0,Publishing Time,Source,rob_sentiment,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles
0,2019-03-15 10:46:42+00:00,The Motley Fool,1,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300,2,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1
1,,,0,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315,2,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
2,,,0,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300,1,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
3,,,0,2019-03-15 12:30:00+00:00,50.56,50.56,50.42,50.42,200,2,50.49,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
4,,,0,2019-03-15 13:00:00+00:00,50.44,50.5,50.3,50.3,3204,17,50.362482,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0


The encoded_sentiment column now has a tensor that describes the source and sentiment of the articles, while total articles contains the amount of articles in that time period. 

Together, this gives a sentiment value + how many neutral articles appear. However, this loses the source of the neutral articles. This can be remieded by given neutral articles a very small weight, but we do not do that for now. 

In [15]:
df_WFC['Publishing Time'] = pd.to_datetime(df_WFC['Publishing Time'])
df_WFC['stock_time'] = pd.to_datetime(df_WFC['stock_time'])

Because of the encoding, we no longer need the source and sentiment values. The Publishing Time could be useful, but we drop it for now so that we have even timesteps. 

In [16]:
df_WFC.drop(columns =['Publishing Time', 'Source', 'rob_sentiment'], inplace =True)
df_WFC.columns

Index(['stock_time', 'open', 'high', 'low', 'close', 'volume', 'numtrades',
       'vwap', 'encoded_sentiment', 'total_articles'],
      dtype='object')

In [17]:
df_WFC.drop_duplicates(inplace =True, ignore_index=True)
df_WFC.head()

Unnamed: 0,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles
0,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300,2,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1
1,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315,2,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
2,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300,1,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
3,2019-03-15 12:30:00+00:00,50.56,50.56,50.42,50.42,200,2,50.49,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0
4,2019-03-15 13:00:00+00:00,50.44,50.5,50.3,50.3,3204,17,50.362482,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0


In [18]:
len(df_WFC), len(df_WFC['stock_time'].unique())

(59329, 59329)

Need to convert time series data into numerical values for use in the informer model. 

Also, need to fill in all the missing time dates. Want to have a tracker to make sure that we know which data is original and which was filled in. (Actually, this is unneeded, will clean this out later)

In [19]:
df_WFC.set_index('stock_time', inplace = True)
df_WFC['original'] = 1

In [20]:
df_WFC_filled = df_WFC.resample('15min').asfreq()
df_WFC_filled.head()

Unnamed: 0_level_0,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles,original
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300.0,2.0,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1.0,1.0
2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2019-03-15 11:45:00+00:00,,,,,,,,,,
2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2019-03-15 12:15:00+00:00,,,,,,,,,,


In [21]:
df_WFC_filled['original'] = df_WFC_filled['original'].fillna(0)
df_WFC_filled.head()

Unnamed: 0_level_0,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles,original
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300.0,2.0,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1.0,1.0
2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2019-03-15 11:45:00+00:00,,,,,,,,,,0.0
2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2019-03-15 12:15:00+00:00,,,,,,,,,,0.0


We now have a row original that distinguishes between the filled and not filled columns. We then forward fill all the other rows

In [22]:
df_WFC_filled = df_WFC_filled.ffill()
df_WFC_filled.head()

Unnamed: 0_level_0,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles,original
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300.0,2.0,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1.0,1.0
2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2019-03-15 11:45:00+00:00,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.0
2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2019-03-15 12:15:00+00:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.0


now we extract time series data into values that can be inputted into informer. 

In [23]:
df_WFC_filled.reset_index(inplace = True)
df_WFC_filled.head()

Unnamed: 0,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles,original
0,2019-03-15 11:15:00+00:00,50.6,50.6,50.36,50.36,300.0,2.0,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1.0,1.0
1,2019-03-15 11:30:00+00:00,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
2,2019-03-15 11:45:00+00:00,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.0
3,2019-03-15 12:00:00+00:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0
4,2019-03-15 12:15:00+00:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.0


In [24]:
df_WFC_filled['stock_time'] = df_WFC_filled['stock_time'].apply(lambda x: pd.Period(x, freq = '15min'))
df_WFC_filled['stock_time']

0         2019-03-15 11:15
1         2019-03-15 11:30
2         2019-03-15 11:45
3         2019-03-15 12:00
4         2019-03-15 12:15
                ...       
175429    2024-03-15 20:30
175430    2024-03-15 20:45
175431    2024-03-15 21:00
175432    2024-03-15 21:15
175433    2024-03-15 21:30
Name: stock_time, Length: 175434, dtype: period[15min]

In [25]:
time_features = time_features_from_frequency_str('15min')
time_features

[<function gluonts.time_feature._base.minute_of_hour(index: pandas.core.indexes.period.PeriodIndex) -> numpy.ndarray>,
 <function gluonts.time_feature._base.hour_of_day(index: pandas.core.indexes.period.PeriodIndex) -> numpy.ndarray>,
 <function gluonts.time_feature._base.day_of_week(index: pandas.core.indexes.period.PeriodIndex) -> numpy.ndarray>,
 <function gluonts.time_feature._base.day_of_month(index: pandas.core.indexes.period.PeriodIndex) -> numpy.ndarray>,
 <function gluonts.time_feature._base.day_of_year(index: pandas.core.indexes.period.PeriodIndex) -> numpy.ndarray>]

In [26]:
time_names = {0:'minute_of_hour', 1:'hour_of_day', 2:'day_of_week', 3:'day_of_month', 4:'day_of_year'}
for i,function in enumerate(time_features):
    df_WFC_filled[time_names[i]] = df_WFC_filled['stock_time'].apply(function)


In [27]:
df_WFC_filled.head()

Unnamed: 0,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles,original,minute_of_hour,hour_of_day,day_of_week,day_of_month,day_of_year
0,2019-03-15 11:15,50.6,50.6,50.36,50.36,300.0,2.0,50.44,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1.0,1.0,-0.245763,-0.021739,0.166667,-0.033333,-0.3
1,2019-03-15 11:30,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0,0.008475,-0.021739,0.166667,-0.033333,-0.3
2,2019-03-15 11:45,50.76,50.76,50.76,50.76,315.0,2.0,50.76,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.0,0.262712,-0.021739,0.166667,-0.033333,-0.3
3,2019-03-15 12:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,1.0,-0.5,0.021739,0.166667,-0.033333,-0.3
4,2019-03-15 12:15,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.3501,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.0,-0.245763,0.021739,0.166667,-0.033333,-0.3


Messed up. Didn't need the filling with time series. 

In [28]:
df_WFC_filled = df_WFC_filled[df_WFC_filled['original']==1].drop(columns = 'original')
df_WFC_filled

Unnamed: 0,stock_time,open,high,low,close,volume,numtrades,vwap,encoded_sentiment,total_articles,minute_of_hour,hour_of_day,day_of_week,day_of_month,day_of_year
0,2019-03-15 11:15,50.6000,50.6000,50.3600,50.3600,300.0,2.0,50.440000,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",1.0,-0.245763,-0.021739,0.166667,-0.033333,-0.30000
1,2019-03-15 11:30,50.7600,50.7600,50.7600,50.7600,315.0,2.0,50.760000,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.008475,-0.021739,0.166667,-0.033333,-0.30000
3,2019-03-15 12:00,50.3501,50.3501,50.3501,50.3501,300.0,1.0,50.350100,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,-0.500000,0.021739,0.166667,-0.033333,-0.30000
5,2019-03-15 12:30,50.5600,50.5600,50.4200,50.4200,200.0,2.0,50.490000,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.008475,0.021739,0.166667,-0.033333,-0.30000
7,2019-03-15 13:00,50.4400,50.5000,50.3000,50.3000,3204.0,17.0,50.362482,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,-0.500000,0.065217,0.166667,-0.033333,-0.30000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175426,2024-03-15 19:45,57.5700,57.6400,57.3800,57.5000,2032933.0,13678.0,57.481044,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.262712,0.326087,0.166667,-0.033333,-0.29726
175427,2024-03-15 20:00,57.5000,57.5100,57.4800,57.5100,19420277.0,169.0,57.510064,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,-0.500000,0.369565,0.166667,-0.033333,-0.29726
175430,2024-03-15 20:45,57.5100,57.5400,57.5100,57.5400,101634.0,14.0,57.513111,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,0.262712,0.369565,0.166667,-0.033333,-0.29726
175431,2024-03-15 21:00,57.5200,57.5200,57.5200,57.5200,200.0,1.0,57.520000,"[tensor(0.), tensor(0.), tensor(0.), tensor(0....",0.0,-0.500000,0.413043,0.166667,-0.033333,-0.29726


## Scale and Split
14 features. Split the data into training set and the holdout set(for validation/testing), saving last 18 months. 

However, to accomdate for 'encoded sentiment', we need to split the it into one variable per dimension of the encoding space. This means we would have 14+37 features... is it worth?? There should be a better way to embed the variables

Likely want to test data again by training over small increments in validation set, since stock data is much more influenced by recent performance. 

In [29]:
train_WFC = df_WFC_filled[df_WFC_filled['stock_time'] <= '2022-09-15']
holdout_WFC = df_WFC_filled[df_WFC_filled['stock_time'] > '2022-09-15']
print(len(train_WFC), len(holdout_WFC), len(df_WFC_filled))
#roughly 18 months leftover for validation and testing... 

42928 16401 59329


In [30]:
train_WFC.set_index('stock_time', inplace = True)
holdout_WFC.set_index('stock_time', inplace = True)

In [31]:
scaler = MinMaxScaler()
to_scale = ['open', 'high', 'low', 'close', 'volume', 'numtrades', 'vwap']

In [32]:
train_WFC[to_scale] = scaler.fit_transform(train_WFC[to_scale])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_WFC[to_scale] = scaler.fit_transform(train_WFC[to_scale])


In [33]:
holdout_WFC[to_scale] = scaler.transform(holdout_WFC[to_scale])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holdout_WFC[to_scale] = scaler.transform(holdout_WFC[to_scale])


## Building the Model

I'm not so sure how to build the informer model with the right parameters. What will follow is to use it directly from the original github source for informer, as well as possibly using the HuggingFace variant. I don't think there should be a difference, but we will see which is easier/harder to use.

First, I want to batch the training data, and figure out how far back we want to include in terms of analysis. 

We will start with predicting one day in advance, using the past two days of data as input. (64 time steps per day because of extended hours)

One argument for not using extended hours: It greatly increases the amount of necessary time steps, and it may not be worth the much increased computational complexity. 

In [34]:
train_WFC.columns

Index(['open', 'high', 'low', 'close', 'volume', 'numtrades', 'vwap',
       'encoded_sentiment', 'total_articles', 'minute_of_hour', 'hour_of_day',
       'day_of_week', 'day_of_month', 'day_of_year'],
      dtype='object')

In [35]:
df_tensor = train_WFC[['open', 'high', 'low', 'close', 'volume', 'numtrades', 'vwap', 'total_articles', 'minute_of_hour', 'hour_of_day',
       'day_of_week', 'day_of_month', 'day_of_year']].map(lambda x: torch.tensor(x, dtype = torch.float64).unsqueeze(0))
df_tensor


Unnamed: 0_level_0,open,high,low,close,volume,numtrades,vwap,total_articles,minute_of_hour,hour_of_day,day_of_week,day_of_month,day_of_year
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-03-15 11:15,"[tensor(0.7575, dtype=torch.float64)]","[tensor(0.7544, dtype=torch.float64)]","[tensor(0.7559, dtype=torch.float64)]","[tensor(0.7513, dtype=torch.float64)]","[tensor(2.5406e-06, dtype=torch.float64)]","[tensor(1.1191e-05, dtype=torch.float64)]","[tensor(0.7544, dtype=torch.float64)]","[tensor(1., dtype=torch.float64)]","[tensor(-0.2458, dtype=torch.float64)]","[tensor(-0.0217, dtype=torch.float64)]","[tensor(0.1667, dtype=torch.float64)]","[tensor(-0.0333, dtype=torch.float64)]","[tensor(-0.3000, dtype=torch.float64)]"
2019-03-15 11:30,"[tensor(0.7616, dtype=torch.float64)]","[tensor(0.7584, dtype=torch.float64)]","[tensor(0.7661, dtype=torch.float64)]","[tensor(0.7615, dtype=torch.float64)]","[tensor(2.6680e-06, dtype=torch.float64)]","[tensor(1.1191e-05, dtype=torch.float64)]","[tensor(0.7626, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.0085, dtype=torch.float64)]","[tensor(-0.0217, dtype=torch.float64)]","[tensor(0.1667, dtype=torch.float64)]","[tensor(-0.0333, dtype=torch.float64)]","[tensor(-0.3000, dtype=torch.float64)]"
2019-03-15 12:00,"[tensor(0.7512, dtype=torch.float64)]","[tensor(0.7480, dtype=torch.float64)]","[tensor(0.7556, dtype=torch.float64)]","[tensor(0.7511, dtype=torch.float64)]","[tensor(2.5406e-06, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.7521, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(-0.5000, dtype=torch.float64)]","[tensor(0.0217, dtype=torch.float64)]","[tensor(0.1667, dtype=torch.float64)]","[tensor(-0.0333, dtype=torch.float64)]","[tensor(-0.3000, dtype=torch.float64)]"
2019-03-15 12:30,"[tensor(0.7565, dtype=torch.float64)]","[tensor(0.7534, dtype=torch.float64)]","[tensor(0.7574, dtype=torch.float64)]","[tensor(0.7528, dtype=torch.float64)]","[tensor(1.6909e-06, dtype=torch.float64)]","[tensor(1.1191e-05, dtype=torch.float64)]","[tensor(0.7557, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.0085, dtype=torch.float64)]","[tensor(0.0217, dtype=torch.float64)]","[tensor(0.1667, dtype=torch.float64)]","[tensor(-0.0333, dtype=torch.float64)]","[tensor(-0.3000, dtype=torch.float64)]"
2019-03-15 13:00,"[tensor(0.7535, dtype=torch.float64)]","[tensor(0.7518, dtype=torch.float64)]","[tensor(0.7543, dtype=torch.float64)]","[tensor(0.7498, dtype=torch.float64)]","[tensor(2.7216e-05, dtype=torch.float64)]","[tensor(0.0002, dtype=torch.float64)]","[tensor(0.7525, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(-0.5000, dtype=torch.float64)]","[tensor(0.0652, dtype=torch.float64)]","[tensor(0.1667, dtype=torch.float64)]","[tensor(-0.0333, dtype=torch.float64)]","[tensor(-0.3000, dtype=torch.float64)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-14 21:45,"[tensor(0.5743, dtype=torch.float64)]","[tensor(0.5718, dtype=torch.float64)]","[tensor(0.5774, dtype=torch.float64)]","[tensor(0.5742, dtype=torch.float64)]","[tensor(1.1598e-05, dtype=torch.float64)]","[tensor(8.9528e-05, dtype=torch.float64)]","[tensor(0.5748, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.2627, dtype=torch.float64)]","[tensor(0.4130, dtype=torch.float64)]","[tensor(-0.1667, dtype=torch.float64)]","[tensor(-0.0667, dtype=torch.float64)]","[tensor(0.2014, dtype=torch.float64)]"
2022-09-14 22:00,"[tensor(0.5743, dtype=torch.float64)]","[tensor(0.5720, dtype=torch.float64)]","[tensor(0.5779, dtype=torch.float64)]","[tensor(0.5744, dtype=torch.float64)]","[tensor(2.9484e-06, dtype=torch.float64)]","[tensor(3.3573e-05, dtype=torch.float64)]","[tensor(0.5750, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(-0.5000, dtype=torch.float64)]","[tensor(0.4565, dtype=torch.float64)]","[tensor(-0.1667, dtype=torch.float64)]","[tensor(-0.0667, dtype=torch.float64)]","[tensor(0.2014, dtype=torch.float64)]"
2022-09-14 22:45,"[tensor(0.5743, dtype=torch.float64)]","[tensor(0.5718, dtype=torch.float64)]","[tensor(0.5779, dtype=torch.float64)]","[tensor(0.5742, dtype=torch.float64)]","[tensor(8.4119e-07, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.5750, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.2627, dtype=torch.float64)]","[tensor(0.4565, dtype=torch.float64)]","[tensor(-0.1667, dtype=torch.float64)]","[tensor(-0.0667, dtype=torch.float64)]","[tensor(0.2014, dtype=torch.float64)]"
2022-09-14 23:30,"[tensor(0.5745, dtype=torch.float64)]","[tensor(0.5720, dtype=torch.float64)]","[tensor(0.5779, dtype=torch.float64)]","[tensor(0.5742, dtype=torch.float64)]","[tensor(1.2737e-05, dtype=torch.float64)]","[tensor(1.1191e-05, dtype=torch.float64)]","[tensor(0.5751, dtype=torch.float64)]","[tensor(0., dtype=torch.float64)]","[tensor(0.0085, dtype=torch.float64)]","[tensor(0.5000, dtype=torch.float64)]","[tensor(-0.1667, dtype=torch.float64)]","[tensor(-0.0667, dtype=torch.float64)]","[tensor(0.2014, dtype=torch.float64)]"


In [36]:
df_tensor['encoded_sentiment'] = train_WFC['encoded_sentiment']

In [37]:
rows = [torch.cat([df_tensor[col][i] for col in df_tensor.columns]) for i in range(len(df_tensor))]
final = torch.stack(rows)
final.shape

  rows = [torch.cat([df_tensor[col][i] for col in df_tensor.columns]) for i in range(len(df_tensor))]


torch.Size([42928, 51])

In [38]:
final[0:3, 3]

tensor([0.7513, 0.7615, 0.7511], dtype=torch.float64)

In [39]:
# def data_splitter(data, input_length, predict_length):
#     X, y = [], []
#     for i in range(len(data) - input_length - predict_length):
#         X.append(data.iloc[i:i+input_length].values)
#         y.append(data.iloc[i+input_length:i+input_length+predict_length]['close'].values)
#     return np.array(X), np.array(y)

In [40]:
# input_length = 128 
# predict_length = 64
# X, y = data_splitter(train_WFC, input_length, predict_length)

convert X to a tensor. To do this, need to convert the non-tuple column separately. Also want two cases, depending on whether I want to make it a single value tensor or work with the encoded values. 

Create Dataset to feed into informer

In [41]:
class TimeData(Dataset):
    def __init__(self, data, seq_len, pred_len):
        self.data = data
        self.seq_len = seq_len
        self.pred_len = pred_len
    
    def __len__(self):
        return len(self.data) - self.seq_len - self.pred_len

    def __getitem__(self, idx):
        seq_X = self.data[idx:idx+self.seq_len]
        seq_y = self.data[idx + self.seq_len: idx + self.seq_len + self.pred_len, 3]

        return seq_X, seq_y


In [42]:
dataset = TimeData(final, 128, 64)

In [43]:
dataloader = DataLoader(dataset, batch_size= 16, shuffle = False, num_workers = 4)

In [44]:
device = torch.device('mps')

In [45]:
batch = next(iter(dataloader))

: 

: 

Some things to try include:

Training an informer model on multiple stock time series, with Ticker/Sector as static variables. This may end up improving the model, especially within the sector. However, this may also take forever to train... 