# Data processing and sentiment analysis
This notebook illustrates the pre-processing steps performed on the gathered data, performs sentiment analysis of the news titles and formulates the feature set taken as inputs by the Informer model

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

## The gathered data overview
Read and describe the basic information about the datasets
The *general_news* dataset contains articles related to the stock market at large
The *ticker_news* dataset contains news relevant to a given ticker

In [3]:
general_news = pd.read_json("../bp-scrapper-data/news.jsonl", lines=True)
prices = pd.read_json("../bp-scrapper-data/price.jsonl", lines=True)
ticker_news = pd.read_json("../bp-scrapper-data/ticker_news.jsonl", lines=True)

In [4]:
general_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13418 entries, 0 to 13417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       13418 non-null  object        
 1   title     13418 non-null  object        
 2   summary   13418 non-null  object        
 3   date      13418 non-null  datetime64[ns]
 4   category  13418 non-null  object        
 5   tickers   13418 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 629.1+ KB


In [5]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5445 entries, 0 to 5444
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ticker  5445 non-null   object        
 1   price   5445 non-null   float64       
 2   date    5445 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 127.7+ KB


In [6]:
ticker_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       7115 non-null   object        
 1   title     7115 non-null   object        
 2   summary   7115 non-null   object        
 3   date      7115 non-null   datetime64[ns]
 4   category  0 non-null      float64       
 5   tickers   7115 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 333.6+ KB


In [7]:
tickers = prices.ticker.unique()
tickers

array(['META', 'AAPL', 'GOOGL', 'NFLX', 'AMZN', '^GSPC'], dtype=object)

## Initial text preprocessing
Remove *Yahoo!Finance* special terms from the news titles

In [8]:
def remove_terms(row):
    i = row.title.find('-')
    if i != -1 and i < 30 and (row.title[i + 1].isupper() or row.title[i + 1].isnumeric()):
        row.title = row.title.split('-', 1)[1]
    return row

def df_remove_terms(df):
    return df.apply(lambda row: remove_terms(row), axis=1)

Drop duplicates [url, titles]

In [9]:
def df_drop_duplicates(df, columns):
    return df.drop_duplicates(columns)

Drop news unrelated to relevant tickers

In [10]:
def has_relevant_tickers(news_tickers):
    return len(set(news_tickers)) == 0 or len(set(news_tickers).intersection(set(tickers))) != 0

def df_drop_irrelevant_tickers(df):
    return df[df.apply(lambda row: has_relevant_tickers(row.tickers), axis=1)]

Drop titles containing less than 30 symbols

In [11]:
def df_drop_short_titles(df):
    return df[df.apply(lambda row: len(row.title) >= 30, axis=1)]

Remove titles from *general_news* that are already included in *ticker_news* dataset

In [12]:
def df_remove_titles(df):
    return df[~df.title.isin(ticker_news.title.values.tolist())]

### Perform pre-processing steps
Use the preprocessing steps introduced above on *general_news* and *ticker_news* datasets

In [13]:
news_len = len(general_news)

general_news = df_remove_terms(general_news)
general_news = df_drop_duplicates(general_news, ['url'])
general_news = df_drop_duplicates(general_news, ['title'])
general_news = df_drop_short_titles(general_news)
general_news = df_drop_irrelevant_tickers(general_news)
general_news = df_remove_titles(general_news)

print(f'news: {news_len} -> {len(general_news)}')

news: 13418 -> 6587


In [14]:
ticker_news_len = {}
for ticker in tickers:
    ticker_news_len[ticker] = len(ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)])

ticker_news = df_remove_terms(ticker_news)
ticker_news = df_drop_duplicates(ticker_news, ['url'])
ticker_news = df_drop_duplicates(ticker_news, ['title'])
ticker_news = df_drop_short_titles(ticker_news)
ticker_news = df_drop_irrelevant_tickers(ticker_news)

for ticker in tickers:
    print(f'{ticker} news: {ticker_news_len[ticker]} -> {len(ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)])}')

META news: 601 -> 445
AAPL news: 1057 -> 771
GOOGL news: 664 -> 433
NFLX news: 348 -> 257
AMZN news: 1053 -> 788
^GSPC news: 3392 -> 2383


## Sentiment Classification Pipeline
Prepare the text classification pipeline used to assign news articles respective sentiment scores
The pipeline uses **WordPiece** tokenization and **FinBERT** text classification model

In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TextClassificationPipeline

In [16]:
model_name = "ProsusAI/finbert"
labels = ['negative', 'neutral', 'positive']

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [18]:
config = AutoConfig.from_pretrained(model_name, num_labels=len(labels))
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config).to('cuda')

In [19]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0)

## Sentiment scores
The text classification pipeline assigns each title a sentiment score: the probability of it expressing a *negative*, *neutral* or *positive* market sentiment.

In [20]:
def sentiment_score(row, col='title'):
    scores = pipe(row[col], top_k=len(labels))
    score_dict = {score['label']: score['score'] for score in scores}
    row[f'{col}_label'] = scores[0]['label']
    row.negative, row.neutral, row.positive = score_dict['negative'], score_dict['neutral'], score_dict['positive']
    return row

def df_sentiment_score(df, col='title'):
    df[f'{col}_label'] = None
    df['negative'] = df['neutral'] = df['positive'] = 0
    return df.apply(lambda row: sentiment_score(row, col), axis=1)

Assign sentiment scores to *general_news* and *ticker_news* article titles
Save labels for article titles and summary for comparison

In [21]:
general_news = df_sentiment_score(general_news, 'title')
general_news.head()

Unnamed: 0,url,title,summary,date,category,tickers,title_label,negative,neutral,positive
0,https://finance.yahoo.com/news/1-u-n-mulls-sen...,"As U.N. mulls sending troops to Haiti, a gang'...",As Haiti's gang-induced humanitarian crisis de...,2022-10-29 12:43:23,World,[],negative,0.598501,0.33435,0.067149
1,https://finance.yahoo.com/news/britain-denies-...,Britain denies Russian claims that its navy pe...,Britain on Saturday denied Russian claims that...,2022-10-29 12:47:11,World,[],negative,0.86572,0.098223,0.036057
4,https://finance.yahoo.com/news/ukraine-questio...,Ukraine questions Twitter takeover amid precar...,A senior Ukrainian official expressed sceptici...,2022-10-29 14:21:07,World,[],negative,0.946472,0.038674,0.014854
6,https://finance.yahoo.com/news/ukraine-latest-...,Ukraine Latest: Russia Says UK Involved in Nav...,(Bloomberg) -- Most Read from BloombergTesla E...,2022-10-29 14:32:04,World,[],neutral,0.132021,0.829115,0.038864
7,https://finance.yahoo.com/news/strong-dollar-s...,Strong Dollar Seen Hurting US Outlook and Even...,(Bloomberg) -- A strong dollar is likely to we...,2022-10-29 13:15:00,Business,[],negative,0.925865,0.037888,0.036247


In [22]:
ticker_news = df_sentiment_score(ticker_news, 'title')
ticker_news.head()

Unnamed: 0,url,title,summary,date,category,tickers,title_label,negative,neutral,positive
0,https://www.barrons.com/articles/bitcoin-digit...,"Digital Dollars Could Be a Boon for Amazon, Al...","Companies like Amazon.com, Alphabet, and Accen...",2022-10-29 12:27:00,,[AMZN],neutral,0.064423,0.681635,0.253942
2,https://finance.yahoo.com/news/making-sense-bi...,Making Sense of Big Tech Earnings After Amazon...,The earnings bombshells from Amazon and Meta a...,2022-10-28 22:44:10,,[AMZN],neutral,0.016832,0.815436,0.167732
4,https://www.bizjournals.com/denver/news/2022/1...,"ULA, partners start expansions to ahead of Pro...",Colorado aerospace company United Launch Allia...,2022-10-28 20:57:12,,[AMZN],positive,0.010957,0.334704,0.654339
5,https://www.thestreet.com/technology/amazon-lo...,Amazon Briefly Leaves the $1 Trillion Club,The e-commerce giant falls after delivering di...,2022-10-28 20:47:00,,[AMZN],neutral,0.337673,0.627862,0.034464
6,https://www.investors.com/market-trend/stock-m...,"Dow Jones Futures: Market Rally Revs Higher, T...",The stock market rally faced hurdles but revve...,2022-10-29 12:10:31,,[META],neutral,0.066571,0.724576,0.208853


### Sentiment score analysis
Analyse *general_news* and *ticker_news* datasets sentiment classes distribution

In [23]:
from collections import Counter

In [24]:
Counter(general_news.title_label)

Counter({'negative': 2558, 'neutral': 2460, 'positive': 1569})

In [25]:
Counter(ticker_news.title_label)

Counter({'neutral': 2423, 'positive': 848, 'negative': 1806})

## Ticker prices pre-processing
All the operations on the *prices* dataset are performed per ticker
The ticker prices were generally scrapped with 15 min intervals
To ease the processing of the timestamps they are standardized (rounded up to the nearest quarter), redundant timestamps are removed

In [26]:
prices.head()

Unnamed: 0,ticker,price,date
0,META,99.2,2022-10-28 20:00:04
1,AAPL,155.74,2022-10-28 20:00:04
2,GOOGL,96.29,2022-10-28 20:00:04
3,NFLX,295.72,2022-10-28 20:00:04
4,AMZN,103.41,2022-10-28 20:00:04


In [27]:
def round_dt(df, row, delta='15min'):
    dt_floor, dt_ceil = row.date.floor(delta), row.date.ceil(delta)
    row.date = dt_ceil if dt_floor in df.date.tolist() else dt_floor
    return row

def df_round_dt(df, delta='15min'):
    return df.apply(lambda row: round_dt(df, row, delta), axis=1)

Add an hour to timestamps before Sunday, November 6 to account for the US daylight saving time

In [28]:
def account_dst(row):
    if row.date < pd.Timestamp('2022-11-06'):
        row.date += pd.DateOffset(hours=1)
    return row

def df_account_dst(df):
    return df.apply(lambda row: account_dst(row), axis=1)

In [29]:
ticker_dfs = {}

for ticker in tickers:
    ticker_df = prices[prices.ticker == ticker]
    df_len = len(ticker_df)

    ticker_df = df_round_dt(ticker_df)
    ticker_df = df_account_dst(ticker_df)
    ticker_df = df_drop_duplicates(ticker_df, 'date')

    ticker_dfs[ticker] = ticker_df
    print(f'{ticker} prices:\t{df_len} -> {len(ticker_df)}')

META prices:	889 -> 873
AAPL prices:	897 -> 878
GOOGL prices:	903 -> 881
NFLX prices:	892 -> 876
AMZN prices:	900 -> 884
^GSPC prices:	964 -> 937


## Technical indicators
Add intraday *returns* and *moving overage* indicators with periods in [10, 30, 60]

In [30]:
def df_returns(df):
    df['return'] = df.price - df.price.shift(1)
    return df

In [31]:
def df_ma(df, period):
    df[f'ma_{period}'] = df.price.rolling(period).mean()
    return df

In [32]:
for ticker, ticker_df in ticker_dfs.items():
    ticker_df = df_returns(ticker_df)
    ticker_df = df_ma(ticker_df, 10)
    ticker_df = df_ma(ticker_df, 30)
    ticker_df = df_ma(ticker_df, 60)

In [33]:
ticker_dfs['GOOGL'].tail()

Unnamed: 0,ticker,price,date,return,ma_10,ma_30,ma_60
5414,GOOGL,90.065,2022-12-16 20:00:00,0.175,89.94016,90.272003,91.326163
5423,GOOGL,90.35,2022-12-16 20:15:00,0.285,89.99366,90.266503,91.22908
5428,GOOGL,90.66,2022-12-16 20:30:00,0.31,90.0565,90.266837,91.163747
5436,GOOGL,90.7,2022-12-16 20:45:00,0.04,90.139,90.26117,91.094913
5437,GOOGL,90.26,2022-12-16 21:00:00,-0.44,90.168,90.237503,91.028247


Analyse the percentage of negative returns for each ticker

In [34]:
for ticker, ticker_df in ticker_dfs.items():
    print(f'Negative returns share {ticker}:\t{round(len(ticker_df[ticker_df["return"] < 0]) / len(ticker_df) * 100)}%')

Negative returns share META:	49%
Negative returns share AAPL:	49%
Negative returns share GOOGL:	51%
Negative returns share NFLX:	50%
Negative returns share AMZN:	52%
Negative returns share ^GSPC:	46%


## Fundamental indicators: sentiment values
Add sentiment value for each data point in *prices* dataset. Sentiment value is calculated as a sum of sentiment scores of individual news articles with date of issue in (previous timestamp, current timestamp). General news are considered to have the same impact on all tickers, while ticker specific news are computed individually.

In [35]:
def compute_sent_value(df, dt1, dt2, prefix='general', mode='sum'):
    if mode == 'sum':
        return df[(df.date >= dt1) & (df.date <= dt2)][labels].sum().fillna(0).rename(lambda c: f'{prefix}_{c}_{mode}').to_frame().T
    else:
        return df[(df.date >= dt1) & (df.date <= dt2)][labels].mean().fillna(0).rename(lambda c: f'{prefix}_{c}_{mode}').to_frame().T

def df_compute_sent_value(df, df_sentiment, prefix='general', mode='sum'):
    sentiment_values = pd.concat([compute_sent_value(df_sentiment, dt1, dt2, prefix, mode) for dt1, dt2 in zip(list(df.date), list(df.shift(-1).date))])
    return pd.concat([df.reset_index(drop=True), sentiment_values.reset_index(drop=True)], axis=1)

Add sentiment value for each data point in *prices* dataset. Sentiment value is calculated as a count of individual negative, neutral and positive news articles with date of issue in (previous timestamp, current timestamp). General news are considered to have the same impact on all tickers, while ticker specific news are computed individually.

In [36]:
def count_sent_value(df, dt1, dt2, prefix='general'):
    return df[(df.date >= dt1) & (df.date <= dt2)][[f'{l}_count' for l in labels]].sum().fillna(0).rename(lambda c: f'{prefix}_{c}').to_frame().T

def df_count_sent_value(df, df_sentiment, prefix='general'):
    df_sentiment_new = pd.concat([df_sentiment, pd.get_dummies(df_sentiment.title_label).rename(lambda c: f'{c}_count', axis=1)], axis=1)
    sentiment_values = pd.concat([count_sent_value(df_sentiment_new, dt1, dt2, prefix) for dt1, dt2 in zip(list(df.date), list(df.shift(-1).date))])
    return pd.concat([df.reset_index(drop=True), sentiment_values.reset_index(drop=True)], axis=1)

Add sentiment value for each data point in prices dataset. Sentiment value is calculated as a mean of general news mean sentiment and company specific mean sentiment

In [37]:
def df_compute_gen_com_sent_value(df):
    df['gen_com_negative_mean'] = df[['general_negative_mean', 'company_negative_mean']].mean(axis=1)
    df['gen_com_neutral_mean'] = df[['general_neutral_mean', 'company_neutral_mean']].mean(axis=1)
    df['gen_com_positive_mean'] = df[['general_positive_mean', 'company_positive_mean']].mean(axis=1)
    return df

Add fundamental indicators

In [38]:
for ticker, ticker_df in ticker_dfs.items():
    ticker_df = df_compute_sent_value(ticker_df, general_news, 'general', 'sum')
    ticker_df = df_compute_sent_value(ticker_df, general_news, 'general', 'mean')

    ticker_df = df_compute_sent_value(ticker_df, ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)], 'company', 'sum')
    ticker_df = df_compute_sent_value(ticker_df, ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)], 'company', 'mean')

    ticker_df = df_count_sent_value(ticker_df, general_news, 'general')

    ticker_df = df_count_sent_value(ticker_df, ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)], 'company')

    ticker_df = df_compute_gen_com_sent_value(ticker_df)

    ticker_dfs[ticker] = ticker_df

Potential feature set

In [39]:
ticker_dfs['AAPL'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ticker                  878 non-null    object        
 1   price                   878 non-null    float64       
 2   date                    878 non-null    datetime64[ns]
 3   return                  877 non-null    float64       
 4   ma_10                   869 non-null    float64       
 5   ma_30                   849 non-null    float64       
 6   ma_60                   819 non-null    float64       
 7   general_negative_sum    878 non-null    float64       
 8   general_neutral_sum     878 non-null    float64       
 9   general_positive_sum    878 non-null    float64       
 10  general_negative_mean   878 non-null    float64       
 11  general_neutral_mean    878 non-null    float64       
 12  general_positive_mean   878 non-null    float64   

# Hyperparameter tuning
A subset of features computed above will be used for Infromer's hyperparameter tuning
AAPL dataset with features [date, price, ma_30, general_negative_sum, general_neutral_sum, general_positive_sum, company_negative_sum, company_neutral_sum, company_positive_sum] is selected

In [40]:
features = ['date', 'price', 'return', 'ma_10', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum']
feature_num = len(features) - 1 # date is not considered a feature by the Informer
target = 'price'

In [41]:
aapl = ticker_dfs['AAPL'][features].dropna()
aapl = aapl.set_index('date')
aapl

Unnamed: 0_level_0,price,return,ma_10,general_negative_sum,general_neutral_sum,general_positive_sum,company_negative_sum,company_neutral_sum,company_positive_sum
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-10-31 16:30:00,153.6350,0.3250,153.28650,0.935387,0.045367,0.019246,0.000000,0.000000,0.000000
2022-10-31 16:45:00,153.1750,-0.4600,153.03000,1.073090,0.195716,0.731193,0.000000,0.000000,0.000000
2022-10-31 17:00:00,153.6001,0.4251,153.05801,0.991520,0.891117,1.117363,0.000000,0.000000,0.000000
2022-10-31 17:15:00,153.9350,0.3349,153.14751,1.725424,1.371037,0.903539,1.801441,0.160080,0.038479
2022-10-31 17:30:00,153.1900,-0.7450,153.20301,0.920477,0.068049,0.011474,0.875410,0.111744,0.012846
...,...,...,...,...,...,...,...,...,...
2022-12-16 20:00:00,134.5467,0.3817,134.20297,1.308567,0.465379,0.226054,0.000000,0.000000,0.000000
2022-12-16 20:15:00,134.7516,0.2049,134.24063,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2022-12-16 20:30:00,135.1750,0.4234,134.33213,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2022-12-16 20:45:00,135.0200,-0.1550,134.42713,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Saving dataset

In [42]:
aapl.to_csv("./data/stock/data-fine-tuning.csv")

Setting Informer's default parameters

In [43]:
default_args = {
    'model': 'informer',

    'data': 'data-fine-tuning',
    'root_path': './data/stock',
    'data_path': 'AAPL.csv',
    'features': 'MS',
    'ftr_num': feature_num,
    'd_out': 1,
    'target': target,
    'freq': '15t',

    'seq_len': 27,
    'pred_len': 54,

    'itr': 10,
    'train_epochs': 10,
    'batch_size': 24,
    'patience': 5,
    'learning_rate': 0.0001,
    'loss': 'mse',
    'lradj': 'type1',
    'inverse': False,

    'd_model': 512,
    'n_heads': 4,
    'e_layers': 4,
    'd_ff': 2048,

    'embed': 't2v',
    'activation': 'gelu',
    'padding': 0,
    'dropout': 0.05,

    'output_attention': False,
    'predict': False,

    'num_workers': 0,
    'use_gpu': True,
    'gpu': 0,
    'use_multi_gpu': False,
    'devices': '0'
}

Hyperparameter tuning is performed using a grid search


In [44]:
parameters = {
    'batch_size': [6, 27, 54],
    'd_model': [256, 512],
    'n_heads': [4, 6, 10],
    'e_layers': [4, 6, 10],
    'activation': ['gelu', 'relu']
}

In [45]:
import itertools

In [46]:
combinations = list(itertools.product(*parameters.values()))
total = len(combinations)
print(f'Param combinations: {total}')

Param combinations: 108


Performing grid search

In [47]:
from run_informer import train_informer

In [82]:
params_evals = {}
counter = 0

for batch_size, d_model, n_heads, e_layers, activation in combinations:
    model_args = default_args.copy()
    model_args['batch_size'] = batch_size
    model_args['d_model'] = d_model
    model_args['n_heads'] = n_heads
    model_args['e_layers'] = e_layers
    model_args['d_ff'] = 4 * d_model
    model_args['activation'] = activation

    exp = train_informer(model_args, supress_output=True)
    params_evals[(batch_size, d_model, n_heads, e_layers, activation)] = exp.val_loss_min

    counter += 1
    print(f'{counter} out of {total} - mse: {exp.val_loss_min}')

1 out of 108 - mse: 0.26899251341819763
2 out of 108 - mse: 0.20969799160957336
3 out of 108 - mse: 0.193546861410141
4 out of 108 - mse: 0.2284192442893982
5 out of 108 - mse: 0.22204357385635376
6 out of 108 - mse: 0.3281112313270569
7 out of 108 - mse: 0.2816500663757324
8 out of 108 - mse: 0.2421703338623047
9 out of 108 - mse: 0.2289111316204071
10 out of 108 - mse: 0.2516852021217346
11 out of 108 - mse: 0.1628008782863617
12 out of 108 - mse: 0.263691246509552
13 out of 108 - mse: 0.1896105855703354
14 out of 108 - mse: 0.18860214948654175
15 out of 108 - mse: 0.2411007434129715
16 out of 108 - mse: 0.2560585141181946
17 out of 108 - mse: 0.20932245254516602
18 out of 108 - mse: 0.3189963102340698
19 out of 108 - mse: 0.23817308247089386
20 out of 108 - mse: 0.2724705934524536
21 out of 108 - mse: 0.2017807960510254
22 out of 108 - mse: 0.24467429518699646
23 out of 108 - mse: 0.2420923113822937
24 out of 108 - mse: 0.20827147364616394
25 out of 108 - mse: 0.18005388975143433
26

Saving best performing arguments
Printing top 10

In [88]:
params_evals = sorted(params_evals.items(), key=lambda it: it[1])
best_args = params_evals[0][0]
print('{: <10} {: <7} {: <6} {: <8} {: <10} | {: <10}'.format(*parameters.keys(), 'mse'))
print('-' * 69)
for k, v in params_evals[:10]:
    print('{: <10} {: <7} {: <7} {: <8} {: <10} | {: <10.4f}'.format(*k, v))

batch_size d_model n_heads e_layers activation | mse       
---------------------------------------------------------------------
6          512     10      6        gelu       | 0.1604    
6          256     6       10       gelu       | 0.1628    
54         512     10      10       relu       | 0.1722    
54         512     4       10       relu       | 0.1725    
6          512     6       4        gelu       | 0.1801    
54         512     6       10       relu       | 0.1819    
54         512     4       10       gelu       | 0.1845    
6          256     10      4        relu       | 0.1886    
6          256     10      4        gelu       | 0.1896    
6          256     4       6        gelu       | 0.1935    


In [90]:
model_args = default_args
(model_args['batch_size'], model_args['d_model'], model_args['n_heads'], model_args['e_layers'], model_args['activation']) = best_args
model_args['d_ff'] = 4 * model_args['d_model']
model_args

{'model': 'informer',
 'data': 'data-fine-tuning',
 'root_path': './data/stock',
 'data_path': 'AAPL.csv',
 'features': 'MS',
 'ftr_num': 9,
 'd_out': 1,
 'target': 'price',
 'freq': '15t',
 'seq_len': 27,
 'pred_len': 54,
 'itr': 10,
 'train_epochs': 10,
 'batch_size': 6,
 'patience': 5,
 'learning_rate': 0.0001,
 'loss': 'mse',
 'lradj': 'type1',
 'inverse': False,
 'd_model': 512,
 'n_heads': 10,
 'e_layers': 6,
 'd_ff': 2048,
 'embed': 't2v',
 'activation': 'gelu',
 'padding': 0,
 'dropout': 0.05,
 'output_attention': False,
 'predict': False,
 'num_workers': 0,
 'use_gpu': True,
 'gpu': 0,
 'use_multi_gpu': False,
 'devices': '0'}

## Feature set analysis
Analysing which composition of features performs best on a tuned model
Checking if media sentiment analysis is beneficial

In [91]:
ticker_dfs['AAPL'].columns.tolist()

['ticker',
 'price',
 'date',
 'return',
 'ma_10',
 'ma_30',
 'ma_60',
 'general_negative_sum',
 'general_neutral_sum',
 'general_positive_sum',
 'general_negative_mean',
 'general_neutral_mean',
 'general_positive_mean',
 'company_negative_sum',
 'company_neutral_sum',
 'company_positive_sum',
 'company_negative_mean',
 'company_neutral_mean',
 'company_positive_mean',
 'general_negative_count',
 'general_neutral_count',
 'general_positive_count',
 'company_negative_count',
 'company_neutral_count',
 'company_positive_count',
 'gen_com_negative_mean',
 'gen_com_neutral_mean',
 'gen_com_positive_mean']

In [92]:
feature_sets = [
    ['date', 'price'],
    ['date', 'price', 'return'],
    ['date', 'price', 'ma_10'],
    ['date', 'price', 'ma_30'],
    ['date', 'price', 'ma_60'],
    ['date', 'price', 'return', 'ma_10'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
    ['date', 'price', 'general_negative_count', 'general_neutral_count', 'general_positive_count'],
    ['date', 'price', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum'],
    ['date', 'price', 'company_negative_mean', 'company_neutral_mean', 'company_positive_mean'],
    ['date', 'price', 'company_negative_count', 'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'gen_com_negative_mean', 'gen_com_neutral_mean', 'gen_com_positive_mean'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean', 'company_negative_mean', 'company_neutral_mean', 'company_positive_mean'],
    ['date', 'price', 'general_negative_count', 'general_neutral_count', 'general_positive_count', 'company_negative_count', 'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'return', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
    ['date', 'price', 'return', 'gen_com_negative_mean', 'gen_com_neutral_mean', 'gen_com_positive_mean'],
]

total = len(feature_sets)

In [93]:
def eval_ftr_set(label, ftr_set, seq_len, pred_len):

    df = ticker_dfs[label][ftr_set].dropna()
    df = df.set_index('date')
    df.to_csv(f'./data/stock/{label}.csv')

    args = model_args.copy()
    args['data_path'] = f'{label}.csv'
    args['ftr_num'] = len(ftr_set) - 1
    args['seq_len'] = seq_len
    args['pred_len'] = pred_len

    return train_informer(args, supress_output=True).test()[0]

Perform feature set analysis over variable length sequences

In [94]:
model_evals = {}
seq_pred_lengths = [[27, 27], [27, 54], [27, 81]]

counter = 0
total = len(seq_pred_lengths) * len(tickers) * len(feature_sets)
print(total)

for ticker in tickers:
    model_evals[ticker] = {}
    for feature_set in feature_sets:
        model_evals[ticker][tuple(feature_set)] = {}
        for seq_pred in seq_pred_lengths:
            loss = eval_ftr_set(ticker, feature_set, seq_pred[0], seq_pred[1])
            model_evals[ticker][tuple(feature_set)][tuple(seq_pred)] = loss

            counter += 1
            print(f'{counter} out of {total} - mse: {loss}')

324
1 out of 324 - mse: 0.21827782690525055
2 out of 324 - mse: 0.6128498315811157
3 out of 324 - mse: 0.6515957713127136
4 out of 324 - mse: 0.27997922897338867
5 out of 324 - mse: 0.42263373732566833
6 out of 324 - mse: 0.4873882234096527
7 out of 324 - mse: 0.25352951884269714
8 out of 324 - mse: 0.37310996651649475
9 out of 324 - mse: 0.6503525376319885
10 out of 324 - mse: 0.29422491788864136
11 out of 324 - mse: 0.44784048199653625
12 out of 324 - mse: 0.6508237719535828
13 out of 324 - mse: 0.3948972523212433
14 out of 324 - mse: 0.5443266034126282
15 out of 324 - mse: 0.7468590140342712
16 out of 324 - mse: 0.31534966826438904
17 out of 324 - mse: 0.5051328539848328
18 out of 324 - mse: 0.5515772700309753
19 out of 324 - mse: 0.3814734220504761
20 out of 324 - mse: 0.7256429195404053
21 out of 324 - mse: 0.7738322615623474
22 out of 324 - mse: 0.41666311025619507
23 out of 324 - mse: 0.6965992450714111
24 out of 324 - mse: 0.8064910173416138
25 out of 324 - mse: 0.4024934768676

Display the results

In [135]:
import numpy as np

In [139]:
features = [
    'price',
    'price + return',
    'price + ma_10',
    'price + ma_30',
    'price + ma_60',
    'price + return + ma_10',
    'price + general_sum',
    'price + general_mean',
    'price + general_count',
    'price + company_sum',
    'price + company_mean',
    'price + company_count',
    'price + gen_com_mean',
    'price + general_sum + company_sum',
    'price + general_mean + company_mean',
    'price + general_count + company_count',
    'price + return + general_mean',
    'price + return + gen_com_mean'
]

In [224]:
rows = []

for ftr_set in feature_sets:
    row = []
    for pred_len in [27, 54, 81]:
        for ticker in tickers:
            row.append(round(model_evals[ticker][tuple(ftr_set)][tuple([27, pred_len])], 3))
    rows.append(row)

rows = np.array(rows)

In [None]:
res27 = pd.DataFrame(data=rows[:, :6], index=features)
res27.columns = list(tickers)

mins27 = res27.min(numeric_only=True)
best27 = res27.apply(lambda row: (row.eq(mins27).values == True).sum(), axis=1)

res27['Average'] = res27.mean(numeric_only=True, axis=1).round(3)
res27['Best'] = best27

In [None]:
res54 = pd.DataFrame(data=rows[:, 6:12], index=features)
res54.columns = list(tickers)

mins54 = res54.min(numeric_only=True)
best54 = res54.apply(lambda row: (row.eq(mins54).values == True).sum(), axis=1)

res54['Average'] = res54.mean(numeric_only=True, axis=1).round(3)
res54['Best'] = best54

In [None]:
res81 = pd.DataFrame(data=rows[:, 12:], index=features)
res81.columns = list(tickers)

mins81 = res81.min(numeric_only=True)
best81 = res81.apply(lambda row: (row.eq(mins81).values == True).sum(), axis=1)

res81['Average'] = res81.mean(numeric_only=True, axis=1).round(3)
res81['Best'] = best81

In [52]:
res27 = pd.read_csv('res27.csv', index_col=0)
res54 = pd.read_csv('res54.csv', index_col=0)
res81 = pd.read_csv('res81.csv', index_col=0)

In [53]:
res27.sort_values(by=['Average', 'Best'], ascending=[1, 0])

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
price + return + general_mean,0.286,0.316,0.349,0.774,0.323,0.326,0.396,1
price + return + gen_com_mean,0.295,0.422,0.382,0.714,0.356,0.266,0.406,0
price + return,0.28,0.42,0.385,0.719,0.348,0.307,0.41,0
price + general_sum,0.381,0.335,0.39,0.728,0.255,0.379,0.411,0
price,0.218,0.587,0.403,0.597,0.337,0.346,0.415,2
price + return + ma_10,0.315,0.418,0.454,0.703,0.334,0.275,0.416,0
price + general_count,0.402,0.336,0.394,0.753,0.273,0.382,0.423,0
price + ma_10,0.254,0.643,0.475,0.597,0.366,0.334,0.445,1
price + gen_com_mean,0.36,0.935,0.499,0.629,0.219,0.306,0.491,1
price + general_count + company_count,0.367,0.275,0.45,1.14,0.403,0.404,0.506,1


In [74]:
res27.sort_values(by=['Best', 'Average'], ascending=[0, 1]).head(6)

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
price,0.218,0.587,0.403,0.597,0.337,0.346,0.415,2
price + return + general_mean,0.286,0.316,0.349,0.774,0.323,0.326,0.396,1
price + ma_10,0.254,0.643,0.475,0.597,0.366,0.334,0.445,1
price + gen_com_mean,0.36,0.935,0.499,0.629,0.219,0.306,0.491,1
price + general_count + company_count,0.367,0.275,0.45,1.14,0.403,0.404,0.506,1
price + ma_30,0.294,0.773,0.551,0.724,0.464,0.263,0.512,1


In [55]:
res54.sort_values(by=['Average', 'Best'], ascending=[1, 0])

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
price + general_sum,0.726,0.891,0.783,1.238,0.394,0.566,0.766,2
price,0.613,1.071,1.113,0.996,0.408,0.474,0.779,1
price + return,0.423,1.23,0.906,1.085,0.64,0.462,0.791,0
price + return + gen_com_mean,0.448,1.288,0.674,1.187,0.771,0.448,0.803,0
price + general_count,0.638,1.08,0.891,1.233,0.446,0.636,0.821,0
price + ma_10,0.373,1.504,1.187,1.017,0.475,0.423,0.83,1
price + return + general_mean,0.468,1.08,0.592,1.267,0.993,0.602,0.834,1
price + return + ma_10,0.505,1.281,0.866,1.28,0.696,0.414,0.84,0
price + gen_com_mean,0.648,1.637,0.754,1.004,0.471,0.58,0.849,0
price + general_mean + company_mean,0.565,1.189,0.858,1.541,0.503,0.796,0.909,0


In [73]:
res54.sort_values(by=['Best', 'Average'], ascending=[0, 1]).head(5)

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
price + general_sum,0.726,0.891,0.783,1.238,0.394,0.566,0.766,2
price,0.613,1.071,1.113,0.996,0.408,0.474,0.779,1
price + ma_10,0.373,1.504,1.187,1.017,0.475,0.423,0.83,1
price + return + general_mean,0.468,1.08,0.592,1.267,0.993,0.602,0.834,1
price + ma_30,0.448,1.577,0.93,1.135,1.214,0.367,0.945,1


In [57]:
res81.sort_values(by=['Average', 'Best'], ascending=[1, 0])

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
price + general_mean,0.806,1.234,0.841,1.25,0.812,0.626,0.928,0
price + general_sum,0.774,1.119,0.719,1.767,0.747,0.453,0.93,0
price + general_count,0.818,1.367,0.745,1.925,0.727,0.486,1.011,1
price + gen_com_mean,0.524,1.647,0.794,1.703,0.941,0.511,1.02,0
price + ma_10,0.65,1.445,1.511,1.22,1.079,0.458,1.06,0
price + company_mean,0.454,1.034,0.679,1.46,2.305,0.598,1.088,2
price + ma_30,0.651,1.569,0.753,1.475,1.629,0.493,1.095,0
price + return + ma_10,0.552,1.391,0.868,1.796,1.479,0.523,1.102,0
price + return + gen_com_mean,0.54,1.541,0.886,1.642,1.599,0.595,1.134,0
price,0.652,1.726,2.177,1.094,0.802,0.478,1.155,1


In [72]:
res81.sort_values(by=['Best', 'Average'], ascending=[0, 1]).head(5)

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
price + company_mean,0.454,1.034,0.679,1.46,2.305,0.598,1.088,2
price + general_count,0.818,1.367,0.745,1.925,0.727,0.486,1.011,1
price,0.652,1.726,2.177,1.094,0.802,0.478,1.155,1
price + return,0.487,1.504,0.795,1.861,1.904,0.441,1.165,1
price + general_mean + company_mean,0.611,1.424,0.546,2.161,1.399,1.097,1.206,1


In [79]:
ticker_means = {}

for ticker in tickers:
    df_con = pd.concat([res27[ticker], res54[ticker], res81[ticker]])
    ticker_means[ticker] = df_con.groupby(df_con.index).mean().T

In [94]:
ticker_means = pd.concat(ticker_means.values(), axis=1)
ticker_means

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC
price,0.494333,1.128,1.231,0.895667,0.515667,0.432667
price + company_count,0.754333,1.029667,1.402667,1.648,1.419667,0.497
price + company_mean,0.399667,1.028,0.807667,1.130333,1.27,0.559
price + company_sum,0.564333,0.817333,1.339667,2.225333,1.494,0.482333
price + gen_com_mean,0.510667,1.406333,0.682333,1.112,0.543667,0.465667
price + general_count,0.619333,0.927667,0.676667,1.303667,0.482,0.501333
price + general_count + company_count,0.500667,1.106,0.983667,1.566667,1.822333,0.502
price + general_mean,0.64,1.286667,0.807333,1.076333,0.591333,0.622
price + general_mean + company_mean,0.504667,1.024,0.681,1.471667,0.781333,0.789667
price + general_sum,0.627,0.781667,0.630667,1.244333,0.465333,0.466


In [85]:
ticker_means['META'].sort_values()

price + return                           0.396667
price + company_mean                     0.399667
price + return + general_mean            0.423000
price + ma_10                            0.425667
price + return + gen_com_mean            0.427667
price + return + ma_10                   0.457333
price + ma_30                            0.464333
price                                    0.494333
price + general_sum + company_sum        0.495000
price + general_count + company_count    0.500667
price + general_mean + company_mean      0.504667
price + gen_com_mean                     0.510667
price + ma_60                            0.562000
price + company_sum                      0.564333
price + general_count                    0.619333
price + general_sum                      0.627000
price + general_mean                     0.640000
price + company_count                    0.754333
Name: META, dtype: float64

In [87]:
ticker_means['AAPL'].sort_values()

price + general_sum                      0.781667
price + company_sum                      0.817333
price + general_count                    0.927667
price + return + general_mean            0.935667
price + general_mean + company_mean      1.024000
price + company_mean                     1.028000
price + company_count                    1.029667
price + return + ma_10                   1.030000
price + return                           1.051333
price + general_sum + company_sum        1.080333
price + return + gen_com_mean            1.083667
price + general_count + company_count    1.106000
price                                    1.128000
price + ma_10                            1.197333
price + general_mean                     1.286667
price + ma_30                            1.306333
price + ma_60                            1.393667
price + gen_com_mean                     1.406333
Name: AAPL, dtype: float64

In [88]:
ticker_means['GOOGL'].sort_values()

price + general_sum                      0.630667
price + return + gen_com_mean            0.647333
price + return + general_mean            0.649000
price + general_count                    0.676667
price + general_mean + company_mean      0.681000
price + gen_com_mean                     0.682333
price + return                           0.695333
price + return + ma_10                   0.729333
price + ma_30                            0.744667
price + general_mean                     0.807333
price + company_mean                     0.807667
price + ma_60                            0.875000
price + general_sum + company_sum        0.903000
price + general_count + company_count    0.983667
price + ma_10                            1.057667
price                                    1.231000
price + company_sum                      1.339667
price + company_count                    1.402667
Name: GOOGL, dtype: float64

In [90]:
ticker_means['NFLX'].sort_values()

price                                    0.895667
price + ma_10                            0.944667
price + general_mean                     1.076333
price + ma_30                            1.111333
price + gen_com_mean                     1.112000
price + company_mean                     1.130333
price + return + gen_com_mean            1.181000
price + return                           1.221667
price + general_sum                      1.244333
price + return + ma_10                   1.259667
price + general_count                    1.303667
price + ma_60                            1.351333
price + return + general_mean            1.389333
price + general_mean + company_mean      1.471667
price + general_count + company_count    1.566667
price + general_sum + company_sum        1.626667
price + company_count                    1.648000
price + company_sum                      2.225333
Name: NFLX, dtype: float64

In [91]:
ticker_means['AMZN'].sort_values()

price + general_sum                      0.465333
price + general_count                    0.482000
price                                    0.515667
price + gen_com_mean                     0.543667
price + general_mean                     0.591333
price + ma_10                            0.640000
price + general_mean + company_mean      0.781333
price + return + ma_10                   0.836333
price + return + gen_com_mean            0.908667
price + return + general_mean            0.914000
price + return                           0.964000
price + ma_30                            1.102333
price + company_mean                     1.270000
price + company_count                    1.419667
price + company_sum                      1.494000
price + general_sum + company_sum        1.570000
price + general_count + company_count    1.822333
price + ma_60                            2.125333
Name: AMZN, dtype: float64

In [92]:
ticker_means['^GSPC'].sort_values()

price + ma_60                            0.371000
price + ma_30                            0.374333
price + return                           0.403333
price + return + ma_10                   0.404000
price + ma_10                            0.405000
price                                    0.432667
price + return + gen_com_mean            0.436333
price + general_sum + company_sum        0.458667
price + gen_com_mean                     0.465667
price + general_sum                      0.466000
price + company_sum                      0.482333
price + company_count                    0.497000
price + general_count                    0.501333
price + general_count + company_count    0.502000
price + return + general_mean            0.552667
price + company_mean                     0.559000
price + general_mean                     0.622000
price + general_mean + company_mean      0.789667
Name: ^GSPC, dtype: float64

In [238]:
res27.to_csv('res27.csv')
res54.to_csv('res54.csv')
res81.to_csv('res81.csv')

In [None]:
# for row in res81.sort_values(by=['Average', 'Best'], ascending=[1, 0]).values.tolist():
#     print(' & '.join(['%.3f' % e for e in row ]))