# Data processing and sentiment analysis
This notebook illustrates the pre-processing steps performed on the gathered data, performs sentiment analysis of the news titles and formulates the feature set taken as inputs by the Informer model

In [201]:
import warnings
warnings.filterwarnings('ignore')

In [202]:
import pandas as pd

## The gathered data overview
Read and describe the basic information about the datasets
The *general_news* dataset contains articles related to the stock market at large
The *ticker_news* dataset contains news relevant to a given ticker

In [203]:
general_news = pd.read_json("../bp-scrapper-data/news.jsonl", lines=True)
prices = pd.read_json("../bp-scrapper-data/price.jsonl", lines=True)
ticker_news = pd.read_json("../bp-scrapper-data/ticker_news.jsonl", lines=True)

In [204]:
general_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10638 entries, 0 to 10637
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       10638 non-null  object        
 1   title     10638 non-null  object        
 2   summary   10638 non-null  object        
 3   date      10638 non-null  datetime64[ns]
 4   category  10638 non-null  object        
 5   tickers   10638 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 498.8+ KB


In [205]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ticker  4240 non-null   object        
 1   price   4240 non-null   float64       
 2   date    4240 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 99.5+ KB


In [206]:
ticker_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5696 entries, 0 to 5695
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       5696 non-null   object        
 1   title     5696 non-null   object        
 2   summary   5696 non-null   object        
 3   date      5696 non-null   datetime64[ns]
 4   category  0 non-null      float64       
 5   tickers   5696 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 267.1+ KB


In [207]:
tickers = prices.ticker.unique()
tickers

array(['META', 'AAPL', 'GOOGL', 'NFLX', 'AMZN', '^GSPC'], dtype=object)

## Initial text preprocessing
Remove *Yahoo!Finance* special terms from the news titles

In [208]:
def remove_terms(row):
    i = row.title.find('-')
    if i != -1 and i < 30 and (row.title[i + 1].isupper() or row.title[i + 1].isnumeric()):
        row.title = row.title.split('-', 1)[1]
    return row

def df_remove_terms(df):
    return df.apply(lambda row: remove_terms(row), axis=1)

Drop duplicates [url, titles]

In [209]:
def df_drop_duplicates(df, columns):
    return df.drop_duplicates(columns)

Drop news unrelated to relevant tickers

In [210]:
def has_relevant_tickers(news_tickers):
    return len(set(news_tickers)) == 0 or len(set(news_tickers).intersection(set(tickers))) != 0

def df_drop_irrelevant_tickers(df):
    return df[df.apply(lambda row: has_relevant_tickers(row.tickers), axis=1)]

Drop titles containing less than 30 symbols

In [211]:
def df_drop_short_titles(df):
    return df[df.apply(lambda row: len(row.title) >= 30, axis=1)]

Remove titles from *general_news* that are already included in *ticker_news* dataset

In [212]:
def df_remove_titles(df):
    return df[~df.title.isin(ticker_news.title.values.tolist())]

### Perform pre-processing steps
Use the preprocessing steps introduced above on *general_news* and *ticker_news* datasets

In [213]:
news_len = len(general_news)

general_news = df_remove_terms(general_news)
general_news = df_drop_duplicates(general_news, ['url'])
general_news = df_drop_duplicates(general_news, ['title'])
general_news = df_drop_short_titles(general_news)
general_news = df_drop_irrelevant_tickers(general_news)
general_news = df_remove_titles(general_news)

print(f'news: {news_len} -> {len(general_news)}')

news: 10638 -> 5197


In [214]:
ticker_news_len = len(ticker_news)

ticker_news = df_remove_terms(ticker_news)
ticker_news = df_drop_duplicates(ticker_news, ['url'])
ticker_news = df_drop_duplicates(ticker_news, ['title'])
ticker_news = df_drop_short_titles(ticker_news)
ticker_news = df_drop_irrelevant_tickers(ticker_news)

print(f'ticker_news: {ticker_news_len} -> {len(ticker_news)}')

ticker_news: 5696 -> 4093


## Sentiment Classification Pipeline
Prepare the text classification pipeline used to assign news articles respective sentiment scores
The pipeline uses **WordPiece** tokenization and **FinBert** text classification model

In [215]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TextClassificationPipeline

In [216]:
model_name = "ProsusAI/finbert"
labels = ['negative', 'neutral', 'positive']

In [217]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [218]:
config = AutoConfig.from_pretrained(model_name, num_labels=len(labels))
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config).to('cuda')

In [219]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0)

## Sentiment scores
The text classification pipeline assigns each title a sentiment score: the probability of it expressing a *negative*, *neutral* or *positive* market sentiment.

In [220]:
def sentiment_score(row, col='title'):
    scores = pipe(row[col], top_k=len(labels))
    score_dict = {score['label']: score['score'] for score in scores}
    row[f'{col}_label'] = scores[0]['label']
    row.negative, row.neutral, row.positive = score_dict['negative'], score_dict['neutral'], score_dict['positive']
    return row

def df_sentiment_score(df, col='title'):
    df[f'{col}_label'] = None
    df['negative'] = df['neutral'] = df['positive'] = 0
    return df.apply(lambda row: sentiment_score(row, col), axis=1)

Assign sentiment scores to *general_news* and *ticker_news* article titles
Save labels for article titles and summary for comparison

In [221]:
general_news = df_sentiment_score(general_news, 'title')
general_news.head()

Unnamed: 0,url,title,summary,date,category,tickers,title_label,negative,neutral,positive
0,https://finance.yahoo.com/news/1-u-n-mulls-sen...,"As U.N. mulls sending troops to Haiti, a gang'...",As Haiti's gang-induced humanitarian crisis de...,2022-10-29 12:43:23,World,[],negative,0.598501,0.33435,0.067149
1,https://finance.yahoo.com/news/britain-denies-...,Britain denies Russian claims that its navy pe...,Britain on Saturday denied Russian claims that...,2022-10-29 12:47:11,World,[],negative,0.86572,0.098223,0.036057
4,https://finance.yahoo.com/news/ukraine-questio...,Ukraine questions Twitter takeover amid precar...,A senior Ukrainian official expressed sceptici...,2022-10-29 14:21:07,World,[],negative,0.946472,0.038674,0.014854
6,https://finance.yahoo.com/news/ukraine-latest-...,Ukraine Latest: Russia Says UK Involved in Nav...,(Bloomberg) -- Most Read from BloombergTesla E...,2022-10-29 14:32:04,World,[],neutral,0.132021,0.829115,0.038864
7,https://finance.yahoo.com/news/strong-dollar-s...,Strong Dollar Seen Hurting US Outlook and Even...,(Bloomberg) -- A strong dollar is likely to we...,2022-10-29 13:15:00,Business,[],negative,0.925865,0.037888,0.036247


In [222]:
ticker_news = df_sentiment_score(ticker_news, 'title')
ticker_news.head()

Unnamed: 0,url,title,summary,date,category,tickers,title_label,negative,neutral,positive
0,https://www.barrons.com/articles/bitcoin-digit...,"Digital Dollars Could Be a Boon for Amazon, Al...","Companies like Amazon.com, Alphabet, and Accen...",2022-10-29 12:27:00,,[AMZN],neutral,0.064423,0.681635,0.253942
2,https://finance.yahoo.com/news/making-sense-bi...,Making Sense of Big Tech Earnings After Amazon...,The earnings bombshells from Amazon and Meta a...,2022-10-28 22:44:10,,[AMZN],neutral,0.016832,0.815436,0.167732
4,https://www.bizjournals.com/denver/news/2022/1...,"ULA, partners start expansions to ahead of Pro...",Colorado aerospace company United Launch Allia...,2022-10-28 20:57:12,,[AMZN],positive,0.010957,0.334704,0.654339
5,https://www.thestreet.com/technology/amazon-lo...,Amazon Briefly Leaves the $1 Trillion Club,The e-commerce giant falls after delivering di...,2022-10-28 20:47:00,,[AMZN],neutral,0.337673,0.627862,0.034464
6,https://www.investors.com/market-trend/stock-m...,"Dow Jones Futures: Market Rally Revs Higher, T...",The stock market rally faced hurdles but revve...,2022-10-29 12:10:31,,[META],neutral,0.066571,0.724576,0.208853


### Sentiment score analysis
Analyse *general_news* and *ticker_news* datasets sentiment classes distribution

In [223]:
from collections import Counter

In [224]:
Counter(general_news.title_label)

Counter({'negative': 2041, 'neutral': 1926, 'positive': 1230})

In [225]:
Counter(ticker_news.title_label)

Counter({'neutral': 1908, 'positive': 681, 'negative': 1504})

## Ticker prices pre-processing
All the operations on the *prices* dataset are performed per ticker
The ticker prices were generally scrapped with 15 min intervals
To ease the processing of the timestamps they are standardized (rounded up to the nearest quarter), redundant timestamps are removed

In [226]:
prices.head()

Unnamed: 0,ticker,price,date
0,META,99.2,2022-10-28 20:00:04
1,AAPL,155.74,2022-10-28 20:00:04
2,GOOGL,96.29,2022-10-28 20:00:04
3,NFLX,295.72,2022-10-28 20:00:04
4,AMZN,103.41,2022-10-28 20:00:04


In [227]:
def round_dt(df, row, delta='15min'):
    dt_floor, dt_ceil = row.date.floor(delta), row.date.ceil(delta)
    row.date = dt_ceil if dt_floor in df.date.tolist() else dt_floor
    return row

def df_round_dt(df, delta='15min'):
    return df.apply(lambda row: round_dt(df, row, delta), axis=1)

Add an hour to timestamps before Sunday, November 6 to account for the US daylight saving time

In [228]:
def account_dst(row):
    if row.date < pd.Timestamp('2022-11-06'):
        row.date += pd.DateOffset(hours=1)
    return row

def df_account_dst(df):
    return df.apply(lambda row: account_dst(row), axis=1)

In [229]:
ticker_dfs = {}

for ticker in tickers:
    ticker_df = prices[prices.ticker == ticker]
    df_len = len(ticker_df)

    ticker_df = df_round_dt(ticker_df)
    ticker_df = df_account_dst(ticker_df)
    ticker_df = df_drop_duplicates(ticker_df, 'date')

    ticker_dfs[ticker] = ticker_df
    print(f'{ticker}_prices:\t{df_len} -> {len(ticker_df)}')

META_prices:	693 -> 677
AAPL_prices:	697 -> 682
GOOGL_prices:	705 -> 684
NFLX_prices:	695 -> 681
AMZN_prices:	700 -> 686
^GSPC_prices:	750 -> 724


## Technical indicators
Add intraday *returns* and *moving overage* indicators with periods in [10, 30, 60]

In [230]:
def df_returns(df):
    df['return'] = df.price - df.price.shift(1)
    return df

In [231]:
def df_ma(df, period):
    df[f'ma_{period}'] = df.price.rolling(period).mean()
    return df

In [232]:
for ticker, ticker_df in ticker_dfs.items():
    ticker_df = df_returns(ticker_df)
    ticker_df = df_ma(ticker_df, 10)
    ticker_df = df_ma(ticker_df, 30)
    ticker_df = df_ma(ticker_df, 60)

In [233]:
ticker_dfs['GOOGL'].tail()

Unnamed: 0,ticker,price,date,return,ma_10,ma_30,ma_60
4213,GOOGL,95.09,2022-12-07 17:15:00,-0.275,95.48912,96.504787,97.955903
4218,GOOGL,95.085,2022-12-07 17:30:00,-0.005,95.41362,96.44062,97.85882
4227,GOOGL,95.45,2022-12-07 17:45:00,0.365,95.41862,96.38212,97.763153
4233,GOOGL,95.562,2022-12-07 18:00:00,0.112,95.45682,96.32652,97.67552
4239,GOOGL,95.82,2022-12-07 18:15:00,0.258,95.44488,96.268853,97.60252


Analyse the percentage of negative returns for each ticker

In [234]:
for ticker, ticker_df in ticker_dfs.items():
    print(f'Negative returns share {ticker}:\t{round(len(ticker_df[ticker_df["return"] < 0]) / len(ticker_df) * 100)}%')

Negative returns share META:	49%
Negative returns share AAPL:	48%
Negative returns share GOOGL:	49%
Negative returns share NFLX:	49%
Negative returns share AMZN:	54%
Negative returns share ^GSPC:	47%


## Fundamental indicators: sentiment values
Add sentiment value for each data point in *prices* dataset. Sentiment value is calculated as a sum of sentiment scores of individual news articles with date of issue in (previous timestamp, current timestamp). General news are considered to have the same impact on all tickers, while ticker specific news are computed individually.

In [235]:
def compute_sent_value(df, dt1, dt2, prefix='general', mode='sum'):
    if mode == 'sum':
        return df[(df.date >= dt1) & (df.date <= dt2)][labels].sum().fillna(0).rename(lambda c: f'{prefix}_{c}_{mode}').to_frame().T
    else:
        return df[(df.date >= dt1) & (df.date <= dt2)][labels].mean().fillna(0).rename(lambda c: f'{prefix}_{c}_{mode}').to_frame().T

def df_compute_sent_value(df, df_sentiment, prefix='general', mode='sum'):
    sentiment_values = pd.concat([compute_sent_value(df_sentiment, dt1, dt2, prefix, mode) for dt1, dt2 in zip(list(df.date), list(df.shift(-1).date))])
    return pd.concat([df.reset_index(drop=True), sentiment_values.reset_index(drop=True)], axis=1)

Add sentiment value for each data point in *prices* dataset. Sentiment value is calculated as a count of individual negative, neutral and positive news articles with date of issue in (previous timestamp, current timestamp). General news are considered to have the same impact on all tickers, while ticker specific news are computed individually.

In [236]:
def compute_sent_value_count(df, dt1, dt2, prefix='general'):
    return df[(df.date >= dt1) & (df.date <= dt2)][[f'{l}_count' for l in labels]].sum().fillna(0).rename(lambda c: f'{prefix}_{c}').to_frame().T

def df_compute_sent_value_count(df, df_sentiment, prefix='general'):
    df_sentiment_new = pd.concat([df_sentiment, pd.get_dummies(df_sentiment.title_label).rename(lambda c: f'{c}_count', axis=1)], axis=1)
    sentiment_values = pd.concat([compute_sent_value_count(df_sentiment_new, dt1, dt2, prefix) for dt1, dt2 in zip(list(df.date), list(df.shift(-1).date))])
    return pd.concat([df.reset_index(drop=True), sentiment_values.reset_index(drop=True)], axis=1)

Add sentiment value for each data point in prices dataset. Sentiment value is calculated as a mean of general news mean sentiment and company specific mean sentiment

In [237]:
def compute_gen_com_sent_mean(df):
    df['gen_com_negative_mean'] = df[['general_negative_mean', 'company_negative_mean']].mean(axis=1)
    df['gen_com_neutral_mean'] = df[['general_neutral_mean', 'company_neutral_mean']].mean(axis=1)
    df['gen_com_positive_mean'] = df[['general_positive_mean', 'company_positive_mean']].mean(axis=1)
    return df

Add fundamental indicators

In [238]:
for ticker, ticker_df in ticker_dfs.items():
    ticker_df = df_compute_sent_value(ticker_df, general_news, 'general', 'sum')
    ticker_df = df_compute_sent_value(ticker_df, general_news, 'general', 'mean')

    ticker_df = df_compute_sent_value(ticker_df, ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)], 'company', 'sum')
    ticker_df = df_compute_sent_value(ticker_df, ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)], 'company', 'mean')

    ticker_df = df_compute_sent_value_count(ticker_df, general_news, 'general')

    ticker_df = df_compute_sent_value_count(ticker_df, ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)], 'company')

    ticker_df = compute_gen_com_sent_mean(ticker_df)

    ticker_dfs[ticker] = ticker_df

Potential feature set

In [239]:
ticker_dfs['AAPL'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ticker                  682 non-null    object        
 1   price                   682 non-null    float64       
 2   date                    682 non-null    datetime64[ns]
 3   return                  681 non-null    float64       
 4   ma_10                   673 non-null    float64       
 5   ma_30                   653 non-null    float64       
 6   ma_60                   623 non-null    float64       
 7   general_negative_sum    682 non-null    float64       
 8   general_neutral_sum     682 non-null    float64       
 9   general_positive_sum    682 non-null    float64       
 10  general_negative_mean   682 non-null    float64       
 11  general_neutral_mean    682 non-null    float64       
 12  general_positive_mean   682 non-null    float64   

# Hyperparameter tuning
A subset of features computed above will be used for Infromer's hyperparameter tuning
AAPL dataset with features [date, price, ma_30, general_negative_sum, general_neutral_sum, general_positive_sum, company_negative_sum, company_neutral_sum, company_positive_sum] is selected

In [240]:
features = ['date', 'price', 'ma_30', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum']
feature_num = len(features) - 1 # date is not considered a feature by the Informer
target = 'price'

In [166]:
aapl = ticker_dfs['AAPL'][features].dropna()
aapl = aapl.set_index('date')
aapl

Unnamed: 0_level_0,price,ma_30,general_negative_sum,general_neutral_sum,general_positive_sum,company_negative_sum,company_neutral_sum,company_positive_sum
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-11-01 14:45:00,152.9900,153.462473,0.900609,1.035208,0.064183,0.000000,0.000000,0.000000
2022-11-01 15:00:00,150.7400,153.295807,0.022777,0.852089,0.125134,0.000000,0.000000,0.000000
2022-11-01 15:15:00,150.7450,153.209973,2.162112,1.192913,0.644975,0.964930,0.021709,0.013361
2022-11-01 15:30:00,149.7200,153.099307,0.244573,0.722935,0.032492,0.000000,0.000000,0.000000
2022-11-01 15:45:00,149.9850,153.010973,0.470736,0.736079,0.793184,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
2022-12-07 17:15:00,140.9409,142.513247,1.560727,0.490618,0.948655,0.020859,0.884477,0.094664
2022-12-07 17:30:00,140.8700,142.388413,0.062620,0.121923,0.815456,0.929925,0.058949,0.011125
2022-12-07 17:45:00,141.0299,142.266077,1.133490,1.698595,0.167915,1.834223,0.817429,0.348348
2022-12-07 18:00:00,140.8850,142.141910,0.825675,1.801118,1.373207,0.040683,2.090289,0.869028


Saving dataset

In [167]:
aapl.to_csv("./data/stock/aapl.csv")

Setting Informer's default parameters

In [241]:
default_args = {
    'model': 'informer',

    'data': 'aapl',
    'root_path': './data/stock',
    'data_path': 'aapl.csv',
    'features': 'MS',
    'ftr_num': feature_num,
    'd_out': 1,
    'target': target,
    'freq': '15t',

    'seq_len': 48,
    'pred_len': 24,

    'itr': 20,
    'train_epochs': 6,
    'batch_size': 48,
    'patience': 3,
    'learning_rate': 0.0001,
    'loss': 'mse',
    'lradj': 'type1',
    'inverse': False,

    'd_model': 512,
    'n_heads': 4,
    'e_layers': 4,
    'd_ff': 2048,

    'embed': 't2v',
    'activation': 'gelu',
    'padding': 0,
    'dropout': 0.05,

    'output_attention': False,
    'predict': False,

    'num_workers': 0,
    'use_gpu': True,
    'gpu': 0,
    'use_multi_gpu': False,
    'devices': '0'
}

Hyperparameter tuning is performed using a grid search


In [9]:
parameters = {
    'batch_size': [24, 48, 56],
    'learning_rate': [1e-4, 5e-4, 1e-5],
    'd_model': [256, 512],
    'n_heads': [4, 6, 12],
    'e_layers': [4, 6, 12],
    'activation': ['gelu', 'relu']
}

In [10]:
import itertools

In [12]:
combinations = list(itertools.product(*parameters.values()))
total = len(combinations)
print(f'Param combinations: {total}')

Param combinations: 324


Performing grid search

In [13]:
from run_informer import run_informer

In [15]:
results = {}
counter = 0

for batch_size, learning_rate, d_model, n_heads, e_layers, activation in combinations:
    args = default_args.copy()
    args['batch_size'] = batch_size
    args['learning_rate'] = learning_rate
    args['d_model'] = d_model
    args['n_heads'] = n_heads
    args['e_layers'] = e_layers
    args['d_ff'] = 4 * d_model
    args['activation'] = activation

    loss, _, _ = run_informer(args, supress_output=True)
    results[(batch_size, learning_rate, d_model, n_heads, e_layers, activation)] = loss

    counter += 1
    print(f'{counter} out of {total} - mse: {loss[0]}')

1 out of 324 - mse: 0.19969245791435242
2 out of 324 - mse: 0.1973266452550888
3 out of 324 - mse: 0.2071698158979416
4 out of 324 - mse: 0.13370837271213531
5 out of 324 - mse: 0.15923327207565308
6 out of 324 - mse: 0.17413237690925598
7 out of 324 - mse: 0.1393631398677826
8 out of 324 - mse: 0.22362521290779114
9 out of 324 - mse: 0.16694486141204834
10 out of 324 - mse: 0.15823392570018768
11 out of 324 - mse: 0.12205640226602554
12 out of 324 - mse: 0.14133282005786896
13 out of 324 - mse: 0.1930864006280899
14 out of 324 - mse: 0.1583566516637802
15 out of 324 - mse: 0.17440353333950043
16 out of 324 - mse: 0.1589001715183258
17 out of 324 - mse: 0.17963466048240662
18 out of 324 - mse: 0.11621011793613434
19 out of 324 - mse: 0.1888125091791153
20 out of 324 - mse: 0.21865573525428772
21 out of 324 - mse: 0.13286446034908295
22 out of 324 - mse: 0.1317530870437622
23 out of 324 - mse: 0.23739811778068542
24 out of 324 - mse: 0.11906949430704117
25 out of 324 - mse: 0.2107106149

Saving best performing arguments
Printing top 10

In [49]:
results = sorted(results.items(), key=lambda v: v[1][0])
best_args = results[0][0]

print('{: <10} {: <10} {: <7} {: <6} {: <8} {: <10} | {: <10}'.format(*parameters.keys(), 'mse'))
print('-' * 69)
for k, v in results[:10]:
    print('{: <10} {: <13} {: <7} {: <7} {: <8} {: <10} | {: <10.4f}'.format(*k, v[0]))

batch_size learning_rate d_model n_heads e_layers activation | mse       
---------------------------------------------------------------------
48         0.0001        512     4       4        gelu       | 0.0781    
48         0.0001        256     6       12       relu       | 0.0871    
48         1e-05         512     12      6        relu       | 0.0875    
48         1e-05         256     4       12       gelu       | 0.0903    
48         0.0001        512     6       4        gelu       | 0.0923    
24         1e-05         256     12      6        relu       | 0.0941    
48         1e-05         512     6       6        relu       | 0.0952    
48         1e-05         512     4       6        gelu       | 0.0964    
48         0.0001        256     4       12       gelu       | 0.0984    
24         1e-05         512     12      6        gelu       | 0.0998    


In [116]:
args = default_args
(args['batch_size'], args['learning_rate'], args['d_model'], args['n_heads'], args['e_layers'], args['activation']) = best_args
args['d_ff'] = 4 * args['d_model']
args

{'model': 'informer',
 'data': 'aapl',
 'root_path': './data/stock',
 'data_path': 'aapl.csv',
 'features': 'MS',
 'ftr_num': 8,
 'd_out': 1,
 'target': 'price',
 'freq': '15t',
 'seq_len': 48,
 'pred_len': 24,
 'itr': 20,
 'train_epochs': 6,
 'batch_size': 48,
 'patience': 3,
 'learning_rate': 0.0001,
 'loss': 'mse',
 'lradj': 'type1',
 'inverse': False,
 'd_model': 512,
 'n_heads': 4,
 'e_layers': 4,
 'd_ff': 2048,
 'embed': 't2v',
 'activation': 'gelu',
 'padding': 0,
 'dropout': 0.05,
 'output_attention': False,
 'predict': False,
 'num_workers': 0,
 'use_gpu': True,
 'gpu': 0,
 'use_multi_gpu': False,
 'devices': '0'}

## Feature set analysis
Analysing which composition of features performs best on a tuned model
Checking if media sentiment analysis is beneficial

In [242]:
ticker_dfs['AAPL'].columns.tolist()

['ticker',
 'price',
 'date',
 'return',
 'ma_10',
 'ma_30',
 'ma_60',
 'general_negative_sum',
 'general_neutral_sum',
 'general_positive_sum',
 'general_negative_mean',
 'general_neutral_mean',
 'general_positive_mean',
 'company_negative_sum',
 'company_neutral_sum',
 'company_positive_sum',
 'company_negative_mean',
 'company_neutral_mean',
 'company_positive_mean',
 'general_negative_count',
 'general_neutral_count',
 'general_positive_count',
 'company_negative_count',
 'company_neutral_count',
 'company_positive_count',
 'gen_com_negative_mean',
 'gen_com_neutral_mean',
 'gen_com_positive_mean']

In [252]:
feature_sets = [
    ['date', 'price'],
    ['date', 'price', 'return'],
    ['date', 'price', 'ma_10'],
    ['date', 'price', 'ma_30'],
    ['date', 'price', 'ma_60'],
    ['date', 'price', 'return', 'ma_10'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
    ['date', 'price', 'general_negative_count', 'general_neutral_count', 'general_positive_count'],
    ['date', 'price', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum'],
    ['date', 'price', 'company_negative_mean', 'company_neutral_mean', 'company_positive_mean'],
    ['date', 'price', 'company_negative_count', 'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'gen_com_negative_mean', 'gen_com_neutral_mean', 'gen_com_positive_mean'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean', 'company_negative_mean', 'company_neutral_mean', 'company_positive_mean'],
    ['date', 'price', 'general_negative_count', 'general_neutral_count', 'general_positive_count', 'company_negative_count', 'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'return', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
    ['date', 'price', 'return', 'gen_com_negative_mean', 'gen_com_neutral_mean', 'gen_com_positive_mean'],
]

total = len(feature_sets)

In [253]:
results = {}
counter = 0

for features in feature_sets:
    aapl = ticker_dfs['AAPL'][features].dropna()
    aapl = aapl.set_index('date')
    aapl.to_csv("./data/stock/aapl.csv")

    args['ftr_num'] = len(features) - 1
    loss = run_informer(args, supress_output=True)[0]
    results[str(features)] = loss

    counter += 1
    print(f'{counter} out of {total} - mse: {loss[0]}')

1 out of 18 - mse: 0.12239911407232285
2 out of 18 - mse: 0.06711364537477493
3 out of 18 - mse: 0.12721404433250427
4 out of 18 - mse: 0.12403389066457748
5 out of 18 - mse: 0.27386605739593506
6 out of 18 - mse: 0.07086274772882462
7 out of 18 - mse: 0.10739133507013321
8 out of 18 - mse: 0.07534480839967728
9 out of 18 - mse: 0.10063232481479645
10 out of 18 - mse: 0.1259147971868515
11 out of 18 - mse: 0.2083088606595993
12 out of 18 - mse: 0.16404522955417633
13 out of 18 - mse: 0.07537199556827545
14 out of 18 - mse: 0.10281088203191757
15 out of 18 - mse: 0.20570918917655945
16 out of 18 - mse: 0.10542090982198715
17 out of 18 - mse: 0.06383358687162399
18 out of 18 - mse: 0.06020493060350418


In [322]:
for k, v in sorted(results.items(), key=lambda v: v[1][0]):
    print('{:.4f} {}'.format(v[0], k))

0.0602 ['date', 'price', 'return', 'gen_com_negative_mean', 'gen_com_neutral_mean', 'gen_com_positive_mean']
0.0638 ['date', 'price', 'return', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean']
0.0671 ['date', 'price', 'return']
0.0709 ['date', 'price', 'return', 'ma_10']
0.0753 ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean']
0.0754 ['date', 'price', 'gen_com_negative_mean', 'gen_com_neutral_mean', 'gen_com_positive_mean']
0.1006 ['date', 'price', 'general_negative_count', 'general_neutral_count', 'general_positive_count']
0.1028 ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum']
0.1054 ['date', 'price', 'general_negative_count', 'general_neutral_count', 'general_positive_count', 'company_negative_count', 'company_neutral_count', 'company_positive_count']
0.1074 ['date', 'price', 'general_negative_sum', 'general