# Data processing and sentiment analysis
This notebook illustrates the pre-processing steps performed on the gathered data, performs sentiment analysis of the news titles and formulates the feature set taken as inputs by the Informer model

In [99]:
import warnings

warnings.filterwarnings('ignore')

In [100]:
import pandas as pd

## The gathered data overview
Read and describe the basic information about the datasets
The *general_news* dataset contains articles related to the stock market at large
The *ticker_news* dataset contains news relevant to a given ticker

In [101]:
# Load the three scraped datasets. lines=True makes pandas parse each line of
# the .jsonl files as a separate JSON record.
general_news = pd.read_json("../bp-scrapper-data/news.jsonl", lines=True)
prices = pd.read_json("../bp-scrapper-data/price.jsonl", lines=True)
ticker_news = pd.read_json("../bp-scrapper-data/ticker_news.jsonl", lines=True)

In [102]:
general_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13418 entries, 0 to 13417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       13418 non-null  object        
 1   title     13418 non-null  object        
 2   summary   13418 non-null  object        
 3   date      13418 non-null  datetime64[ns]
 4   category  13418 non-null  object        
 5   tickers   13418 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 629.1+ KB


In [103]:
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5445 entries, 0 to 5444
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ticker  5445 non-null   object        
 1   price   5445 non-null   float64       
 2   date    5445 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 127.7+ KB


In [104]:
ticker_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   url       7115 non-null   object        
 1   title     7115 non-null   object        
 2   summary   7115 non-null   object        
 3   date      7115 non-null   datetime64[ns]
 4   category  0 non-null      float64       
 5   tickers   7115 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 333.6+ KB


In [105]:
# Distinct ticker symbols found in the price data; the per-ticker loops in the
# rest of the notebook iterate over this array.
tickers = prices.ticker.unique()
tickers

array(['META', 'AAPL', 'GOOGL', 'NFLX', 'AMZN', '^GSPC'], dtype=object)

## Initial text preprocessing
Remove *Yahoo!Finance* special terms from the news titles

In [106]:
def remove_terms(row):
    """Strip a short leading term that ends in a dash from ``row.title``.

    The title is cut after its first ``-`` when that dash occurs within the
    first 30 characters and is immediately followed by an uppercase letter or
    a digit. Titles that do not match are left untouched.

    Args:
        row: a pandas Series (one DataFrame row) exposing a string ``title``.

    Returns:
        The same ``row`` object, with ``title`` possibly shortened.
    """
    title = row.title
    i = title.find('-')
    # Require a character after the dash: a title ending in '-' used to raise
    # IndexError when the following character was inspected.
    if i != -1 and i < 30 and i + 1 < len(title):
        following = title[i + 1]
        if following.isupper() or following.isnumeric():
            row.title = title.split('-', 1)[1]
    return row


def df_remove_terms(df):
    """Run ``remove_terms`` over every row of ``df`` and return the result."""
    cleaned = df.apply(remove_terms, axis=1)
    return cleaned

Drop duplicates based on provided columns

In [107]:
def df_drop_duplicates(df, columns):
    """Return ``df`` without rows that repeat the values of ``columns``.

    ``columns`` is a column label or a list of labels; it is handed to
    ``DataFrame.drop_duplicates`` as the subset to compare.
    """
    deduplicated = df.drop_duplicates(subset=columns)
    return deduplicated

Drop titles containing fewer than 30 characters

In [108]:
def df_drop_short_titles(df, min_length=30):
    """Keep only the rows whose ``title`` has at least ``min_length`` characters.

    Args:
        df: DataFrame with a string ``title`` column.
        min_length: minimum accepted title length; defaults to 30, the value
            that was previously hard-coded.

    Returns:
        The filtered DataFrame (original index labels are preserved).
    """
    # Vectorised length check instead of a Python-level apply over every row.
    long_enough = df['title'].str.len() >= min_length
    return df[long_enough]

Remove titles from *general_news* that are already included in *ticker_news* dataset

In [109]:
def df_remove_titles(df, df_other):
    """Return the rows of ``df`` whose ``title`` does not occur in ``df_other.title``."""
    known_titles = df_other.title.values.tolist()
    already_present = df.title.isin(known_titles)
    return df[~already_present]

### Perform pre-processing steps
Use the preprocessing steps introduced above on *general_news* and *ticker_news* datasets

In [110]:
# Cleaned ticker-specific news, keyed by ticker symbol.
ticker_news_dfs = {}

for ticker in tickers:
    # Keep the articles whose `tickers` entry contains the current symbol.
    df = ticker_news[ticker_news.apply(lambda row: ticker in list(row.tickers), axis=1)]
    df_len = len(df)  # row count before cleaning, used in the summary line only

    # Same cleaning sequence as for the general news: strip leading terms,
    # de-duplicate by url and then by title, drop short titles.
    df = df_remove_terms(df)
    df = df_drop_duplicates(df, ['url'])
    df = df_drop_duplicates(df, ['title'])
    df = df_drop_short_titles(df)

    ticker_news_dfs[ticker] = df
    print(f'{ticker:>5} news: {df_len:4} -> {len(df):4}')

 META news:  601 ->  511
 AAPL news: 1057 ->  876
GOOGL news:  664 ->  592
 NFLX news:  348 ->  304
 AMZN news: 1053 ->  945
^GSPC news: 3392 -> 2471


In [111]:
# Row count before cleaning, kept only for the summary printed below.
general_news_len = len(general_news)

# Same cleaning sequence as for the ticker news: strip leading terms,
# de-duplicate by url and then by title, drop short titles.
general_news = df_remove_terms(general_news)
general_news = df_drop_duplicates(general_news, ['url'])
general_news = df_drop_duplicates(general_news, ['title'])
general_news = df_drop_short_titles(general_news)

print(f'General news: {general_news_len:>5} -> {len(general_news):>5}')

General news: 13418 -> 10812


## Sentiment Classification Pipeline
Prepare the text classification pipeline used to assign news articles respective sentiment scores
The pipeline uses **WordPiece** tokenization and **FinBERT** text classification model

In [182]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, TextClassificationPipeline

In [183]:
# Identifier passed to the from_pretrained() loaders below.
model_name = "ProsusAI/finbert"
# Sentiment class names; also used as column names for the per-class scores.
labels = ['negative', 'neutral', 'positive']

In [184]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [185]:
# Load the classifier configured for len(labels) output classes.
config = AutoConfig.from_pretrained(model_name, num_labels=len(labels))
# NOTE(review): the device is hard-coded to 'cuda', so this cell fails on a machine without a GPU.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config).to('cuda')

In [186]:
# Text -> class scores pipeline; device=0 selects the first GPU, matching .to('cuda') above.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0)

## Sentiment scores
The text classification pipeline assigns each title a sentiment score: the probability of it expressing a *negative*, *neutral* or *positive* market sentiment.

In [187]:
def sentiment_score(row, col='title'):
    """Classify ``row[col]`` and store the sentiment scores on the row.

    Writes four entries on ``row``: ``{col}_label`` (label of the first score
    returned by the pipeline) and ``negative`` / ``neutral`` / ``positive``
    (the score reported for each class).

    Args:
        row: pandas Series holding the text under ``col``.
        col: name of the entry to classify.

    Returns:
        The same ``row`` object with the entries above set.
    """
    scores = pipe(row[col], top_k=len(labels))
    score_dict = {score['label']: score['score'] for score in scores}
    # Assumes the pipeline lists the highest-scoring class first when top_k is
    # given -- TODO confirm against the transformers version in use.
    row[f'{col}_label'] = scores[0]['label']
    # Item assignment creates the entries when they are missing; attribute
    # assignment only works if the keys already exist in the Series index.
    for label in ('negative', 'neutral', 'positive'):
        row[label] = score_dict[label]
    return row


def df_sentiment_score(df, col='title'):
    """Return a copy of ``df`` with sentiment columns computed from ``df[col]``.

    Adds ``{col}_label`` plus ``negative``, ``neutral`` and ``positive`` and
    fills them row by row with ``sentiment_score``.

    Args:
        df: DataFrame containing the text column ``col``.
        col: name of the text column to classify.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    # Work on a copy: callers pass filtered slices, and adding columns to those
    # in place mutates the caller's data and triggers SettingWithCopyWarning.
    scored = df.copy()
    # Pre-create the output columns so every row handed to sentiment_score
    # already carries the keys it fills in.
    scored[f'{col}_label'] = None
    scored['negative'] = scored['neutral'] = scored['positive'] = 0
    return scored.apply(lambda row: sentiment_score(row, col), axis=1)

Assign sentiment scores to *general_news* and *ticker_news* article titles
Save labels for article titles and summary for comparison

In [188]:
general_news = df_sentiment_score(general_news, 'title')

In [189]:
# Per-ticker view of the general news, excluding titles that already appear in
# that ticker's own news set.
general_news_dfs = {}

for ticker in tickers:
    df = general_news
    df_len = len(general_news)  # size before filtering, for the summary line

    df = df_remove_titles(df, ticker_news_dfs[ticker])

    general_news_dfs[ticker] = df
    print(f'{ticker:>5} general news: {df_len:>5} -> {len(df):>5}')

 META general news: 10812 -> 10741
 AAPL general news: 10812 -> 10642
GOOGL general news: 10812 -> 10743
 NFLX general news: 10812 -> 10780
 AMZN general news: 10812 -> 10717
^GSPC general news: 10812 -> 10201


In [190]:
general_news_dfs['AAPL'].head()

Unnamed: 0,url,title,summary,date,category,tickers,title_label,negative,neutral,positive
0,https://finance.yahoo.com/news/1-u-n-mulls-sen...,"As U.N. mulls sending troops to Haiti, a gang'...",As Haiti's gang-induced humanitarian crisis de...,2022-10-29 12:43:23,World,[],negative,0.598501,0.33435,0.067149
1,https://finance.yahoo.com/news/britain-denies-...,Britain denies Russian claims that its navy pe...,Britain on Saturday denied Russian claims that...,2022-10-29 12:47:11,World,[],negative,0.86572,0.098223,0.036057
2,https://finance.yahoo.com/news/1-russia-says-u...,"Russia says UK navy blew up Nord Stream, Londo...",Russia's defence ministry said on Saturday tha...,2022-10-29 11:05:22,World,[NG=F],neutral,0.384665,0.56097,0.054365
3,https://finance.yahoo.com/news/twitter-frees-b...,Twitter Frees Up Billions for Arbitrage Trader...,(Bloomberg) -- Billions of dollars in arbitrag...,2022-10-29 12:15:00,Business,"[TWTR, VMW, ATVI, MSFT]",neutral,0.012227,0.684478,0.303295
4,https://finance.yahoo.com/news/ukraine-questio...,Ukraine questions Twitter takeover amid precar...,A senior Ukrainian official expressed sceptici...,2022-10-29 14:21:07,World,[],negative,0.946472,0.038674,0.014854


In [191]:
# Score the titles of every ticker-specific news set, replacing each stored
# frame with its scored version.
for ticker in tickers:
    ticker_news_dfs[ticker] = df_sentiment_score(ticker_news_dfs[ticker], 'title')

ticker_news_dfs['AAPL'].head()

Unnamed: 0,url,title,summary,date,category,tickers,title_label,negative,neutral,positive
12,https://finance.yahoo.com/news/insiders-apple-...,Insiders at Apple Inc. (NASDAQ:AAPL) sold US$1...,While it’s been a great week for Apple Inc. ( ...,2022-10-29 12:00:54,,[AAPL],negative,0.659887,0.264876,0.075237
14,https://www.fool.com/investing/2022/10/29/what...,What to Watch as the Big Tech Companies Report...,Motley Fool engineering manager Tim White join...,2022-10-29 11:00:00,,[AAPL],neutral,0.054464,0.923324,0.022212
15,https://www.fool.com/investing/2022/10/29/appl...,What Does Apple's Controversial NFT Policy Mea...,The good news is that Apple (NASDAQ: AAPL) is ...,2022-10-29 09:42:00,,[AAPL],neutral,0.34417,0.628426,0.027404
16,https://finance.yahoo.com/news/making-sense-bi...,Making Sense of Big Tech Earnings After Amazon...,The earnings bombshells from Amazon and Meta a...,2022-10-28 22:44:10,,[AAPL],neutral,0.016832,0.815436,0.167732
18,https://www.fool.com/investing/2022/10/28/why-...,Here's Why Apple Stock Soared Today,Defensive-minded investors have come to apprec...,2022-10-28 22:00:56,,[AAPL],neutral,0.021003,0.650387,0.32861


### Sentiment score analysis
Analyse *general_news* and *ticker_news* datasets sentiment classes distribution

In [122]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [362]:
# Slice colours, applied to the pie traces by position when the figure is built.
colors_list = [
    "rgb(243, 222, 156)",
    "rgb(204, 151, 142)",
    "rgb(0, 119, 194)",
]

# Because colours are positional, every pie must list its labels in the same
# order; otherwise the same sentiment gets a different colour in each pie.
gen_news_data_labels = ['neutral', 'negative', 'positive']

gen_news_data = general_news.title_label.value_counts()
# .get(..., 0) keeps the cell working when a label never occurs.
gen_news_data_values = [gen_news_data.get(x, 0) for x in gen_news_data_labels]

pies = {"General news": go.Pie(labels=gen_news_data_labels,
                               values=gen_news_data_values,
                               title='General news',
                               name='General news')}

for ticker in tickers:
    # value_counts() orders by frequency, which differs between tickers;
    # reindex to the shared label order (absent labels count as 0).
    data = (ticker_news_dfs[ticker].title_label.value_counts()
            .reindex(gen_news_data_labels, fill_value=0))
    pies[f'{ticker} news'] = go.Pie(labels=data.index.tolist(),
                                    values=data.values.tolist(),
                                    title=ticker,
                                    name=ticker,
                                    scalegroup='one')

In [363]:
# 3x3 grid of 'domain' (pie) cells: the first row is one cell spanning all
# three columns, the remaining two rows hold one pie per ticker.
fig_pies = make_subplots(rows=3, cols=3,
                         specs=[[{"colspan": 3, 'type': 'domain'}, None, None],
                                [{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}],
                                [{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]])

fig_pies.add_trace(pies['General news'], row=1, col=1)

# Fill rows 2-3 left to right, three pies per row.
pie_index = 0
for ticker in tickers:
    fig_pies.add_trace(pies[f'{ticker} news'], row=2 + pie_index // 3, col=1 + pie_index % 3)
    pie_index += 1

# Show absolute count and percentage outside each slice; apply the shared colours.
fig_pies.update_traces(texttemplate="%{value} <br>%{percent}",
                       textposition='outside',
                       marker_colors=colors_list)
fig_pies.update_layout(showlegend=True,
                       title_text="News sentiment labels",
                       width=1200, height=1000,
                       font_family="Verdana",
                       font_color="black",
                       font_size=15,
                       legend_title_font_color="black",
                       paper_bgcolor='rgba(0,0,0,0)',
                       plot_bgcolor='rgba(0,0,0,0)')
fig_pies.show()

In [365]:
import os

In [366]:
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate exists() test.
os.makedirs("images", exist_ok=True)

fig_pies.write_image("images/pies.png")

## Ticker prices pre-processing
All the operations on the *prices* dataset are performed per ticker
The ticker prices were generally scraped at 15-minute intervals
To ease processing, the timestamps are standardized to 15-minute marks (rounded down to the nearest quarter hour, or rounded up when that exact mark already occurs as a timestamp in the data), and redundant timestamps are removed

In [127]:
prices.head()

Unnamed: 0,ticker,price,date
0,META,99.2,2022-10-28 20:00:04
1,AAPL,155.74,2022-10-28 20:00:04
2,GOOGL,96.29,2022-10-28 20:00:04
3,NFLX,295.72,2022-10-28 20:00:04
4,AMZN,103.41,2022-10-28 20:00:04


In [128]:
def round_dt(df, row, delta='15min'):
    """Snap ``row.date`` to a multiple of ``delta``.

    The timestamp is rounded down, unless the rounded-down value already
    occurs among the (unrounded) dates of ``df``; in that case it is rounded
    up instead.

    Args:
        df: DataFrame whose ``date`` column is searched for the floor value.
        row: pandas Series with a ``date`` timestamp; modified in place.
        delta: rounding frequency string understood by ``Timestamp.floor``.

    Returns:
        The same ``row`` object with ``date`` replaced.
    """
    lower = row.date.floor(delta)
    if lower in df.date.tolist():
        row.date = row.date.ceil(delta)
    else:
        row.date = lower
    return row


def df_round_dt(df, delta='15min'):
    """Apply ``round_dt`` to every row of ``df`` and return the resulting frame."""

    def _round_row(row):
        # Each row is checked against the dates of the frame being processed.
        return round_dt(df, row, delta)

    return df.apply(_round_row, axis=1)

Add an hour to timestamps before Sunday, November 6 to account for the US daylight saving time

In [129]:
def account_dst(row):
    """Shift ``row.date`` forward by one hour when it is before 2022-11-06.

    Rows dated on or after that cutoff are returned unchanged. ``row`` is
    modified in place and also returned.
    """
    cutoff = pd.Timestamp('2022-11-06')
    one_hour = pd.DateOffset(hours=1)
    if row.date < cutoff:
        row.date = row.date + one_hour
    return row


def df_account_dst(df):
    """Apply ``account_dst`` to every row of ``df`` and return the resulting frame."""
    shifted = df.apply(account_dst, axis=1)
    return shifted

In [130]:
# Cleaned price series, keyed by ticker symbol.
ticker_dfs = {}

for ticker in tickers:
    ticker_df = prices[prices.ticker == ticker]
    df_len = len(ticker_df)  # row count before cleaning, for the summary line

    # Snap timestamps to 15-minute marks, apply the one-hour shift for early
    # dates, then keep a single row per resulting timestamp.
    ticker_df = df_round_dt(ticker_df)
    ticker_df = df_account_dst(ticker_df)
    ticker_df = df_drop_duplicates(ticker_df, 'date')

    ticker_dfs[ticker] = ticker_df
    print(f'{ticker:>5} prices: {df_len:3} -> {len(ticker_df):3}')

 META prices: 889 -> 873
 AAPL prices: 897 -> 878
GOOGL prices: 903 -> 881
 NFLX prices: 892 -> 876
 AMZN prices: 900 -> 884
^GSPC prices: 964 -> 937


Plotting the stock prices

In [322]:
# One scatter trace per ticker, keyed by ticker symbol.
# NOTE(review): this rebinds `prices`, which held the raw price DataFrame until
# now; cells that filter `prices` by ticker cannot be re-run after this one
# without reloading the data.
prices = {}

# One marker colour per ticker, consumed in iteration order.
colors_list = [
    "rgb(95, 142, 211)",
    "rgb(231, 191, 61)",
    "rgb(204, 151, 142)",
    "rgb(0, 119, 194)",
    "rgb(192, 0, 0)",
    "rgb(127, 127, 127)",
]

count = 0  # index into colors_list
for ticker, df in ticker_dfs.items():
    prices[ticker] = go.Scatter(x=df.date,
                                y=df.price,
                                name=ticker,
                                mode='markers',
                                marker=dict(color=colors_list[count]))
    count += 1

In [323]:
# 3x2 grid with one subplot per ticker.
fig_prices = make_subplots(
    rows=3, cols=2,
    specs=[[{}, {}],
           [{}, {}],
           [{}, {}]])

# Fill the grid left to right, two traces per row.
pie_index = 0
for ticker in tickers:
    fig_prices.add_trace(prices[ticker], row=1 + pie_index // 2, col=1 + pie_index % 2)
    pie_index += 1

fig_prices.update_traces(marker=dict(size=6, line=dict(width=0.5, color='rgba(255,255,255,255)')),
                         selector=dict(mode='markers'))

# Cut gaps out of the x axis: a daily hour range and the sat-mon span.
# NOTE(review): presumably these correspond to the periods with no price data
# (outside trading hours and weekends) -- confirm the hour bounds.
fig_prices.update_xaxes(
    rangebreaks=[
        dict(bounds=[21.5, 14.1], pattern="hour"),
        dict(bounds=["sat", "mon"]),
    ],
    gridcolor='lightgray'
)

fig_prices.update_yaxes(gridcolor='lightgray')

# Shade three consecutive date ranges, annotated train / val / test, with
# increasing opacity.
fig_prices.add_vrect(x0="2022-10-31",
                     x1="2022-11-29",
                     annotation_text="train",
                     annotation_position="bottom left",
                     fillcolor="black",
                     opacity=0.05,
                     line_width=0)

fig_prices.add_vrect(x0="2022-11-29",
                     x1="2022-12-08",
                     annotation_text="val",
                     annotation_position="bottom left",
                     fillcolor="black",
                     opacity=0.15,
                     line_width=0)

fig_prices.add_vrect(x0="2022-12-08",
                     x1="2022-12-17",
                     annotation_text="test",
                     annotation_position="bottom left",
                     fillcolor="black",
                     opacity=0.30,
                     line_width=0)

fig_prices.update_layout(showlegend=True,
                         title_text="Stock prices",
                         width=1600, height=900,
                         margin_pad=20,
                         font_family="Verdana",
                         font_color="black",
                         font_size=15,
                         margin=dict(t=50),
                         legend_title_font_color="black",
                         paper_bgcolor='rgba(0,0,0,0)',
                         plot_bgcolor='rgba(0,0,0,0)')
fig_prices.show()

In [325]:
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate exists() test.
os.makedirs("images", exist_ok=True)

fig_prices.write_image("images/prices.png")

## Technical indicators
Add intraday *returns* and *moving average* indicators with periods in [15, 30]

In [134]:
def df_returns(df):
    """Add a ``return`` column: each price minus the price of the previous row.

    The first row has no predecessor and therefore gets NaN. ``df`` is
    modified in place and also returned.
    """
    price_change = df.price.diff()
    df['return'] = price_change
    return df

In [135]:
def df_ma(df, period):
    """Add an ``ma_{period}`` column holding the rolling mean of ``price``.

    The window covers ``period`` rows, so the first ``period - 1`` rows are
    NaN. ``df`` is modified in place and also returned.
    """
    column = f'ma_{period}'
    df[column] = df.price.rolling(window=period).mean()
    return df

In [136]:
# Add the return and the 15- and 30-period moving averages to every ticker frame.
for ticker, ticker_df in ticker_dfs.items():
    ticker_df = df_returns(ticker_df)
    ticker_df = df_ma(ticker_df, 15)
    ticker_df = df_ma(ticker_df, 30)
    # Store the result explicitly rather than relying on the helpers mutating
    # the frame in place (replacing the value of an existing key is safe
    # while iterating over the dict).
    ticker_dfs[ticker] = ticker_df

In [137]:
ticker_dfs['GOOGL'].tail()

Unnamed: 0,ticker,price,date,return,ma_15,ma_30
5414,GOOGL,90.065,2022-12-16 20:00:00,0.175,89.958673,90.272003
5423,GOOGL,90.35,2022-12-16 20:15:00,0.285,89.98144,90.266503
5428,GOOGL,90.66,2022-12-16 20:30:00,0.31,90.02944,90.266837
5436,GOOGL,90.7,2022-12-16 20:45:00,0.04,90.086773,90.26117
5437,GOOGL,90.26,2022-12-16 21:00:00,-0.44,90.098107,90.237503


Analyse the percentage of negative returns for each ticker

In [138]:
# Share of rows with a negative return, as a whole-number percentage of all
# rows of the ticker (rows whose return is NaN count in the denominator).
for ticker, ticker_df in ticker_dfs.items():
    print(f'Negative returns share {ticker}:\t{round(len(ticker_df[ticker_df["return"] < 0]) / len(ticker_df) * 100)}%')

Negative returns share META:	49%
Negative returns share AAPL:	49%
Negative returns share GOOGL:	51%
Negative returns share NFLX:	50%
Negative returns share AMZN:	52%
Negative returns share ^GSPC:	46%


## Fundamental indicators: sentiment values
Add sentiment value for each data point in *prices* dataset. Sentiment value is calculated as a sum of sentiment scores of individual news articles with date of issue in (previous timestamp, current timestamp). General news are considered to have the same impact on all tickers, while ticker specific news are computed individually.

In [139]:
def compute_sent_value(df, dt1, dt2, prefix='general', mode='sum'):
    """Aggregate the sentiment score columns of the articles dated in [dt1, dt2].

    Args:
        df: news DataFrame with a ``date`` column and one column per entry of
            the module-level ``labels`` list.
        dt1, dt2: inclusive bounds of the date window.
        prefix: leading part of the output column names.
        mode: ``'sum'`` sums the scores; any other value takes their mean.

    Returns:
        A one-row DataFrame with columns ``{prefix}_{label}_{mode}``; missing
        values (e.g. the mean of an empty window) are replaced by 0.
    """
    in_window = df[(df.date >= dt1) & (df.date <= dt2)][labels]
    aggregated = in_window.sum() if mode == 'sum' else in_window.mean()
    renamed = aggregated.fillna(0).rename(lambda c: f'{prefix}_{c}_{mode}')
    return renamed.to_frame().T


def df_compute_sent_value(df, df_sentiment, prefix='general', mode='sum'):
    """Append aggregated sentiment score columns to the price frame ``df``.

    For every row of ``df`` the scores in ``df_sentiment`` are aggregated with
    ``compute_sent_value`` over the window from that row's ``date`` to the
    ``date`` of the following row; the last row has no following row.

    NOTE(review): the window runs from the current timestamp to the next one,
    whereas the notebook text describes (previous timestamp, current
    timestamp) -- confirm which is intended.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the columns of ``df``
        followed by the aggregated sentiment columns.
    """
    window_starts = list(df.date)
    window_ends = list(df.shift(-1).date)
    per_row = [compute_sent_value(df_sentiment, start, end, prefix, mode)
               for start, end in zip(window_starts, window_ends)]
    sentiment_values = pd.concat(per_row)
    return pd.concat([df.reset_index(drop=True), sentiment_values.reset_index(drop=True)], axis=1)

Add sentiment value for each data point in *prices* dataset. Sentiment value is calculated as a count of individual negative, neutral and positive news articles with date of issue in (previous timestamp, current timestamp). General news are considered to have the same impact on all tickers, while ticker specific news are computed individually.

In [140]:
def count_sent_value(df, dt1, dt2, prefix='general'):
    """Sum the ``{label}_count`` columns of the articles dated in [dt1, dt2].

    Args:
        df: news DataFrame with a ``date`` column and a ``{label}_count``
            column for every entry of the module-level ``labels`` list.
        dt1, dt2: inclusive bounds of the date window.
        prefix: leading part of the output column names.

    Returns:
        A one-row DataFrame with columns ``{prefix}_{label}_count``.
    """
    count_columns = [f'{l}_count' for l in labels]
    in_window = df[(df.date >= dt1) & (df.date <= dt2)]
    totals = in_window[count_columns].sum().fillna(0)
    return totals.rename(lambda c: f'{prefix}_{c}').to_frame().T


def df_count_sent_value(df, df_sentiment, prefix='general'):
    """Append per-label article counts to the price frame ``df``.

    ``df_sentiment.title_label`` is one-hot encoded into ``{label}_count``
    columns, which are then summed with ``count_sent_value`` over the window
    from each row's ``date`` to the ``date`` of the following row.

    Args:
        df: price DataFrame with a ``date`` column.
        df_sentiment: news DataFrame with ``date`` and ``title_label`` columns.
        prefix: leading part of the output column names.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the columns of ``df``
        followed by ``{prefix}_{label}_count`` for every label.
    """
    # get_dummies only creates columns for labels that actually occur; reindex
    # to the full label list so a missing class yields a zero column instead
    # of a KeyError in count_sent_value.
    label_counts = (pd.get_dummies(df_sentiment.title_label)
                    .reindex(columns=labels, fill_value=0)
                    .rename(lambda c: f'{c}_count', axis=1))
    df_sentiment_new = pd.concat([df_sentiment, label_counts], axis=1)
    sentiment_values = pd.concat([count_sent_value(df_sentiment_new, dt1, dt2, prefix) for dt1, dt2 in
                                  zip(list(df.date), list(df.shift(-1).date))])
    return pd.concat([df.reset_index(drop=True), sentiment_values.reset_index(drop=True)], axis=1)

Add fundamental indicators

In [141]:
# Extend every ticker frame with sentiment features from both news sources.
# NOTE(review): each helper appends columns, so running this cell twice on the
# same kernel state would add the same column names again.
for ticker, ticker_df in ticker_dfs.items():
    # Summed and averaged scores of the general news.
    ticker_df = df_compute_sent_value(ticker_df, general_news_dfs[ticker], 'general', 'sum')
    ticker_df = df_compute_sent_value(ticker_df, general_news_dfs[ticker], 'general', 'mean')

    # Summed and averaged scores of the ticker-specific news.
    ticker_df = df_compute_sent_value(ticker_df, ticker_news_dfs[ticker], 'company', 'sum')
    ticker_df = df_compute_sent_value(ticker_df, ticker_news_dfs[ticker], 'company', 'mean')

    # Article counts per sentiment label, general then ticker-specific.
    ticker_df = df_count_sent_value(ticker_df, general_news_dfs[ticker], 'general')

    ticker_df = df_count_sent_value(ticker_df, ticker_news_dfs[ticker], 'company')

    ticker_dfs[ticker] = ticker_df

Potential feature set

In [142]:
ticker_dfs['AAPL'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ticker                  878 non-null    object        
 1   price                   878 non-null    float64       
 2   date                    878 non-null    datetime64[ns]
 3   return                  877 non-null    float64       
 4   ma_15                   864 non-null    float64       
 5   ma_30                   849 non-null    float64       
 6   general_negative_sum    878 non-null    float64       
 7   general_neutral_sum     878 non-null    float64       
 8   general_positive_sum    878 non-null    float64       
 9   general_negative_mean   878 non-null    float64       
 10  general_neutral_mean    878 non-null    float64       
 11  general_positive_mean   878 non-null    float64       
 12  company_negative_sum    878 non-null    float64   

# Hyperparameter tuning
A subset of the features computed above will be used for the Informer's hyperparameter tuning
The AAPL dataset with features [date, price, ma_15, general_negative_sum, general_neutral_sum, general_positive_sum, general_negative_count, general_neutral_count, general_positive_count] is selected

In [143]:
# Columns exported for hyperparameter tuning.
features = ['date', 'price', 'ma_15', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum',
            'general_negative_count', 'general_neutral_count', 'general_positive_count']
feature_num = len(features) - 1  # date is not considered a feature by the Informer
# Column the model is asked to predict.
target = 'price'

In [144]:
# Tuning dataset: the selected AAPL columns, without rows containing NaN
# (e.g. the first rows of the moving average), indexed by timestamp.
aapl = ticker_dfs['AAPL'][features].dropna()
aapl = aapl.set_index('date')
aapl

Unnamed: 0_level_0,price,ma_15,general_negative_sum,general_neutral_sum,general_positive_sum,general_negative_count,general_neutral_count,general_positive_count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-10-31 17:45:00,153.5800,153.356340,2.168076,1.421074,0.410850,3.0,1.0,0.0
2022-10-31 18:00:00,153.7592,153.224287,1.135553,1.535193,1.329254,1.0,2.0,1.0
2022-10-31 18:15:00,153.9900,153.268953,0.911712,0.075464,0.012824,1.0,0.0,0.0
2022-10-31 18:30:00,153.6499,153.309613,0.000000,0.000000,0.000000,0.0,0.0,0.0
2022-10-31 18:45:00,153.7150,153.381613,1.276192,2.867404,0.856404,1.0,3.0,1.0
...,...,...,...,...,...,...,...,...
2022-12-16 20:00:00,134.5467,134.405313,1.308567,0.465379,0.226054,1.0,1.0,0.0
2022-12-16 20:15:00,134.7516,134.397420,0.000000,0.000000,0.000000,0.0,0.0,0.0
2022-12-16 20:30:00,135.1750,134.438753,1.286704,0.615677,0.097619,1.0,1.0,0.0
2022-12-16 20:45:00,135.0200,134.456753,0.000000,0.000000,0.000000,0.0,0.0,0.0


Saving dataset

In [145]:
aapl.to_csv("./data/stock/data-tuning.csv")

Setting Informer's default parameters

In [146]:
# Baseline argument set handed to train_informer(); the tuning loops copy it
# and override individual entries.
# NOTE(review): the meaning of each key is defined by run_informer, which is
# not part of this notebook -- consult it before changing values.
default_args = {
    'model': 'informer',

    'data': 'data-fine-tuning',
    'root_path': './data/stock',
    'data_path': 'data-tuning.csv',  # file written from the `aapl` frame above
    'features': 'MS',
    'ftr_num': feature_num,  # number of exported columns excluding date
    'd_out': 1,
    'target': target,
    'freq': '15t',

    'seq_len': 30,
    'pred_len': 30,

    'itr': 8,
    'train_epochs': 6,
    'batch_size': 6,
    'patience': 3,
    'learning_rate': 0.0001,
    'loss': 'mse',
    'lradj': 'type1',
    'inverse': False,

    'd_model': 512,
    'n_heads': 10,
    'e_layers': 6,
    'd_ff': 2048,  # the tuning code keeps this at 4 * d_model

    'embed': 't2v',
    'activation': 'gelu',
    'padding': 0,
    'dropout': 0.05,

    'output_attention': False,
    'predict': False,

    'num_workers': 0,
    'use_gpu': True,
    'gpu': 0,
    'use_multi_gpu': False,
    'devices': '0'
}

Hyperparameter tuning is performed using a grid search


In [147]:
# Search grid: every combination of these values is trained once. The key
# order matters, because the grid-search loop unpacks the combinations in
# exactly this order.
parameters = {
    'batch_size': [6, 12, 24],
    'd_model': [256, 512],
    'n_heads': [4, 6, 10],
    'e_layers': [4, 6, 10],
    'activation': ['gelu', 'relu']
}

In [148]:
import itertools

In [149]:
# Cartesian product of all grid values; each tuple follows the key order of `parameters`.
combinations = list(itertools.product(*parameters.values()))
total = len(combinations)
print(f'Param combinations: {total}')

Param combinations: 108


Performing grid search

In [150]:
import time
import numpy as np
from run_informer import train_informer

In [151]:
# Minimum validation loss per parameter combination.
params_evals = {}
counter = 0  # number of finished experiments

start_time = time.time()
avg_exp_time_d = []  # duration of every finished experiment, in seconds

for batch_size, d_model, n_heads, e_layers, activation in combinations:
    # Start from the defaults and override only the tuned entries.
    model_args = default_args.copy()
    model_args['batch_size'] = batch_size
    model_args['d_model'] = d_model
    model_args['n_heads'] = n_heads
    model_args['e_layers'] = e_layers
    model_args['d_ff'] = 4 * d_model
    model_args['activation'] = activation

    exp_time_d = time.time()

    exp = train_informer(model_args, supress_output=True)
    params_evals[(batch_size, d_model, n_heads, e_layers, activation)] = exp.val_loss_min

    exp_time_d = time.time() - exp_time_d
    avg_exp_time_d.append(exp_time_d)

    counter += 1

    hours, remainder = divmod(time.time() - start_time, 3600)
    minutes, seconds = divmod(remainder, 60)

    # Remaining time = experiments still to run * mean duration so far.
    # Counting after the increment fixes the previous estimate, which used
    # `total - counter + 1` before incrementing and so assumed two runs too many.
    hours_r, remainder_r = divmod((total - counter) * np.mean(avg_exp_time_d), 3600)
    minutes_r, seconds_r = divmod(remainder_r, 60)

    print(
        f'{counter:>3} out of {total} - mse: {exp.val_loss_min:.5f} - running for {hours:02.0f}:{minutes:02.0f}:{seconds:02.0f}, time left {hours_r:02.0f}:{minutes_r:02.0f}:{seconds_r:02.0f}')

  1 out of 108 - mse: 0.13374 - running for 00:01:42, time left 03:05:25
  2 out of 108 - mse: 0.14708 - running for 00:03:11, time left 02:52:20
  3 out of 108 - mse: 0.14511 - running for 00:04:59, time left 02:57:33
  4 out of 108 - mse: 0.13989 - running for 00:06:49, time left 03:00:48
  5 out of 108 - mse: 0.16879 - running for 00:09:58, time left 03:29:21
  6 out of 108 - mse: 0.19460 - running for 00:12:57, time left 03:44:24
  7 out of 108 - mse: 0.15451 - running for 00:14:14, time left 03:29:21
  8 out of 108 - mse: 0.14923 - running for 00:15:25, time left 03:16:31
  9 out of 108 - mse: 0.14149 - running for 00:17:17, time left 03:13:52
 10 out of 108 - mse: 0.13013 - running for 00:18:56, time left 03:09:16
 11 out of 108 - mse: 0.18908 - running for 00:21:33, time left 03:13:56
 12 out of 108 - mse: 0.18291 - running for 00:24:22, time left 03:19:00
 13 out of 108 - mse: 0.14683 - running for 00:25:32, time left 03:10:33
 14 out of 108 - mse: 0.15086 - running for 00:26:4

Saving best performing arguments
Printing top 10

In [152]:
# Rank the combinations by validation loss, best (lowest) first.
# NOTE(review): this rebinds `params_evals` from a dict to a list of
# (combination, loss) pairs, so the cell cannot be run a second time without
# repeating the grid search.
params_evals = sorted(params_evals.items(), key=lambda it: it[1])
best_args = params_evals[0][0]
print('{: <10} {: <7} {: <6} {: <8} {: <10} | {: <10}'.format(*parameters.keys(), 'mse'))
print('-' * 69)
for k, v in params_evals[:10]:
    print('{: <10} {: <7} {: <7} {: <8} {: <10} | {: <10.4f}'.format(*k, v))

batch_size d_model n_heads e_layers activation | mse       
---------------------------------------------------------------------
6          512     10      4        relu       | 0.0838    
6          512     4       4        relu       | 0.0914    
6          512     4       6        relu       | 0.0946    
6          512     6       10       relu       | 0.0983    
6          512     10      4        gelu       | 0.0986    
12         512     4       6        gelu       | 0.0997    
12         512     4       4        gelu       | 0.1000    
6          512     4       6        gelu       | 0.1013    
6          512     4       10       gelu       | 0.1058    
12         512     4       4        relu       | 0.1078    


In [153]:
# Copy the defaults before applying the best combination: plain assignment
# would alias the dict and silently overwrite `default_args` as well.
model_args = default_args.copy()
(model_args['batch_size'], model_args['d_model'], model_args['n_heads'], model_args['e_layers'],
 model_args['activation']) = best_args
# Keep the feed-forward width tied to the model width, as in the grid search.
model_args['d_ff'] = 4 * model_args['d_model']
model_args

{'model': 'informer',
 'data': 'data-fine-tuning',
 'root_path': './data/stock',
 'data_path': 'data-tuning.csv',
 'features': 'MS',
 'ftr_num': 8,
 'd_out': 1,
 'target': 'price',
 'freq': '15t',
 'seq_len': 30,
 'pred_len': 30,
 'itr': 8,
 'train_epochs': 6,
 'batch_size': 6,
 'patience': 3,
 'learning_rate': 0.0001,
 'loss': 'mse',
 'lradj': 'type1',
 'inverse': False,
 'd_model': 512,
 'n_heads': 10,
 'e_layers': 4,
 'd_ff': 2048,
 'embed': 't2v',
 'activation': 'relu',
 'padding': 0,
 'dropout': 0.05,
 'output_attention': False,
 'predict': False,
 'num_workers': 0,
 'use_gpu': True,
 'gpu': 0,
 'use_multi_gpu': False,
 'devices': '0'}

## Feature set analysis
Analysing which composition of features performs best on a tuned model
Checking if media sentiment analysis is beneficial

In [470]:
# Export every ticker frame (all columns except `ticker`, rows with NaN
# dropped, indexed by date) to ./data/stock/<ticker>.csv.
# NOTE(review): eval_ftr_set later writes to the same path, so these files are
# overwritten once the feature-set analysis runs.
for ticker in tickers:
    ticker_dfs[ticker].loc[:, ~ticker_dfs[ticker].columns.isin(['ticker'])].dropna().set_index('date').to_csv(f'./data/stock/{ticker}.csv')

In [154]:
ticker_dfs['AAPL'].columns.tolist()

['ticker',
 'price',
 'date',
 'return',
 'ma_15',
 'ma_30',
 'general_negative_sum',
 'general_neutral_sum',
 'general_positive_sum',
 'general_negative_mean',
 'general_neutral_mean',
 'general_positive_mean',
 'company_negative_sum',
 'company_neutral_sum',
 'company_positive_sum',
 'company_negative_mean',
 'company_neutral_mean',
 'company_positive_mean',
 'general_negative_count',
 'general_neutral_count',
 'general_positive_count',
 'company_negative_count',
 'company_neutral_count',
 'company_positive_count']

In [179]:
# Candidate feature sets to compare. Every set starts with 'date' (used as the
# index, not as a model input) and 'price' (the target); the remaining columns
# vary between technical indicators and general/company sentiment features.
feature_sets = [
    ['date', 'price'],
    ['date', 'price', 'return'],
    ['date', 'price', 'ma_15'],
    ['date', 'price', 'ma_30'],
    ['date', 'price', 'return', 'ma_15'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'general_negative_count',
     'general_neutral_count', 'general_positive_count'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean',
     'general_negative_count', 'general_neutral_count', 'general_positive_count'],
    ['date', 'price', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum'],
    ['date', 'price', 'company_negative_sum', 'company_neutral_sum', 'company_positive_sum', 'company_negative_count',
     'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'company_negative_mean', 'company_neutral_mean', 'company_positive_mean'],
    ['date', 'price', 'company_negative_mean', 'company_neutral_mean', 'company_positive_mean',
     'company_negative_count', 'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'company_negative_sum',
     'company_neutral_sum', 'company_positive_sum'],
    ['date', 'price', 'general_negative_sum', 'general_neutral_sum', 'general_positive_sum', 'general_negative_count',
     'general_neutral_count', 'general_positive_count', 'company_negative_sum', 'company_neutral_sum',
     'company_positive_sum', 'company_negative_count', 'company_neutral_count', 'company_positive_count'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean', 'company_negative_mean',
     'company_neutral_mean', 'company_positive_mean'],
    ['date', 'price', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean', 'company_negative_mean',
     'company_neutral_mean', 'company_positive_mean', 'company_negative_count', 'company_neutral_count',
     'company_positive_count'],
    ['date', 'price', 'return', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
    ['date', 'price', 'ma_15', 'general_negative_mean', 'general_neutral_mean', 'general_positive_mean'],
]

# Number of feature sets (recomputed as the full experiment count before the analysis loop).
total = len(feature_sets)

In [180]:
def eval_ftr_set(label, ftr_set, seq_len, pred_len):
    """Train and test an Informer model for one ticker on one feature set.

    The columns listed in ``ftr_set`` are taken from ``ticker_dfs[label]``, rows
    with missing values are dropped, and the result is written to
    ``./data/stock/{label}.csv`` with ``date`` as the index. The run is
    configured from a copy of ``model_args``, so the shared defaults are not
    modified.

    Parameters
    ----------
    label : key into ``ticker_dfs``; also used as the CSV file name.
    ftr_set : list of column names; must contain ``'date'``.
    seq_len : value stored under the ``'seq_len'`` argument.
    pred_len : value stored under the ``'pred_len'`` argument.

    Returns
    -------
    The first element of the value returned by ``.test()`` on the trained
    experiment (the caller stores it as the MSE).
    """
    frame = ticker_dfs[label][ftr_set].dropna().set_index('date')
    frame.to_csv(f'./data/stock/{label}.csv')

    run_args = model_args.copy()
    run_args.update({
        'data_path': f'{label}.csv',
        'ftr_num': len(ftr_set) - 1,  # every selected column except 'date'
        'seq_len': seq_len,
        'pred_len': pred_len,
    })

    experiment = train_informer(run_args, supress_output=True)
    return experiment.test()[0]

Perform the feature set analysis for several prediction lengths (15, 30 and 75 steps) with a fixed input sequence length of 30

In [181]:
model_evals = {}
seq_pred_lengths = [[30, 15], [30, 30], [30, 75]]

counter = 0
total = len(seq_pred_lengths) * len(tickers) * len(feature_sets)

start_time = time.time()
avg_exp_time_d = []


def _format_hms(duration_s):
    """Format a duration given in seconds as HH:MM:SS, truncated to whole seconds."""
    minutes, seconds = divmod(int(duration_s), 60)
    hours, minutes = divmod(minutes, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'


# Evaluate every (ticker, feature set, sequence/prediction length) combination and
# store the score under model_evals[ticker][tuple(feature_set)][tuple(seq_pred)].
for ticker in tickers:
    model_evals[ticker] = {}
    for feature_set in feature_sets:
        model_evals[ticker][tuple(feature_set)] = {}
        for seq_pred in seq_pred_lengths:
            exp_time_d = time.time()

            mse = eval_ftr_set(ticker, feature_set, seq_pred[0], seq_pred[1])
            model_evals[ticker][tuple(feature_set)][tuple(seq_pred)] = mse

            exp_time_d = time.time() - exp_time_d
            avg_exp_time_d.append(exp_time_d)

            # Count the finished experiment first, so that `total - counter` is
            # exactly the number of experiments that still have to run.
            counter += 1

            elapsed = time.time() - start_time
            mean_exp_time = sum(avg_exp_time_d) / len(avg_exp_time_d)
            remaining = (total - counter) * mean_exp_time

            print(
                f'{counter:>3} out of {total} - mse: {mse:.5f} - running for {_format_hms(elapsed)}, time left {_format_hms(remaining)}')

  1 out of 342 - mse: 0.16575 - running for 00:01:36, time left 09:06:22
  2 out of 342 - mse: 0.32248 - running for 00:03:01, time left 08:35:08
  3 out of 342 - mse: 0.66664 - running for 00:04:08, time left 07:49:13
  4 out of 342 - mse: 0.15108 - running for 00:05:25, time left 07:39:56
  5 out of 342 - mse: 0.29221 - running for 00:06:44, time left 07:37:04
  6 out of 342 - mse: 0.46681 - running for 00:07:55, time left 07:26:04
  7 out of 342 - mse: 0.16865 - running for 00:09:33, time left 07:39:29
  8 out of 342 - mse: 0.23345 - running for 00:11:00, time left 07:42:04
  9 out of 342 - mse: 0.76873 - running for 00:12:11, time left 07:33:15
 10 out of 342 - mse: 0.25482 - running for 00:13:41, time left 07:36:50
 11 out of 342 - mse: 0.32915 - running for 00:15:05, time left 07:36:34
 12 out of 342 - mse: 0.66253 - running for 00:16:20, time left 07:32:05
 13 out of 342 - mse: 0.21040 - running for 00:17:33, time left 07:26:46
 14 out of 342 - mse: 0.39878 - running for 00:18:5

Present the results of the feature set analysis for each studied prediction length

In [192]:
import numpy as np

In [410]:
# Short display labels for the evaluated feature sets. They are used as the row
# index of the result tables and are matched to `feature_sets` by position, so
# both lists must stay in the same order.
features = [
    'fs-price',
    'fs-return',
    'fs-ma15',
    'fs-ma30',
    'fs-return-ma15',
    'fs-general-sum',
    'fs-general-sum-count',
    'fs-general-mean',
    'fs-general-mean-count',
    'fs-company-sum',
    'fs-company-sum-count',
    'fs-company-mean',
    'fs-company-mean-count',
    'fs-general-company-sum',
    'fs-general-company-sum-count',
    'fs-general-company-mean',
    'fs-general-company-mean-count',
    'fs-return-general-mean',
    'fs-ma15-general-mean'
]

In [228]:
# One row per feature set. Within a row the values are grouped by prediction
# length (15, 30, 75) and ordered by ticker inside each group; every value is
# the stored score for input length 30, rounded to three decimals.
rows = np.array([
    [
        round(model_evals[ticker][tuple(ftr_set)][(30, pred_len)], 3)
        for pred_len in [15, 30, 75]
        for ticker in tickers
    ]
    for ftr_set in feature_sets
])

In [229]:
def df_avg_best(data, index=None, columns=None, benchmark='^GSPC'):
    """Build a result table with 'Average' and 'Best' summary columns.

    Parameters
    ----------
    data : 2-D array-like or DataFrame
        One row per feature set and one column per ticker.
    index : sequence, optional
        Row labels. Defaults to the notebook-level ``features`` list.
    columns : sequence, optional
        Column labels, one per column of ``data``. Defaults to the
        notebook-level ``tickers``.
    benchmark : str, optional
        Column that is excluded from the summary statistics and appended
        unchanged as the last column.

    Returns
    -------
    pandas.DataFrame
        The non-benchmark columns, followed by
        ``'Average'`` (row mean of the column-wise min-max normalised values,
        rounded to three decimals), ``'Best'`` (number of columns in which the
        row holds the column minimum) and the benchmark column.
    """
    index = features if index is None else index
    columns = tickers if columns is None else columns

    df = pd.DataFrame(data=data, index=index)
    df.columns = list(columns)

    # Work on an explicit copy: adding columns to a bare `.loc` selection raises
    # SettingWithCopyWarning, which the notebook-wide warnings filter would hide.
    result = df.loc[:, ~df.columns.isin([benchmark])].copy()

    col_min = result.min(numeric_only=True)
    col_max = result.max(numeric_only=True)

    # Both statistics are derived before any summary column is added, so they
    # only ever see the ticker columns.
    average = ((result - col_min) / (col_max - col_min)).mean(numeric_only=True, axis=1).round(3)
    best = result.eq(col_min).sum(axis=1)

    result['Average'] = average
    result['Best'] = best
    result[benchmark] = df[benchmark]

    return result

In [230]:
# `rows` holds one group of len(tickers) columns per prediction length, in the
# order 15, 30, 75. The group width is derived from `tickers` instead of being
# hard-coded.
n_tickers = len(tickers)
res15 = df_avg_best(rows[:, :n_tickers])
res30 = df_avg_best(rows[:, n_tickers:2 * n_tickers])
res75 = df_avg_best(rows[:, 2 * n_tickers:3 * n_tickers])

In [414]:
# Write each result table to its own CSV file; the next cell reads these files back.
for _frame, _csv_path in (
    (res15, './results/res15.csv'),
    (res30, './results/res30.csv'),
    (res75, './results/res75.csv'),
):
    _frame.to_csv(_csv_path)

In [260]:
# Reload the saved result tables; the first CSV column holds the row labels.
res15 = pd.read_csv('./results/res15.csv', index_col=0)
res30 = pd.read_csv('./results/res30.csv', index_col=0)
# The 75-step table must be bound to `res75`, the name every later cell reads.
res75 = pd.read_csv('./results/res75.csv', index_col=0)
res90 = res75  # old name kept as an alias in case other cells still refer to it

In [283]:
# Recompute the summary columns of the 15-step table from its ticker columns.
# .copy() gives an independent frame, so the assignments below do not trigger
# SettingWithCopyWarning (which the notebook-wide warnings filter would hide).
tmp = res15.loc[:, ~res15.columns.isin(['Average', 'Best', '^GSPC'])].copy()
mins = tmp.min(numeric_only=True)
# Count the per-ticker minima before 'Average' is added, so that only ticker
# columns take part in the comparison.
best_counts = tmp.eq(mins).sum(axis=1)
# Min-max normalise every ticker column, then average across tickers per row.
tmp['Average'] = ((tmp - tmp.min()) / (tmp.max() - tmp.min())).mean(numeric_only=True, axis=1).round(3)
tmp['Best'] = best_counts
tmp['^GSPC'] = res15['^GSPC']
res15 = tmp

In [284]:
# Recompute the summary columns of the 30-step table from its ticker columns.
# .copy() gives an independent frame, so the assignments below do not trigger
# SettingWithCopyWarning (which the notebook-wide warnings filter would hide).
tmp = res30.loc[:, ~res30.columns.isin(['Average', 'Best', '^GSPC'])].copy()
mins = tmp.min(numeric_only=True)
# Count the per-ticker minima before 'Average' is added, so that only ticker
# columns take part in the comparison.
best_counts = tmp.eq(mins).sum(axis=1)
# Min-max normalise every ticker column, then average across tickers per row.
tmp['Average'] = ((tmp - tmp.min()) / (tmp.max() - tmp.min())).mean(numeric_only=True, axis=1).round(3)
tmp['Best'] = best_counts
tmp['^GSPC'] = res30['^GSPC']
res30 = tmp

In [285]:
# Recompute the summary columns of the 75-step table from its ticker columns.
# .copy() gives an independent frame, so the assignments below do not trigger
# SettingWithCopyWarning (which the notebook-wide warnings filter would hide).
tmp = res75.loc[:, ~res75.columns.isin(['Average', 'Best', '^GSPC'])].copy()
mins = tmp.min(numeric_only=True)
# Count the per-ticker minima before 'Average' is added, so that only ticker
# columns take part in the comparison.
best_counts = tmp.eq(mins).sum(axis=1)
# Min-max normalise every ticker column, then average across tickers per row.
tmp['Average'] = ((tmp - tmp.min()) / (tmp.max() - tmp.min())).mean(numeric_only=True, axis=1).round(3)
tmp['Best'] = best_counts
tmp['^GSPC'] = res75['^GSPC']
res75 = tmp

In [None]:
# Label the rows of every result table with the feature-set names and call the
# index 'feature-set'. `assign` returns a new frame, so the existing tables are
# not modified in place, and all three tables are handled in one pass instead
# of editing and re-running the cell once per table.
# NOTE(review): `features` is applied by position, so the rows must still be in
# the order of the `features` list at this point.
res15, res30, res75 = (
    frame.assign(**{'feature-set': features}).set_index('feature-set')
    for frame in (res15, res30, res75)
)
res75

In [415]:
# Rank by 'Average' (lowest first); ties go to the row with the higher 'Best' count.
res15.sort_values(by=['Average', 'Best'], ascending=[True, False])

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,Average,Best,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fs-price,0.172,0.215,0.241,0.421,0.217,0.075,0,0.244
fs-return-general-mean,0.156,0.182,0.297,0.472,0.189,0.094,1,0.18
fs-return-ma15,0.244,0.207,0.268,0.444,0.164,0.107,1,0.166
fs-return,0.202,0.216,0.245,0.541,0.201,0.123,0,0.169
fs-general-sum-count,0.412,0.226,0.233,0.376,0.188,0.168,2,0.233
fs-general-sum,0.368,0.227,0.275,0.378,0.186,0.175,0,0.23
fs-ma15,0.201,0.286,0.3,0.525,0.257,0.215,0,0.187
fs-ma30,0.213,0.326,0.275,0.497,0.304,0.237,0,0.198
fs-general-company-sum,0.315,0.232,0.263,0.742,0.206,0.265,0,0.233
fs-general-company-sum-count,0.316,0.17,0.241,1.019,0.213,0.314,1,0.264


In [416]:
# Rank by 'Best' count (highest first), then by 'Average' (lowest first); show the top 4.
res15.sort_values(by=['Best', 'Average'], ascending=[False, True]).head(4)

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,Average,Best,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fs-general-sum-count,0.412,0.226,0.233,0.376,0.188,0.168,2,0.233
fs-return-general-mean,0.156,0.182,0.297,0.472,0.189,0.094,1,0.18
fs-return-ma15,0.244,0.207,0.268,0.444,0.164,0.107,1,0.166
fs-general-company-sum-count,0.316,0.17,0.241,1.019,0.213,0.314,1,0.264


In [417]:
# Rank by 'Average' (lowest first); ties go to the row with the higher 'Best' count.
res30.sort_values(by=['Average', 'Best'], ascending=[True, False])

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,Average,Best,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fs-return,0.295,0.434,0.387,0.833,0.362,0.149,1,0.295
fs-return-ma15,0.256,0.436,0.472,0.756,0.34,0.149,0,0.269
fs-general-sum-count,0.345,0.488,0.415,0.698,0.366,0.164,0,0.404
fs-price,0.379,0.634,0.506,0.592,0.285,0.198,0,0.409
fs-general-sum,0.575,0.47,0.498,0.688,0.279,0.251,1,0.411
fs-ma15,0.345,0.661,0.526,0.674,0.45,0.265,0,0.285
fs-return-general-mean,0.214,0.435,0.691,0.652,0.56,0.272,1,0.376
fs-general-company-sum,0.447,0.258,0.569,1.255,0.33,0.319,0,0.419
fs-general-company-sum-count,0.383,0.33,0.459,1.152,0.581,0.32,0,0.437
fs-general-company-mean-count,0.369,0.787,0.509,0.572,0.692,0.344,1,0.736


In [418]:
# Rank by 'Best' count (highest first), then by 'Average' (lowest first); show the top 6.
res30.sort_values(by=['Best', 'Average'], ascending=[False, True]).head(6)

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,Average,Best,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fs-return,0.295,0.434,0.387,0.833,0.362,0.149,1,0.295
fs-general-sum,0.575,0.47,0.498,0.688,0.279,0.251,1,0.411
fs-return-general-mean,0.214,0.435,0.691,0.652,0.56,0.272,1,0.376
fs-general-company-mean-count,0.369,0.787,0.509,0.572,0.692,0.344,1,0.736
fs-company-sum-count,0.76,0.207,0.493,1.606,0.439,0.493,1,0.498
fs-return-ma15,0.256,0.436,0.472,0.756,0.34,0.149,0,0.269


In [419]:
# Rank by 'Average' (lowest first); ties go to the row with the higher 'Best' count.
res75.sort_values(by=['Average', 'Best'], ascending=[True, False])

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,Average,Best,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fs-general-company-mean,0.708,0.928,0.666,1.296,0.838,0.162,1,0.904
fs-general-company-mean-count,0.903,0.982,0.588,0.876,1.933,0.257,1,0.633
fs-return-general-mean,0.521,1.019,0.87,1.642,1.208,0.267,0,0.414
fs-ma15,0.769,1.072,1.051,1.188,1.197,0.283,0,0.486
fs-general-sum-count,0.638,0.949,1.234,1.675,0.929,0.287,0,0.55
fs-company-mean,0.845,0.941,0.7,1.676,1.282,0.29,0,0.566
fs-ma15-general-mean,0.735,1.268,0.558,1.413,1.259,0.309,0,0.501
fs-return,0.467,1.114,0.846,1.694,1.405,0.31,1,0.49
fs-general-mean-count,0.668,1.62,0.376,1.762,0.539,0.346,2,0.741
fs-general-mean,0.699,1.22,1.239,1.536,0.926,0.355,0,0.521


In [420]:
# Rank by 'Best' count (highest first), then by 'Average' (lowest first); show the top 5.
res75.sort_values(by=['Best', 'Average'], ascending=[False, True]).head(5)

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,Average,Best,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fs-general-mean-count,0.668,1.62,0.376,1.762,0.539,0.346,2,0.741
fs-general-company-mean,0.708,0.928,0.666,1.296,0.838,0.162,1,0.904
fs-general-company-mean-count,0.903,0.982,0.588,0.876,1.933,0.257,1,0.633
fs-return,0.467,1.114,0.846,1.694,1.405,0.31,1,0.49
fs-return-general-mean,0.521,1.019,0.87,1.642,1.208,0.267,0,0.414


Display each ticker's min-max-normalised MSE per feature set, averaged over all prediction lengths.

In [421]:
def min_max_norm(data):
    """Rescale ``data`` linearly so that its minimum maps to 0 and its maximum to 1.

    Uses the ``min()``/``max()`` of ``data`` itself; if both are equal the
    division is 0/0 and the result is NaN.
    """
    lowest = data.min()
    value_range = data.max() - lowest
    return (data - lowest) / value_range

In [422]:
# For every ticker: min-max normalise its column within each prediction-length
# table, stack the three normalised columns and average them per feature set.
ticker_means = {}

for ticker in tickers:
    per_horizon = [min_max_norm(res[ticker]) for res in (res15, res30, res75)]
    stacked = pd.concat(per_horizon)
    ticker_means[ticker] = stacked.groupby(stacked.index).mean()

In [423]:
# `ticker_means` starts out as a dict of per-ticker Series and is replaced here
# by one DataFrame with a column per ticker. The guard keeps the cell
# re-runnable: once it is a DataFrame, `.values` is an array and cannot be called.
if isinstance(ticker_means, dict):
    ticker_means = pd.concat(list(ticker_means.values()), axis=1)
ticker_means

Unnamed: 0_level_0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC
feature-set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fs-company-mean,0.461781,0.347035,0.424487,0.338746,0.332122,0.473488
fs-company-mean-count,0.632516,0.318173,0.332265,0.511845,0.554197,0.748473
fs-company-sum,0.534283,0.134049,0.363916,0.752968,0.258657,0.360639
fs-company-sum-count,1.0,0.167465,0.264016,0.97719,0.336313,0.45022
fs-general-company-mean,0.294437,0.620185,0.330434,0.202876,0.471309,0.690717
fs-general-company-mean-count,0.346086,0.437528,0.19148,0.081908,0.563582,0.815646
fs-general-company-sum,0.366929,0.301764,0.253978,0.478458,0.369127,0.371411
fs-general-company-sum-count,0.287897,0.256225,0.195387,0.767463,0.421197,0.446816
fs-general-mean,0.339961,0.611389,0.843149,0.453519,0.303855,0.522412
fs-general-mean-count,0.329546,0.899234,0.309141,0.429909,0.217002,0.650488


In [424]:
# Feature sets for META, lowest mean normalised MSE first.
ticker_means['META'].sort_values(ascending=True)

feature-set
fs-return-general-mean           0.014694
fs-return                        0.088868
fs-return-ma15                   0.102409
fs-ma30                          0.157732
fs-price                         0.168865
fs-ma15                          0.200713
fs-general-company-sum-count     0.287897
fs-general-company-mean          0.294437
fs-ma15-general-mean             0.325512
fs-general-mean-count            0.329546
fs-general-mean                  0.339961
fs-general-sum-count             0.345872
fs-general-company-mean-count    0.346086
fs-general-company-sum           0.366929
fs-company-mean                  0.461781
fs-general-sum                   0.513074
fs-company-sum                   0.534283
fs-company-mean-count            0.632516
fs-company-sum-count             1.000000
Name: META, dtype: float64

In [425]:
# Feature sets for AAPL, lowest mean normalised MSE first.
ticker_means['AAPL'].sort_values(ascending=True)

feature-set
fs-return-general-mean           0.123602
fs-company-sum                   0.134049
fs-general-sum-count             0.138277
fs-company-sum-count             0.167465
fs-return                        0.191482
fs-general-company-sum-count     0.256225
fs-return-ma15                   0.263681
fs-ma15                          0.292774
fs-general-company-sum           0.301764
fs-company-mean-count            0.318173
fs-price                         0.329003
fs-company-mean                  0.347035
fs-general-sum                   0.353589
fs-general-company-mean-count    0.437528
fs-ma15-general-mean             0.467487
fs-general-mean                  0.611389
fs-general-company-mean          0.620185
fs-ma30                          0.648833
fs-general-mean-count            0.899234
Name: AAPL, dtype: float64

In [426]:
# Feature sets for GOOGL, lowest mean normalised MSE first.
ticker_means['GOOGL'].sort_values(ascending=True)

feature-set
fs-return                        0.110349
fs-general-company-mean-count    0.191480
fs-general-sum-count             0.195027
fs-general-company-sum-count     0.195387
fs-general-company-sum           0.253978
fs-return-ma15                   0.263903
fs-company-sum-count             0.264016
fs-ma30                          0.288066
fs-general-mean-count            0.309141
fs-ma15                          0.314650
fs-general-sum                   0.319619
fs-general-company-mean          0.330434
fs-company-mean-count            0.332265
fs-company-sum                   0.363916
fs-ma15-general-mean             0.376279
fs-return-general-mean           0.389381
fs-company-mean                  0.424487
fs-price                         0.425982
fs-general-mean                  0.843149
Name: GOOGL, dtype: float64

In [427]:
# Feature sets for NFLX, lowest mean normalised MSE first.
ticker_means['NFLX'].sort_values(ascending=True)

feature-set
fs-general-sum                   0.054730
fs-general-company-mean-count    0.081908
fs-price                         0.103899
fs-ma15                          0.179783
fs-general-company-mean          0.202876
fs-return-ma15                   0.211112
fs-general-sum-count             0.219007
fs-return-general-mean           0.246577
fs-ma15-general-mean             0.271416
fs-ma30                          0.299717
fs-company-mean                  0.338746
fs-return                        0.352306
fs-general-mean-count            0.429909
fs-general-mean                  0.453519
fs-general-company-sum           0.478458
fs-company-mean-count            0.511845
fs-company-sum                   0.752968
fs-general-company-sum-count     0.767463
fs-company-sum-count             0.977190
Name: NFLX, dtype: float64

In [428]:
# Feature sets for AMZN, lowest mean normalised MSE first.
ticker_means['AMZN'].sort_values(ascending=True)

feature-set
fs-general-sum                   0.080188
fs-price                         0.082661
fs-general-sum-count             0.133159
fs-return-ma15                   0.185428
fs-general-mean-count            0.217002
fs-return                        0.225303
fs-company-sum                   0.258657
fs-return-general-mean           0.280717
fs-ma15                          0.283853
fs-general-mean                  0.303855
fs-company-mean                  0.332122
fs-company-sum-count             0.336313
fs-ma15-general-mean             0.349886
fs-general-company-sum           0.369127
fs-general-company-sum-count     0.421197
fs-general-company-mean          0.471309
fs-company-mean-count            0.554197
fs-general-company-mean-count    0.563582
fs-ma30                          0.791778
Name: AMZN, dtype: float64

In [429]:
# Feature sets for ^GSPC, lowest mean normalised MSE first.
ticker_means['^GSPC'].sort_values(ascending=True)

feature-set
fs-return-ma15                   0.014966
fs-return                        0.076247
fs-ma15                          0.102316
fs-return-general-mean           0.104318
fs-ma30                          0.227754
fs-general-sum                   0.263795
fs-price                         0.265141
fs-general-sum-count             0.322609
fs-ma15-general-mean             0.324251
fs-company-sum                   0.360639
fs-general-company-sum           0.371411
fs-general-company-sum-count     0.446816
fs-company-sum-count             0.450220
fs-company-mean                  0.473488
fs-general-mean                  0.522412
fs-general-mean-count            0.650488
fs-general-company-mean          0.690717
fs-company-mean-count            0.748473
fs-general-company-mean-count    0.815646
Name: ^GSPC, dtype: float64

Average normalised MSE across all tickers (excluding ^GSPC) and all prediction lengths

In [430]:
# Row mean over every column except ^GSPC, lowest first.
ticker_means.loc[:, ticker_means.columns != '^GSPC'].mean(axis=1).sort_values()

feature-set
fs-return                        0.193662
fs-return-ma15                   0.205306
fs-general-sum-count             0.206268
fs-return-general-mean           0.210994
fs-price                         0.222082
fs-ma15                          0.254355
fs-general-sum                   0.264240
fs-general-company-mean-count    0.324117
fs-general-company-sum           0.354051
fs-ma15-general-mean             0.358116
fs-company-mean                  0.380834
fs-general-company-mean          0.383848
fs-general-company-sum-count     0.385634
fs-company-sum                   0.408775
fs-general-mean-count            0.436967
fs-ma30                          0.437225
fs-company-mean-count            0.469799
fs-general-mean                  0.510374
fs-company-sum-count             0.548997
dtype: float64

In [448]:
# Print the ranked 15-step table as LaTeX table rows.
df = res15.sort_values(by=['Average', 'Best'], ascending=[True, False])
for index, row in zip(df.index.tolist(), df.values.tolist()):
    # Raw string: the row terminator contains backslashes that are not valid
    # Python escape sequences. The printed text is unchanged.
    print(f' & {index:>40} & ', ' & '.join(['%.3f' % e for e in row]), r'\\ \cline{2-10}')

 &                                 fs-price &  0.172 & 0.215 & 0.241 & 0.421 & 0.217 & 0.075 & 0.000 & 0.244 \\ \cline{2-10}
 &                   fs-return-general-mean &  0.156 & 0.182 & 0.297 & 0.472 & 0.189 & 0.094 & 1.000 & 0.180 \\ \cline{2-10}
 &                           fs-return-ma15 &  0.244 & 0.207 & 0.268 & 0.444 & 0.164 & 0.107 & 1.000 & 0.166 \\ \cline{2-10}
 &                                fs-return &  0.202 & 0.216 & 0.245 & 0.541 & 0.201 & 0.123 & 0.000 & 0.169 \\ \cline{2-10}
 &                     fs-general-sum-count &  0.412 & 0.226 & 0.233 & 0.376 & 0.188 & 0.168 & 2.000 & 0.233 \\ \cline{2-10}
 &                           fs-general-sum &  0.368 & 0.227 & 0.275 & 0.378 & 0.186 & 0.175 & 0.000 & 0.230 \\ \cline{2-10}
 &                                  fs-ma15 &  0.201 & 0.286 & 0.300 & 0.525 & 0.257 & 0.215 & 0.000 & 0.187 \\ \cline{2-10}
 &                                  fs-ma30 &  0.213 & 0.326 & 0.275 & 0.497 & 0.304 & 0.237 & 0.000 & 0.198 \\ \cline{2-10}


In [463]:
# Print, per ticker, the five feature sets with the lowest mean normalised MSE
# as LaTeX table rows. Raw strings are used because the row terminators contain
# backslashes that are not valid Python escape sequences; the printed text is
# unchanged.
for ticker in tickers:
    top5 = ticker_means[ticker].sort_values().head(5)
    for label, score in top5.items():
        print(f'& {label:>40} & {score:.3f}', r'\\ \cline{2-10}')
    print(r'\\ \hline')

&                   fs-return-general-mean & 0.015 \\ \cline{2-10}
&                                fs-return & 0.089 \\ \cline{2-10}
&                           fs-return-ma15 & 0.102 \\ \cline{2-10}
&                                  fs-ma30 & 0.158 \\ \cline{2-10}
&                                 fs-price & 0.169 \\ \cline{2-10}
\\ \hline
&                   fs-return-general-mean & 0.124 \\ \cline{2-10}
&                           fs-company-sum & 0.134 \\ \cline{2-10}
&                     fs-general-sum-count & 0.138 \\ \cline{2-10}
&                     fs-company-sum-count & 0.167 \\ \cline{2-10}
&                                fs-return & 0.191 \\ \cline{2-10}
\\ \hline
&                                fs-return & 0.110 \\ \cline{2-10}
&            fs-general-company-mean-count & 0.191 \\ \cline{2-10}
&                     fs-general-sum-count & 0.195 \\ \cline{2-10}
&             fs-general-company-sum-count & 0.195 \\ \cline{2-10}
&                   fs-general-company-sum

In [465]:
# Summary table built from the per-ticker means, shown in feature-set order.
# The dangling `.sort` attribute was removed: it was never called and does not
# exist on a DataFrame in current pandas.
df_avg_best(ticker_means)

Unnamed: 0,META,AAPL,GOOGL,NFLX,AMZN,^GSPC,Average,Best
fs-price,0.168865,0.329003,0.425982,0.103899,0.082661,0.265141,0.204,0
fs-return,0.088868,0.191482,0.110349,0.352306,0.225303,0.076247,0.128,1
fs-ma15,0.200713,0.292774,0.31465,0.179783,0.283853,0.102316,0.203,0
fs-ma30,0.157732,0.648833,0.288066,0.299717,0.791778,0.227754,0.433,0
fs-return-ma15,0.102409,0.263681,0.263903,0.211112,0.185428,0.014966,0.133,1
fs-general-sum,0.513074,0.353589,0.319619,0.05473,0.080188,0.263795,0.233,2
fs-general-sum-count,0.345872,0.138277,0.195027,0.219007,0.133159,0.322609,0.185,0
fs-general-mean,0.339961,0.611389,0.843149,0.453519,0.303855,0.522412,0.557,0
fs-general-mean-count,0.329546,0.899234,0.309141,0.429909,0.217002,0.650488,0.497,0
fs-company-sum,0.534283,0.134049,0.363916,0.752968,0.258657,0.360639,0.388,0


In [467]:
# Print the ranked 15-step table as LaTeX table rows (no leading column separator).
df = res15.sort_values(by=['Average', 'Best'], ascending=[True, False])
for index, row in zip(df.index.tolist(), df.values.tolist()):
    # Raw string: the row terminator contains backslashes that are not valid
    # Python escape sequences. The printed text is unchanged.
    print(f' {index:>40} & ', ' & '.join(['%.3f' % e for e in row]), r'\\ \cline')

                                 fs-price &  0.172 & 0.215 & 0.241 & 0.421 & 0.217 & 0.075 & 0.000 & 0.244 \\ \cline
                   fs-return-general-mean &  0.156 & 0.182 & 0.297 & 0.472 & 0.189 & 0.094 & 1.000 & 0.180 \\ \cline
                           fs-return-ma15 &  0.244 & 0.207 & 0.268 & 0.444 & 0.164 & 0.107 & 1.000 & 0.166 \\ \cline
                                fs-return &  0.202 & 0.216 & 0.245 & 0.541 & 0.201 & 0.123 & 0.000 & 0.169 \\ \cline
                     fs-general-sum-count &  0.412 & 0.226 & 0.233 & 0.376 & 0.188 & 0.168 & 2.000 & 0.233 \\ \cline
                           fs-general-sum &  0.368 & 0.227 & 0.275 & 0.378 & 0.186 & 0.175 & 0.000 & 0.230 \\ \cline
                                  fs-ma15 &  0.201 & 0.286 & 0.300 & 0.525 & 0.257 & 0.215 & 0.000 & 0.187 \\ \cline
                                  fs-ma30 &  0.213 & 0.326 & 0.275 & 0.497 & 0.304 & 0.237 & 0.000 & 0.198 \\ \cline
                   fs-general-company-sum &  0.315 & 0.232 & 0.2