In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import scienceplots
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller

import sys
import os
from dotenv import load_dotenv

# Load environment variables (REPO_PATH is read below) from a local .env file.
load_dotenv()
# NOTE(review): this silences every warning for the whole notebook; consider
# narrowing the filter so library warnings about the fitted models stay visible.
warnings.filterwarnings("ignore")
plt.style.use('science')

REPO_PATH = os.getenv("REPO_PATH")
if REPO_PATH is None:
    # Fail here with an actionable message instead of later with an opaque
    # import error or a TypeError inside os.path.join.
    raise RuntimeError(
        "Environment variable REPO_PATH is not set; define it in the shell or in a .env file."
    )

# Import main utility functions
# os.path.join inserts the separator when REPO_PATH has no trailing slash and
# leaves the path untouched when it already has one.
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
from utils.var_utils import plot_criterion, grangers_causation_matrix, SentVAR

### Load data

In [None]:
# Instrument codes; each one keys an entry of the `dfs` dictionary built below.
FUTURES = ['CLc1', 'LCOc1']

# Base topic tags.
TOPICS = ['CRU', 'CWP', 'CEN']

# Three numbered sub-tags per base topic: '<topic>_0' .. '<topic>_2'.
INTER_TOPICS = [
    f"{base_tag}_{idx}"
    for base_tag in TOPICS
    for idx in range(3)
]

# Five numbered cross-topic tags: 'CT_0' .. 'CT_4'.
CROSS_TOPICS = [
    f'CT_{idx}'
    for idx in range(5)
]

def sentiment_cols(tags: list[str]) -> list[str]:
    """Build the sentiment column names for the given tags.

    Every tag is combined with every analyzer name as ``<tag>_<analyzer>``.
    The result is grouped by analyzer: all tags suffixed with ``VADER`` come
    first, followed by all tags suffixed with ``TextBlob``.

    Args:
        tags: Tag names to expand.

    Returns:
        The expanded column names; an empty list when ``tags`` is empty.
    """
    columns: list[str] = []
    for analyzer in ('VADER', 'TextBlob'):
        columns.extend(f"{tag}_{analyzer}" for tag in tags)
    return columns

# One DataFrame per instrument, read from
# <REPO_PATH>/data/prepared_data/<instrument>_5min_resampled.csv.
# The 'date' column becomes the index and is parsed as dates.
dfs = {
    ticker: pd.read_csv(
        os.path.join(REPO_PATH, 'data', 'prepared_data', f"{ticker}_5min_resampled.csv"),
        index_col='date',
        parse_dates=True,
    )
    for ticker in FUTURES
}

### Stationarity of time series with ADF

In [None]:
# Instrument whose series are tested for stationarity.
INSTRUMENT = 'CLc1'

df = dfs[INSTRUMENT]

# Series under test: every column whose name contains '_SI', followed by the
# base columns listed here.
base_columns = ['CLOSE', 'VOLUME', 'COUNT', 'REALIZED_VOL']
SI_columns = list(df.filter(like='_SI').columns)

variables = [*SI_columns, *base_columns]

# Run the ADF test per column; only the first two entries of each result
# (reported below as test statistic and p-value) are kept.
results = {}
for name in tqdm(variables, desc='Stationarity test'):
    adf_stat, p_value = adfuller(df[name])[:2]
    results[name] = (adf_stat, p_value)

# Transpose so that each tested column becomes a row.
res_df = pd.DataFrame(results).T.set_axis(['ADF Statistic', 'p-value'], axis=1)

print(f'ADFuller test results for {INSTRUMENT} sentiment data:')
display(res_df)

### VAR Optimal lag order, Impulse response function

In [None]:
# find optimal lag order
# Response variables placed first in every VAR system.
RESPONSE: list[str] = ['VOLUME', 'REALIZED_VOL']

all_tags = [*TOPICS, *INTER_TOPICS, *CROSS_TOPICS]

# Sentiment columns for every tag, for both analyzers.
cols = sentiment_cols(all_tags)

# One panel per loaded instrument. squeeze=False keeps the result an array
# even for a single instrument, so ax[i] below is always valid.
n_panels = len(dfs)
fig, ax = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4), dpi=200, squeeze=False)
ax = ax.ravel()

# The loop variable is deliberately not called `df`, so the notebook-level
# `df` set in the stationarity cell is not silently rebound to the last instrument.
for i, (key, frame) in enumerate(tqdm(dfs.items(), desc='Lag order selection')):
    model = VAR(frame[[*RESPONSE, *cols]])
    # 30 is the value passed as select_order's first argument; trend='c' is forwarded as given.
    lag_order = model.select_order(30, trend='c')
    # NOTE(review): plot_criterion is a project helper; presumably it draws the
    # selection criteria for `key` on the given axis — confirm in utils.var_utils.
    plot_criterion(lag_order, ax[i], key)

fig.tight_layout()

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.savefig('images/lag_order.png')


### Granger causality

In [None]:
# Granger causality is computed on the CLc1 frame only.
df = dfs['CLc1']

# all cross topic columns
# (every column whose name contains the substring 'CT')
CT_colums = [name for name in df.columns if 'CT' in str(name)]

# Tested together with the base columns defined in the stationarity cell.
# The returned object is the cell's last expression, so the notebook displays it.
grangers_causation_matrix(df, [*base_columns, *CT_colums])


### Topic

In [None]:

# Sentiment analyzer whose series are used for the impulse-response plots;
# also embedded in the output file names.
ANALYZER = 'VADER'

# One panel per topic. squeeze=False keeps `axs` an array even if TOPICS is
# reduced to a single entry.
n_topics = len(TOPICS)
fig, axs = plt.subplots(1, n_topics, figsize=(5 * n_topics, 5), dpi=200, squeeze=False)
axs = axs.flatten()

for i, topic in enumerate(tqdm(TOPICS)):
    # NOTE(review): SentVAR is a project class; presumably it fits a VAR on the
    # topic's sentiment series and plot_irf draws onto the given axis — confirm
    # in utils.var_utils.
    sent_var = SentVAR(dfs, topic, ANALYZER)
    sent_var.plot_irf(axs[i])
    if i == 0:
        # The shared legend is built once, right after the first panel is
        # drawn, so only that panel's artists exist at this point (presumably
        # to avoid repeated entries — confirm against plot_irf). It is anchored
        # at a negative y, i.e. below the figure's bottom edge.
        fig.legend(
            loc='lower center',
            bbox_to_anchor=(0.5, -0.1),
            ncol=4,
            fontsize=17
        )

fig.tight_layout()

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.savefig(f'images/irf_topics_{ANALYZER}.png')


### Inter-topic

In [None]:

# 3x3 grid: one panel for each of the nine inter-topic tags.
# ANALYZER is the value set in the Topic cell.
fig, axs = plt.subplots(3, 3, figsize=(15, 15), dpi=200)
axs = axs.flatten()

for i, topic in enumerate(tqdm(INTER_TOPICS)):
    # NOTE(review): SentVAR is a project class; presumably plot_irf draws the
    # impulse responses onto the given axis — confirm in utils.var_utils.
    sent_var = SentVAR(dfs, topic, ANALYZER)
    sent_var.plot_irf(axs[i])
    if i == 0:
        # The shared legend is built once, right after the first panel is
        # drawn, so only that panel's artists exist at this point (presumably
        # to avoid repeated entries — confirm against plot_irf).
        fig.legend(
            loc='lower center',
            bbox_to_anchor=(0.5, -0.1/3),
            ncol=4,
            fontsize=15
        )

fig.tight_layout()

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.savefig(f'images/irf_inter_topics_{ANALYZER}.png')

### Cross-topic

In [None]:

# ANALYZER is the value set in the Topic cell.
fig = plt.figure(figsize=(15, 10), dpi=200)

# Five panels on a 2x6 grid, each spanning two columns: three on the top row
# and two on the bottom row, offset by one column so they sit centred.
locs = [(0,0), (0,2), (0,4), (1,1), (1,3)]
axs = [plt.subplot2grid((2,6), loc, colspan=2, fig=fig) for loc in locs]

for i, topic in enumerate(tqdm(CROSS_TOPICS)):
    # NOTE(review): SentVAR is a project class; presumably plot_irf draws the
    # impulse responses onto the given axis — confirm in utils.var_utils.
    sent_var = SentVAR(dfs, topic, ANALYZER)
    sent_var.plot_irf(axs[i])
    if i == 0:
        # The shared legend is built once, right after the first panel is
        # drawn, so only that panel's artists exist at this point (presumably
        # to avoid repeated entries — confirm against plot_irf).
        fig.legend(
            loc='lower center',
            bbox_to_anchor=(0.5, -0.1/2),
            ncol=4,
            fontsize=15
        )

fig.tight_layout()

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.savefig(f'images/irf_cross_topics_{ANALYZER}.png')