In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm
import warnings

import statsmodels.api as sm

from statsmodels.tsa.api import SVAR, VAR
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

import sys
import os
from dotenv import load_dotenv

load_dotenv()
# NOTE(review): this silences every warning for the whole kernel, including
# statsmodels convergence/deprecation warnings; consider a narrower filter.
warnings.filterwarnings("ignore")

# Repository root, read from the environment (optionally populated from .env).
REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # Without this check a missing variable is formatted as the string 'None'
    # into every path and only fails later with a misleading error.
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )
# Appends a trailing separator only when one is missing, so code that
# concatenates directly onto REPO_PATH keeps producing valid paths.
REPO_PATH = os.path.join(REPO_PATH, '')

# Import main utility functions
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
from utils.main_utils import combload_topic_dfs

### Load data

In [None]:
# News topic codes and futures instrument codes analysed in this notebook.
TOPICS = ['CRU', 'CWP', 'CEN']
FUTURES = ['LCOc1', 'CLc1']

# Article-level sentiment for all topics. The path callback maps a topic code
# to its CSV; os.path.join keeps the path valid on every OS instead of relying
# on hard-coded Windows backslashes.
# NOTE(review): how combload_topic_dfs combines the per-topic files is defined
# in utils.main_utils and is not visible here.
news_df = combload_topic_dfs(
    TOPICS,
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    ),
)
# The index is parsed to datetimes so it can be resampled by time further down.
news_df.index = pd.to_datetime(news_df.index)

display(news_df.head(2))


# One price frame per futures instrument, keyed by instrument code.
# os.path.join replaces the hard-coded Windows backslashes so the path also
# resolves on POSIX systems.
futures_dfs = {
    future: pd.read_csv(
        os.path.join(REPO_PATH, 'data', 'time_series', f'{future}_5min_processed.csv'),
        index_col=0,
    )
    for future in FUTURES
}

for price_df in futures_dfs.values():
    price_df.index = pd.to_datetime(price_df.index)
    # NOTE(review): the factors look like time scaling for 5-minute bars
    # (12 bars/hour, 24 hours/day, 252 trading days/year) - confirm the
    # intended units before interpreting magnitudes.
    price_df['REALIZED_VOL'] = price_df['REALIZED_VOL'] * 12 * 252 * 24
    price_df['LOGRET'] = price_df['LOGRET'] * 12 * 24

# Width of the time buckets the news data is aggregated into.
RESAMPLE_WINDOW: str = '5min'

# Sentiment score columns read from `news_df`.
SENTIMENT_COLUMNS: list[str] = [
    'TextBlob_headline',
    'VADER_headline',
    'TextBlob_fullStory',
    'VADER_fullStory',
]

# Number of articles per window. It does not depend on the topic, so it is
# computed once instead of copying `news_df` on every loop iteration.
article_counts = pd.Series(1, index=news_df.index).resample(RESAMPLE_WINDOW).sum()

# Mean sentiment per window; windows without any article are filled with 0.
# NOTE(review): `topic` is not used to select rows of `news_df`, so every topic
# currently receives the same resampled frame. If `news_df` carries a topic
# identifier it should be applied here - confirm against combload_topic_dfs.
resample_dfs = {
    topic: pd.DataFrame(
        {col: news_df[col].resample(RESAMPLE_WINDOW).mean() for col in SENTIMENT_COLUMNS}
    ).fillna(0)
    for topic in TOPICS
}

for res_df in resample_dfs.values():
    res_df['article_count'] = article_counts

def _merge_sentiment_with_prices(topic: str, future: str) -> pd.DataFrame:
    """Left-join one futures frame onto one topic's resampled news frame.

    Rows that contain any missing value after the join are dropped.
    """
    joined = resample_dfs[topic].join(futures_dfs[future])
    return joined.dropna()


# Nested mapping: combined_dfs[future][topic] -> merged sentiment/price frame.
combined_dfs = {
    future: {topic: _merge_sentiment_with_prices(topic, future) for topic in TOPICS}
    for future in FUTURES
}

display(combined_dfs['LCOc1']['CEN'].head(2))


### Stationarity of time series

In [None]:
INSTRUMENT = 'LCOc1'
TOPIC = 'CEN'

df = combined_dfs[INSTRUMENT][TOPIC]

# Series that get an Augmented Dickey-Fuller test.
variables = [*SENTIMENT_COLUMNS, 'article_count', 'CLOSE', 'VOLUME', 'COUNT', 'REALIZED_VOL']

# Only the first two entries of each adfuller result are kept
# (labelled below as the test statistic and the p-value).
results = {col: adfuller(df[col])[:2] for col in tqdm(variables)}

# One row per tested variable.
res_df = pd.DataFrame.from_dict(
    results, orient='index', columns=['ADF Statistic', 'p-value']
)

print(f'ADFuller test results for {INSTRUMENT} and {TOPIC} sentiment data:')
display(res_df)

### VAR optimal lag order

In [None]:
# find optimal lag order
INSTRUMENT = 'LCOc1'
RESPONSE: list[str] = ['VOLUME', 'REALIZED_VOL', 'article_count']
SENTIMENT_COLUMNS: list[str] = ['TextBlob_headline', 'VADER_headline', 'TextBlob_fullStory', 'VADER_fullStory']

# One colour per information criterion, in the order AIC, BIC, HQIC.
colors = ['crimson', 'navy', 'limegreen']

def plot_criterion(lag_orders, ax, name):
    """Plot AIC, BIC and HQIC against lag order and mark each minimum.

    Parameters
    ----------
    lag_orders
        Object exposing an ``ics`` mapping with the keys 'aic', 'bic' and
        'hqic', each holding one criterion value per lag order (here the
        result of ``VAR.select_order``).
    ax
        Matplotlib axes to draw on.
    name
        Text used as the legend title.
    """
    for k, ic in enumerate(['aic', 'bic', 'hqic']):
        ic_info = lag_orders.ics[ic]
        lags = range(len(ic_info))
        ax.plot(lags, ic_info, label=ic.upper(), color=colors[k], lw=0.8)

        min_ic = int(np.argmin(ic_info))
        # Marker-only format string: the previous 'ro' also set a colour that
        # conflicted with the explicit `color` keyword.
        ax.plot(min_ic, ic_info[min_ic], 'o', color=colors[k])
        # annotate the min point
        ax.annotate(
            f'{min_ic}', (min_ic, ic_info[min_ic]),
            textcoords="offset points", xytext=(0, 10),
            ha='center', fontsize=10
        )

    ax.set_xlabel('Lag order')
    ax.legend(frameon=False, loc='upper right', title=name, fontsize=10)

# One panel per topic; atleast_1d keeps indexing valid for a single topic.
fig, ax = plt.subplots(1, len(TOPICS), figsize=(4 * len(TOPICS), 4), dpi=200)
ax = np.atleast_1d(ax)

# tqdm wraps the list itself (not the enumerate iterator) so it knows the total.
for i, topic in enumerate(tqdm(TOPICS)):
    df = combined_dfs[INSTRUMENT][topic]
    model = VAR(df[[*RESPONSE, *SENTIMENT_COLUMNS]])
    # Information criteria for lag orders up to 30, with a constant term.
    lag_order = model.select_order(30, trend='c')
    plot_criterion(lag_order, ax[i], topic)

plt.tight_layout()

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
fig.savefig(rf'images/lag_order_{INSTRUMENT}.png')


### Granger causality

In [None]:
# `df_combined` is not defined by any earlier cell, so a fresh kernel raised
# NameError here. It is now built from the frames assembled above.
# NOTE(review): assumes the intended data is the INSTRUMENT/TOPIC pair
# selected earlier in the notebook - confirm.
df_combined = combined_dfs[INSTRUMENT][TOPIC]
df = df_combined[[*SENTIMENT_COLUMNS, *RESPONSE]]

def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, maxlag=12):
    """Build a matrix of minimum Granger-causality p-values for all variable pairs.

    For each ordered pair the test is run on ``data[[row, column]]`` for lags
    1..``maxlag`` and the smallest p-value (rounded to 4 decimals) is stored.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every series named in ``variables``.
    variables : sequence of str
        Column names to test against each other.
    test : str
        Key of the statistic to read from each lag's result dictionary.
    verbose : bool
        When True, print the per-lag p-values of every pair.
    maxlag : int
        Highest lag order to test.

    Returns
    -------
    pd.DataFrame
        Square frame whose columns are suffixed ``_x`` and whose rows are
        suffixed ``_y``. The diagonal holds the result of testing a series
        against itself.
    """
    size = len(variables)
    p_matrix = pd.DataFrame(np.zeros((size, size)), columns=variables, index=variables)

    for col_var in p_matrix.columns:
        for row_var in p_matrix.index:
            outcome = grangercausalitytests(
                data[[row_var, col_var]], maxlag=maxlag, verbose=False
            )
            # Results are keyed by lag order, starting at 1.
            lag_p_values = [
                round(outcome[lag][0][test][1], 4) for lag in range(1, maxlag + 1)
            ]
            if verbose:
                print(f'Y = {row_var}, X = {col_var}, P Values = {lag_p_values}')
            p_matrix.loc[row_var, col_var] = np.min(lag_p_values)

    p_matrix.columns = [var + '_x' for var in variables]
    p_matrix.index = [var + '_y' for var in variables]
    return p_matrix

grangers_causation_matrix(df, df.columns)


### VAR

In [None]:
RESPONSE: list[str] = ['VOLUME', 'REALIZED_VOL']
SENTIMENT_COLUMNS: list[str] = ['TextBlob_headline', 'VADER_headline', 'TextBlob_fullStory', 'VADER_fullStory']

def plot_response(
    df: pd.DataFrame,
    response: list[str],
    sentiment: list[str],
    response_names: list[str],
    sentiment_names: list[str],
    lags: int = 5,
    periods: int = 12,
    minutes_per_step: int = 5,
) -> plt.Figure:
    """Plot impulse responses of the response variables to sentiment shocks.

    For every sentiment column two VAR models are fitted on
    ``[sentiment column, *response]``: one with the sentiment clipped to its
    positive part and one with the absolute value of its negative part. The
    non-orthogonalised impulse responses to the sentiment column are drawn in
    a grid with one row per response variable and one column per sentiment
    column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing all ``response`` and ``sentiment`` columns.
    response : list[str]
        Column names of the response variables.
    sentiment : list[str]
        Column names of the sentiment variables used as impulses.
    response_names, sentiment_names : list[str]
        Display labels, in the same order as ``response`` / ``sentiment``.
    lags : int
        VAR lag order (default 5).
    periods : int
        Number of impulse-response steps (default 12).
    minutes_per_step : int
        Factor applied to the x tick labels, which are shown in minutes
        (default 5).

    Returns
    -------
    plt.Figure
        The assembled figure.
    """
    colors_pos = ['tab:blue', 'gray', 'gray']
    colors_neg = ['tab:red', 'gray', 'gray']
    linestyles = ['-', '--', '--']

    response = list(response)
    sentiment = list(sentiment)

    # squeeze=False keeps `axs` two-dimensional even for a single row/column.
    figure, axs = plt.subplots(
        len(response),
        len(sentiment),
        figsize=(4 * len(sentiment), 4 * len(response)),
        squeeze=False,
    )

    # Split on the columns passed in, not on the module-level SENTIMENT_COLUMNS.
    df_pos = df.copy()
    df_pos[sentiment] = df_pos[sentiment].clip(lower=0)
    df_neg = df.copy()
    df_neg[sentiment] = df_neg[sentiment].clip(upper=0).abs()

    tick_positions = np.arange(0, periods + 1, 2)

    for j, sent_col in enumerate(sentiment):
        results_pos = VAR(df_pos[[sent_col, *response]]).fit(lags)
        results_neg = VAR(df_neg[[sent_col, *response]]).fit(lags)

        # impulse response
        irf_pos = results_pos.irf(periods)
        irf_neg = results_neg.irf(periods)
        fig_pos = irf_pos.plot(orth=False, impulse=sent_col)
        fig_neg = irf_neg.plot(orth=False, impulse=sent_col)

        # The first subplot is skipped: the sentiment column is the first
        # variable of each model, so that panel is its response to itself.
        for i, (ax_pos, ax_neg) in enumerate(zip(fig_pos.axes[1:], fig_neg.axes[1:])):
            target = axs[i, j]

            # Copy the first three lines of each statsmodels panel
            # (presumably the point estimate and its two error bands).
            for k in range(3):
                target.plot(
                    ax_pos.lines[k].get_xdata(), ax_pos.lines[k].get_ydata(),
                    linestyle=linestyles[k],
                    color=colors_pos[k],
                    label='Positive sentiment' if k == 0 else None
                )
                target.plot(
                    ax_neg.lines[k].get_xdata(), ax_neg.lines[k].get_ydata(),
                    linestyle=linestyles[k],
                    color=colors_neg[k],
                    label='Negative sentiment' if k == 0 else None
                )
            target.set_title(fr'{sentiment_names[j]} $\rightarrow$ {response_names[i]}', fontsize=15)
            target.axhline(0, color='black', lw=0.5)
            target.set_xlabel('Minutes', fontsize=14)
            target.set_ylabel(response_names[i] + ' response', fontsize=14)
            target.legend(fontsize=11, frameon=False, loc='upper right')

            # Tick labels are steps converted to minutes.
            target.set_xticks(tick_positions)
            target.set_xticklabels(tick_positions * minutes_per_step)

        # The statsmodels figures were only needed for their line data; close
        # them so they are not rendered and do not accumulate in memory.
        plt.close(fig_pos)
        plt.close(fig_neg)

    figure.tight_layout(pad=1.0)

    return figure


# `df_combined` is not defined by any earlier cell, so it is built here from
# the frames assembled in the data-loading section.
# NOTE(review): assumes the intended data is the INSTRUMENT/TOPIC pair
# selected earlier in the notebook - confirm.
df_combined = combined_dfs[INSTRUMENT][TOPIC]

fig = plot_response(df_combined, RESPONSE, SENTIMENT_COLUMNS, ['Volume', 'RV'], SENTIMENT_COLUMNS)

fig.savefig('VAR_IRF.png', dpi=200)

### Impulse response function

In [None]:
# FIXME(review): the only `results` assigned in the visible notebook is the
# dict of ADF outputs from the stationarity section, which has no
# `impulse_responses` method, so this cell fails on a fresh Restart & Run All.
# The call presumably targets a fitted model object from a cell that is no
# longer present - restore that fit before this line, or drop the cell.
results.impulse_responses(10, orthogonalized=True, impulse=[1, 0]).plot(figsize=(13,3))