In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from tqdm.notebook import tqdm
import warnings
import scienceplots

import statsmodels.api as sm

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller, grangercausalitytests

import sys
import os
from dotenv import load_dotenv

load_dotenv()
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # Every path below is built from REPO_PATH; stop here with a clear message
    # instead of failing later inside os.path.join / pd.read_csv.
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Make the modules under <REPO_PATH>/src_HF importable. os.path.join inserts the
# separator when REPO_PATH has no trailing slash.
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
plt.style.use('science')

### Load data

In [None]:
FUTURES = ['CLc1', 'LCOc1']
TOPICS = ['CRU', 'CWP', 'CEN']

# One CSV per future under <REPO_PATH>/data/prepared_data, indexed by the parsed 'date' column.
prepared_dir = os.path.join(REPO_PATH, 'data', 'prepared_data')

dfs = {}
for future in FUTURES:
    csv_path = os.path.join(prepared_dir, f"{future}_5min_resampled.csv")
    dfs[future] = pd.read_csv(csv_path, index_col='date', parse_dates=True)


dfs['CLc1']


### Stationarity of time series with ADF

In [None]:
INSTRUMENT = 'LCOc1'
df = dfs[INSTRUMENT]

# Columns under test: every column whose name contains '_SI', then the base market series.
SI_columns = df.filter(like='_SI').columns.to_list()
base_columns = ['CLOSE', 'VOLUME', 'COUNT', 'REALIZED_VOL']
variables = SI_columns + base_columns

# Keep only the first two entries of each adfuller result; they are labelled
# 'ADF Statistic' and 'p-value' in the table below.
results = {
    col: adfuller(df[col])[:2]
    for col in tqdm(variables, desc='Stationarity test')
}

res_df = pd.DataFrame(results).T
res_df.columns = ['ADF Statistic', 'p-value']

print(f'ADFuller test results for {INSTRUMENT} sentiment data:')
display(res_df)

### VAR optimal lag order

In [None]:
# Settings for the lag-order search: instrument, model columns and curve colours.
INSTRUMENT = 'CLc1'
RESPONSE: list[str] = [
    'VOLUME',
    'REALIZED_VOL',
    'article_count',
]
SENTIMENT_COLUMNS: list[str] = [
    'TextBlob_headline',
    'VADER_headline',
    'TextBlob_fullStory',
    'VADER_fullStory',
]

# Three colours, one per information-criterion curve.
colors = ['crimson', 'navy', 'limegreen']

def plot_criterion(lag_orders, ax, name, palette=None):
    """Draw the AIC, BIC and HQIC curves on ``ax`` and mark each curve's minimum.

    Parameters
    ----------
    lag_orders : object with an ``ics`` mapping
        ``lag_orders.ics`` must provide the keys 'aic', 'bic' and 'hqic', each
        holding a sequence of criterion values, one per candidate lag order.
    ax : matplotlib Axes
        Axes that receives the curves, the minimum markers and the legend.
    name : str
        Text used as the legend title.
    palette : sequence of colours, optional
        One colour per criterion in the order AIC, BIC, HQIC. Defaults to the
        notebook-level ``colors`` list.
    """
    line_colors = colors if palette is None else palette

    for k, ic in enumerate(['aic', 'bic', 'hqic']):
        ic_info = lag_orders.ics[ic]
        # x positions are the 0-based positions in the criterion list
        # (assumes the first entry belongs to lag 0 - TODO confirm for other trend settings).
        lags = range(len(ic_info))
        ax.plot(lags, ic_info, label=ic.upper(), color=line_colors[k], lw=0.8)

        min_ic = int(np.argmin(ic_info))
        # Marker-only format string: the colour comes from the keyword, so the old
        # 'ro' format (which also requested red) no longer conflicts with it.
        ax.plot(min_ic, ic_info[min_ic], 'o', color=line_colors[k])
        # Write the selected lag just above the marker.
        ax.annotate(
            f'{min_ic}', (min_ic, ic_info[min_ic]),
            textcoords="offset points", xytext=(0, 10),
            ha='center', fontsize=10
        )

    ax.set_xlabel('Lag order')
    ax.legend(frameon=False, loc='upper right', title=name, fontsize=10)

# One panel per topic; squeeze=False keeps `ax` indexable even for a single topic.
fig, ax = plt.subplots(
    1, len(TOPICS), figsize=(4 * len(TOPICS), 4), dpi=200, squeeze=False
)
ax = ax.ravel()

# Wrap the list (not the enumerate object) so tqdm knows the total.
for i, topic in enumerate(tqdm(TOPICS, desc='Lag order selection')):
    # NOTE(review): dfs[INSTRUMENT] is a DataFrame loaded from CSV, so [topic] selects a
    # single column named after the topic; the multi-column selection on the next line then
    # needs a DataFrame. This looks like a leftover from a nested per-topic layout - confirm.
    topic_df = dfs[INSTRUMENT][topic]
    model = VAR(topic_df[[*RESPONSE, *SENTIMENT_COLUMNS]])
    # Compare information criteria for up to 30 lags with a constant term.
    lag_order = model.select_order(30, trend='c')
    plot_criterion(lag_order, ax[i], topic)

fig.tight_layout()

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
fig.savefig(rf'images/lag_order_{INSTRUMENT}.png')


### Granger causality

In [None]:
# Rebind the notebook-level `df` to the CLc1 frame; the statements below that read `df` use this frame.
df = dfs['CLc1']
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False, maxlag=12):
    """Build a matrix of minimum Granger-causality p-values for every ordered pair of variables.

    For each pair the test is run on ``data[[r, c]]`` and the smallest p-value
    across all tested lags (rounded to 4 decimals) is stored at row ``r``,
    column ``c``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing every name in ``variables`` as a column.
    variables : list[str]
        Column names to test against each other.
    test : str
        Key of the statistic to read from each lag's result dictionary.
    verbose : bool
        When True, print the per-lag p-values of every pair.
    maxlag : int
        Passed through to ``grangercausalitytests``.

    Returns
    -------
    pd.DataFrame
        Square frame whose columns carry an '_x' suffix and whose rows carry
        a '_y' suffix. The diagonal is fixed at 1.0.
    """
    p_matrix = pd.DataFrame(
        np.zeros((len(variables), len(variables))), columns=variables, index=variables
    )
    for c in tqdm(p_matrix.columns):
        for r in p_matrix.index:
            if r == c:
                # Testing a series against an identical copy of itself is degenerate
                # (perfectly collinear regressors); record "no evidence" without running it.
                p_matrix.loc[r, c] = 1.0
                continue
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            # Iterate over whatever lags were returned instead of assuming keys 1..maxlag.
            p_values = [round(lag_result[0][test][1], 4) for lag_result in test_result.values()]
            if verbose:
                print(f'Y = {r}, X = {c}, P Values = {p_values}')
            p_matrix.loc[r, c] = np.min(p_values)
    p_matrix.columns = [var + '_x' for var in variables]
    p_matrix.index = [var + '_y' for var in variables]
    return p_matrix

# Every column whose name contains 'CT'.
CT_colums = list(df.filter(like='CT').columns)

# Pairwise Granger matrix over the base series plus the CT columns (shown as the cell output).
granger_variables = base_columns + CT_colums
grangers_causation_matrix(df, granger_variables)


### Cross-topic 1

In [None]:
class SentVAR:
    """Fit one VAR per instrument on split sentiment and realized volatility, and plot the IRFs.

    For every frame in ``dfs`` the column ``f'{topic}_{analyzer}'`` is split
    into a 'positive' and a 'negative' series (both non-negative), a VAR is
    fitted on ['positive', 'negative', 'REALIZED_VOL'], and the non-orthogonalized
    impulse responses of 'REALIZED_VOL' are kept for plotting.

    Parameters
    ----------
    dfs : dict[str, pd.DataFrame]
        Frames keyed by instrument name. The frames are copied; the caller's
        objects are never modified.
    topic : str
        Topic prefix of the sentiment column.
    analyzer : str
        Analyzer suffix of the sentiment column.
    lags : int
        Lag order passed to ``VAR.fit``.
    irf_periods : int
        Number of periods over which the impulse response function is calculated.
    minutes_per_step : int
        Length of one observation step in minutes, used to scale the x-axis of
        ``plot_irf`` (defaults to 5; the input files are named '*_5min_resampled').

    Attributes
    ----------
    model_dict : dict
        Per instrument: 'model', 'results', 'irf', 'fig' and 'axes_list'
        (the first two axes of the IRF figure).
    """

    def __init__(
            self,
            dfs: dict[str, pd.DataFrame],
            topic: str,
            analyzer: str,
            lags: int = 18,
            irf_periods: int = 100,
            minutes_per_step: int = 5,
        ):
        # dict.copy() is shallow: the frames themselves must be copied, otherwise the
        # 'positive'/'negative' columns added below end up in the caller's DataFrames.
        self.dfs = {key: frame.copy() for key, frame in dfs.items()}
        self.topic = topic
        self.minutes_per_step = minutes_per_step

        self.sent_col = f'{topic}_{analyzer}'

        self.model_dict = {}

        for key, df in self.dfs.items():
            df['positive'], df['negative'] = self._split_sentiment(df[self.sent_col])

            # VAR model
            model = VAR(df[['positive', 'negative', 'REALIZED_VOL']])
            results = model.fit(lags)

            irf = results.irf(irf_periods)

            # The statsmodels figure is only used as a carrier for the plotted lines;
            # close exactly that figure so it is not displayed and does not accumulate.
            fig = irf.plot(
                orth=False,
                response='REALIZED_VOL'
            )
            plt.close(fig)

            # First two panels of the IRF figure; plot_irf draws them as 'Positive' and 'Negative'.
            axes_list = fig.axes[:2]

            self.model_dict[key] = {
                'model': model,
                'results': results,
                'irf': irf,
                'fig': fig,
                'axes_list': axes_list
            }

    @staticmethod
    def _split_sentiment(sentiment: pd.Series) -> tuple[pd.Series, pd.Series]:
        """Return (positive part, magnitude of negative part); all other entries, NaN included, become 0."""
        positive = sentiment.where(sentiment > 0, 0)
        negative = (-sentiment).where(sentiment < 0, 0)
        return positive, negative

    def plot_irf(self, ax) -> None:
        """Draw the stored impulse responses of every instrument on ``ax``.

        Positive-sentiment responses are green, negative ones red; the line
        style distinguishes instruments. Only one instrument contributes
        legend labels so each colour appears once in the legend.
        """
        linestyles = {'CLc1': '-', 'LCOc1': '--'}

        colors = ['Green', 'Red']
        labels = ['Positive', 'Negative']

        # Label the CLc1 lines when present, otherwise the first instrument.
        label_key = 'CLc1' if 'CLc1' in self.model_dict else next(iter(self.model_dict), None)

        for key, model_info in self.model_dict.items():
            for i, plot_ax in enumerate(model_info['axes_list']):
                # lines[0] is the first line statsmodels drew on the panel
                # (presumably the IRF point estimate - TODO confirm).
                irf_line = plot_ax.lines[0]
                ax.plot(
                    # The IRF horizon is counted in steps; convert to minutes to match the axis label.
                    np.asarray(irf_line.get_xdata()) * self.minutes_per_step,
                    irf_line.get_ydata(),
                    color=colors[i],
                    label=labels[i] if key == label_key else None,
                    # Unknown instruments fall back to a solid line instead of raising KeyError.
                    linestyle=linestyles.get(key, '-'),
                )

        ax.set_title(rf'{self.topic} $\rightarrow$ Realized Volatility', fontsize=17)
        ax.axhline(0, color='black', lw=0.5)
        ax.set_xlabel('Minutes', fontsize=16)
        ax.set_ylabel('Realized Vol', fontsize=16)
        ax.legend(frameon=False, fontsize=14)


# One panel per topic, side by side.
fig, axs = plt.subplots(1, 3, figsize=(15, 5), dpi=200)
axs = axs.ravel()

# tqdm is the first zip argument so it is the iterator that gets exhausted and closes its bar.
for topic, panel in zip(tqdm(['CRU', 'CWP', 'CEN']), axs):
    sent_var = SentVAR(dfs, topic, 'VADER')
    sent_var.plot_irf(panel)

fig.tight_layout()

### Inter-topic

In [None]:
# 3x3 grid: one row per topic, one column per sub-topic index.
fig, axs = plt.subplots(3, 3, figsize=(15, 15), dpi=200)
axs = axs.ravel()

topics = ['CRU', 'CWP', 'CEN']

# Sub-topic names '<topic>_<index>' for indices 0..2, in topic order.
expanded_topics = []
for base_topic in topics:
    for sub_index in range(3):
        expanded_topics.append(f"{base_topic}_{sub_index}")

for topic, panel in zip(tqdm(expanded_topics), axs):
    sent_var = SentVAR(dfs, topic, 'VADER')
    sent_var.plot_irf(panel)

plt.tight_layout()

### Cross-topic

In [None]:

fig = plt.figure(figsize=(15, 10), dpi=200)

# 2x6 grid with two-column-wide panels: three on the top row, two offset by one
# column on the bottom row.
grid_shape = (2, 6)
panel_origins = [(0, 0), (0, 2), (0, 4), (1, 1), (1, 3)]

axs = [
    plt.subplot2grid(shape=grid_shape, loc=origin, colspan=2, fig=fig)
    for origin in panel_origins
]
ax1, ax2, ax3, ax4, ax5 = axs

for topic, panel in zip(tqdm(['CT_0', 'CT_1', 'CT_2', 'CT_3', 'CT_4']), axs):
    sent_var = SentVAR(dfs, topic, 'VADER')
    sent_var.plot_irf(panel)

plt.tight_layout()

In [None]:

def plot_response(
        df: pd.DataFrame,
        response: list[str],
        sentiment: list[str],
        lags: int = 5,
        periods: int = 12,
        minutes_per_step: int = 5,
    ) -> "plt.Figure":
    """Plot the impulse response of each response column to each sentiment column.

    One VAR is fitted per sentiment column on ``[sent_col, *response]``. The
    resulting grid has one row per response and one column per sentiment
    column. Each panel copies the first three lines statsmodels draws for that
    response: the first is drawn solid blue and the next two dashed gray
    (presumably the point estimate and its error bands - TODO confirm).

    NOTE: a later cell in this notebook defines another ``plot_response`` with
    extra parameters, which replaces this one once that cell has run.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; it is only read.
    response : list[str]
        Response column names (grid rows).
    sentiment : list[str]
        Impulse column names (grid columns).
    lags : int
        Lag order passed to ``VAR.fit``.
    periods : int
        Horizon of the impulse response, in steps.
    minutes_per_step : int
        Minutes per step, used for the tick labels.

    Returns
    -------
    matplotlib Figure holding the grid.
    """
    line_colors = ['tab:blue', 'gray', 'gray']
    linestyles = ['-', '--', '--']

    # squeeze=False keeps `axs` two-dimensional even with a single row or column.
    figure, axs = plt.subplots(
        len(response),
        len(sentiment),
        figsize=(4 * len(sentiment), 4 * len(response)),
        squeeze=False,
    )

    tick_steps = np.arange(0, periods + 1, 2)

    for j, sent_col in enumerate(sentiment):
        results = VAR(df[[sent_col, *response]]).fit(lags)

        # impulse response
        irf = results.irf(periods)
        irf_fig = irf.plot(orth=False, impulse=sent_col)
        # The statsmodels figure only carries the line data; close it so it is
        # neither displayed nor kept alive by pyplot.
        plt.close(irf_fig)

        # axes[0] is skipped; the remaining axes are matched to `response` in order.
        for i, source_ax in enumerate(irf_fig.axes[1:]):
            panel = axs[i, j]

            # Copy each of the first three lines with its own style (the previous
            # loop drew line 0 repeatedly).
            for k, line in enumerate(source_ax.lines[:3]):
                panel.plot(
                    line.get_xdata(), line.get_ydata(),
                    linestyle=linestyles[k],
                    color=line_colors[k],
                )

            panel.set_title(fr'{sentiment[j]} $\rightarrow$ {response[i]}', fontsize=15)
            panel.axhline(0, color='black', lw=0.5)
            panel.set_xlabel('Minutes', fontsize=14)
            panel.set_ylabel(response[i], fontsize=14)

            # Tick positions are in steps; labels show minutes.
            panel.set_xticks(tick_steps)
            panel.set_xticklabels(tick_steps * minutes_per_step)

    figure.tight_layout(pad=1.0)

    return figure

# Columns whose name contains 'CT_0'.
ct0_matches = df.filter(like='CT_0')
CT_1 = ct0_matches.columns.to_list()

# Columns containing 'CT' but not 'SI'.
CT_2 = []
for col in df.columns:
    if 'SI' not in col and 'CT' in col:
        CT_2.append(col)

print(CT_2)

fig = plot_response(df, ['REALIZED_VOL', 'VOLUME', 'CLOSE'], CT_1)



### VAR

In [None]:


def plot_response(
        df: pd.DataFrame,
        response: list[str],
        sentiment: list[str],
        response_names: list[str],
        sentiment_names: list[str],
        lags: int = 5,
        periods: int = 12,
        minutes_per_step: int = 5,
    ) -> "plt.Figure":
    """Plot responses to positive and to negative sentiment shocks on a shared grid.

    The ``sentiment`` columns are split into a positive part (values clipped
    at 0 from below) and a negative part (values clipped at 0 from above, then
    made absolute). For every sentiment column one VAR is fitted on each part
    together with ``response``, and the first three lines of each statsmodels
    IRF panel are copied onto the grid (blue family for positive, red family
    for negative; the dashed gray lines are presumably the error bands -
    TODO confirm).

    Parameters
    ----------
    df : pd.DataFrame
        Source data; it is copied before clipping.
    response : list[str] or str
        Response column names (grid rows). A single name is accepted.
    sentiment : list[str] or str
        Impulse column names (grid columns). A single name is accepted.
    response_names, sentiment_names : list[str] or str
        Display names, aligned with ``response`` and ``sentiment``.
    lags : int
        Lag order passed to ``VAR.fit``.
    periods : int
        Horizon of the impulse response, in steps.
    minutes_per_step : int
        Minutes per step, used for the tick labels.

    Returns
    -------
    matplotlib Figure holding the grid.
    """
    # A bare string would otherwise be unpacked character by character below.
    if isinstance(response, str):
        response = [response]
    if isinstance(sentiment, str):
        sentiment = [sentiment]
    if isinstance(response_names, str):
        response_names = [response_names]
    if isinstance(sentiment_names, str):
        sentiment_names = [sentiment_names]
    sentiment = list(sentiment)

    colors_pos = ['tab:blue', 'gray', 'gray']
    colors_neg = ['tab:red', 'gray', 'gray']
    linestyles = ['-', '--', '--']

    # squeeze=False keeps `axs` two-dimensional even with a single row or column.
    figure, axs = plt.subplots(
        len(response),
        len(sentiment),
        figsize=(4 * len(sentiment), 4 * len(response)),
        squeeze=False,
    )

    # Split the columns that are actually modelled (the `sentiment` argument),
    # not an unrelated notebook-level column list.
    df_pos = df.copy()
    df_pos[sentiment] = df_pos[sentiment].clip(lower=0)
    df_neg = df.copy()
    df_neg[sentiment] = df_neg[sentiment].clip(upper=0).abs()

    tick_steps = np.arange(0, periods + 1, 2)

    for j, sent_col in enumerate(sentiment):
        results_pos = VAR(df_pos[[sent_col, *response]]).fit(lags)
        results_neg = VAR(df_neg[[sent_col, *response]]).fit(lags)

        # impulse response
        fig_pos = results_pos.irf(periods).plot(orth=False, impulse=sent_col)
        fig_neg = results_neg.irf(periods).plot(orth=False, impulse=sent_col)
        # The statsmodels figures only carry the line data; close them so they
        # are neither displayed nor kept alive by pyplot.
        plt.close(fig_pos)
        plt.close(fig_neg)

        # axes[0] is skipped; the remaining axes are matched to `response` in order.
        for i, (ax_pos, ax_neg) in enumerate(zip(fig_pos.axes[1:], fig_neg.axes[1:])):
            panel = axs[i, j]

            for k, (pos_line, neg_line) in enumerate(zip(ax_pos.lines[:3], ax_neg.lines[:3])):
                panel.plot(
                    pos_line.get_xdata(), pos_line.get_ydata(),
                    linestyle=linestyles[k],
                    color=colors_pos[k],
                    label='Positive sentiment' if k == 0 else None
                )
                panel.plot(
                    neg_line.get_xdata(), neg_line.get_ydata(),
                    linestyle=linestyles[k],
                    color=colors_neg[k],
                    label='Negative sentiment' if k == 0 else None
                )
            panel.set_title(fr'{sentiment_names[j]} $\rightarrow$ {response_names[i]}', fontsize=15)
            panel.axhline(0, color='black', lw=0.5)
            panel.set_xlabel('Minutes', fontsize=14)
            panel.set_ylabel(response_names[i] + ' response', fontsize=14)
            panel.legend(fontsize=11, frameon=False, loc='upper right')

            # Tick positions are in steps; labels show minutes.
            panel.set_xticks(tick_steps)
            panel.set_xticklabels(tick_steps * minutes_per_step)

    figure.tight_layout(pad=1.0)

    return figure

SICT_columns = df.filter(like='SICT').columns.to_list()

# `response` must be a list of column names: a bare string is unpacked into single
# characters inside plot_response. The two display names ('Volume', 'RV') need two
# response columns, and the impulse columns must not repeat a response column.
# NOTE(review): responses VOLUME/REALIZED_VOL and impulses = CT columns are inferred
# from the display names - confirm this is the intended specification.
fig = plot_response(
    df,
    ['VOLUME', 'REALIZED_VOL'],
    CT_colums,
    ['Volume', 'RV'],
    CT_colums,
)

fig.savefig(rf'VAR_IRF.png', dpi=200)

### Impulse response function

In [None]:
# NOTE(review): in the visible notebook the only module-level `results` is the dict filled in the
# ADF stationarity cell; every other `results` is local to SentVAR.__init__ or plot_response.
# This line therefore relies on a fitted-model object left in the kernel by a cell that is not
# present here - TODO: fit the intended model explicitly in this cell before calling it.
results.impulse_responses(10, orthogonalized=True, impulse=[1, 0]).plot(figsize=(13,3))