In [None]:
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PolyCollection
import seaborn as sns
from tqdm.notebook import tqdm
import scienceplots

from statsmodels.graphics.tsaplots import plot_acf

import os
import sys
import warnings
from dotenv import load_dotenv

# Read variables from a local .env file (if any) and make the repository's
# `src` directory importable for the project imports that follow.
load_dotenv()
REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # Without this guard every f-string path built from REPO_PATH would start
    # with the literal text "None" and fail much later with a confusing
    # import or file-not-found error.
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )
# os.path.join works whether or not REPO_PATH ends with a path separator.
sys.path.insert(0, os.path.join(REPO_PATH, 'src'))

from utils.main_utils import combload_topic_dfs, apply_nb_style
from utils.eval_utils import describe_df

# Notebook-wide presentation settings. Order is kept as in the original:
# the project helper runs first, then the 'science' matplotlib style sheet.
apply_nb_style()
plt.style.use('science')

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# Show floats in displayed pandas objects with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

### Parameters

In [None]:
# News topic codes; one article-sentiment file is loaded per code and the
# codes are also used as grouping labels when resampling sentiment.
TOPICS = ['CRU', 'CWP', 'CEN']

# Futures identifiers; one high-frequency CSV is read per entry.
FUTURES = ['CLc1', 'LCOc1']

# Sentiment score columns of the news data that are averaged per window.
SENTIMENT_COLUMNS = ['TextBlob_fullStory', 'VADER_fullStory']

# Bar width (pandas offset alias) used to resample futures and news data.
RESAMPLE_WINDOW = '5min'

### Import data


In [None]:
# Load the per-topic article sentiment files into a single frame. Paths are
# assembled with os.path.join instead of hard-coded backslashes so the
# notebook is not tied to Windows path separators.
news_df: pd.DataFrame = combload_topic_dfs(
    TOPICS,
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    ),
    include_topic=True
)
news_df.index = pd.to_datetime(news_df.index)

# Derived grouping labels: "<topic>_<LDA_topic>" and "CT_<cross_topic>".
news_df['subtopic'] = news_df['topic'] + '_' + news_df['LDA_topic'].astype(str)
news_df['crosstopic'] = 'CT_' + news_df['cross_topic'].astype(str)

# Drop every column whose name contains 'headline' plus the two raw label
# columns, which are now encoded in 'subtopic' / 'crosstopic'.
news_df = news_df.drop(
    columns=[col for col in news_df.columns if 'headline' in col] + ['LDA_topic', 'cross_topic']
)

# One raw high-frequency frame per future, indexed by the first CSV column
# (converted to datetimes in the resampling step).
f_dfs: dict[str, pd.DataFrame] = {
    future: pd.read_csv(
        os.path.join(
            REPO_PATH, 'data', 'raw_futures_data', f'{future}_High_Frequency.csv'
        ),
        index_col=0
    ) for future in FUTURES
}

display(news_df.head())

### Resample and add measures to futures data

In [None]:
# Scale factor applied to the per-bar realized volatility.
# NOTE(review): 276 * 252 presumably means 276 five-minute bars per trading
# day times 252 trading days per year — confirm. The value is fixed and is
# not derived from RESAMPLE_WINDOW.
ANNUALIZATION_FACTOR = np.sqrt(276 * 252)

# Log returns on the raw observations. The first row has no predecessor, so
# its return is set to 0 rather than left as NaN.
for raw in f_dfs.values():
    raw.index = pd.to_datetime(raw.index)
    raw['LOGRET'] = np.log(raw['CLOSE']).diff()
    raw.loc[raw.index[0], 'LOGRET'] = 0

# Aggregate every future into RESAMPLE_WINDOW bars: last close, summed volume,
# summed count, summed log return, and realized volatility (square root of the
# summed squared log returns, scaled by ANNUALIZATION_FACTOR).
fr_dfs: dict[str, pd.DataFrame] = {}
for future, raw in f_dfs.items():
    squared_returns = raw['LOGRET'] ** 2
    bar_vol = np.sqrt(squared_returns.resample(RESAMPLE_WINDOW).sum())
    fr_dfs[future] = pd.DataFrame(
        {
            'CLOSE': raw['CLOSE'].resample(RESAMPLE_WINDOW).last(),
            'VOLUME': raw['VOLUME'].resample(RESAMPLE_WINDOW).sum(),
            'COUNT': raw['COUNT'].resample(RESAMPLE_WINDOW).sum(),
            'LOGRET': raw['LOGRET'].resample(RESAMPLE_WINDOW).sum(),
            'REALIZED_VOL': bar_vol * ANNUALIZATION_FACTOR,
        }
    )

# Summary statistics per future: daily summed log returns (times 100) and the
# per-bar realized volatility, placed side by side with the future as the
# outer column key.
log_list, rev_list = [], []
for bars in fr_dfs.values():
    daily_logret_pct = bars['LOGRET'].resample('1D').sum() * 100
    log_list.append(describe_df(daily_logret_pct).T)
    rev_list.append(describe_df(bars['REALIZED_VOL']).T)

desc_df = pd.concat(
    [
        pd.concat(stats, axis=1, keys=fr_dfs.keys())
        for stats in (log_list, rev_list)
    ],
    axis=1
)

display(desc_df)


### Autocorrelation

In [None]:
# Autocorrelation of CLc1 realized volatility at two sampling frequencies,
# one panel each.
fig, axs = plt.subplots(1, 2, figsize=(10, 5), dpi=200)

colors = sns.color_palette('twilight', 2)

plot_content = {
    # Daily realized volatility rebuilt from the raw log returns
    # ('1D' alias, consistent with the daily resampling used for the stats).
    'Daily': np.sqrt((f_dfs['CLc1']['LOGRET'] ** 2).resample('1D').sum()),
    '5min': fr_dfs['CLc1']['REALIZED_VOL']
}

for i, (key, series) in enumerate(plot_content.items()):
    plot_acf(
        series,
        ax=axs[i],
        color=colors[i],
        lags=20,
        vlines_kwargs={'colors': colors[i]}
    )
    # Recolour the filled confidence band. isinstance (not an exact type
    # comparison) is required because the band may be an instance of a
    # PolyCollection subclass, which `type(item) == PolyCollection` skips.
    for item in axs[i].collections:
        if isinstance(item, PolyCollection):
            item.set_facecolor(colors[i])

    axs[i].set_title(f'{key} Realized Volatility ACF', fontsize=16)
    axs[i].set_xlabel('Lags', fontsize=16)
    axs[i].set_ylabel('Correlation', fontsize=16)
    axs[i].grid(alpha=0.3)

fig.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)
fig.savefig('images/acf_realized_volatility.png')

print(f'Daily ACF(1): {plot_content["Daily"].autocorr(1):.4f}')
print(f'5min  ACF(1): {plot_content["5min"].autocorr(1):.4f}')

### Resample and treat news sentiment data

In [None]:
# Grouping column of news_df -> the labels to aggregate sentiment by.
classifiers = {
    'topic': TOPICS,
    'subtopic': news_df['subtopic'].unique(),
    'crosstopic': news_df['crosstopic'].unique()
}

# For every (grouping column, label) pair: mean of each sentiment column per
# RESAMPLE_WINDOW over the matching articles. Windows without articles are
# filled with 0 and the columns are prefixed with the label.
resample_dfs = [
    pd.DataFrame(
        {
            col: news_df.loc[news_df[key] == label, col].resample(RESAMPLE_WINDOW).mean()
            for col in SENTIMENT_COLUMNS
        }
    ).fillna(0).add_prefix(f'{label}_')
    for key, labels in classifiers.items()
    for label in labels
]

# Align all label frames on one index; gaps introduced by the alignment are
# filled with 0 as well. The '_fullStory' suffix is stripped from the names.
resample_df = pd.concat(resample_dfs, axis=1).fillna(0)
resample_df.columns = resample_df.columns.str.replace('_fullStory', '')

# combine news and futures data; rows with any missing value after the join
# are dropped
combined_dfs: dict[str, pd.DataFrame] = {
    future: resample_df.join(fr_dfs[future]).dropna()
    for future in FUTURES
}

for combined in combined_dfs.values():
    display(combined.head(2))


### Add sentiment index

In [None]:
DECAY = 288 # number of 5-minute intervals

# Shallow copy: `dfs` is a new dict, but its values are the very same
# DataFrame objects as in `combined_dfs`, so the *_SI columns added below
# are visible through `combined_dfs` too.
dfs = combined_dfs.copy()
for key, df in dfs.items():
    # The description is passed to the bar that is actually displayed;
    # `tqdm.pandas(desc=...)` would only label `progress_apply` calls, which
    # this loop never makes.
    for col_name in tqdm(resample_df.columns, desc=f'Calculating Topic SI for {key}'):
        # Sentiment index: exponentially weighted mean with span DECAY.
        df[f'{col_name}_SI'] = df[col_name].ewm(span=DECAY).mean()


### Add temporal features, lags and target, then save the DataFrames


In [None]:
def add_temporal(df: pd.DataFrame) -> None:
    """Add calendar and clock features taken from the index, in place.

    Writes the columns HOUR, MINUTE, DAY_OF_WEEK, DAY_OF_MONTH, MONTH and
    YEAR (in that order) from the matching attributes of ``df.index``, which
    therefore has to be datetime-like. Nothing is returned.
    """
    index_attributes = {
        'HOUR': 'hour',
        'MINUTE': 'minute',
        'DAY_OF_WEEK': 'dayofweek',
        'DAY_OF_MONTH': 'day',
        'MONTH': 'month',
        'YEAR': 'year',
    }
    for feature, attribute in index_attributes.items():
        df[feature] = getattr(df.index, attribute)

def add_lags(df, column, lags: int) -> None:
    """Append lagged copies of ``column`` to ``df`` in place.

    Creates RV_LAG_1 ... RV_LAG_<lags>; RV_LAG_k holds the value of
    ``column`` from k rows earlier, so its first k rows are NaN. The column
    prefix is always ``RV_LAG_``, whatever ``column`` is. Nothing is returned.
    """
    for step in range(1, lags + 1):
        df[f'RV_LAG_{step}'] = df[column].shift(step)

def add_target(df, column, horizon: int) -> None:
    """Append forward-shifted copies of ``column`` to ``df`` in place.

    Creates TARGET_1 ... TARGET_<horizon>; TARGET_k holds the value of
    ``column`` from k rows later, so its last k rows are NaN. Nothing is
    returned.
    """
    for step in range(1, horizon + 1):
        df[f'TARGET_{step}'] = df[column].shift(-step)

# Output directory for the prepared data sets. Built with os.path.join (no
# hard-coded backslashes) and created up front, since to_csv does not create
# missing directories.
output_dir = os.path.join(REPO_PATH, 'data', 'prepared_data')
os.makedirs(output_dir, exist_ok=True)

for future in FUTURES:
    # The frame is modified in place, so `combined_dfs[future]` carries the
    # new columns afterwards. Caveat: because rows are dropped in place,
    # re-running this cell without rebuilding `combined_dfs` recomputes the
    # lags/target on the already trimmed frame and trims it again.
    df = combined_dfs[future]
    add_temporal(df)
    add_lags(df, 'REALIZED_VOL', 5)

    add_target(df, 'REALIZED_VOL', 1)

    # Lag and target columns are NaN at the edges; drop those rows.
    df.dropna(inplace=True)

    df.to_csv(
        os.path.join(output_dir, f'{future}_{RESAMPLE_WINDOW}_resampled.csv')
    )

