In [None]:

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings

import sys
import os
from dotenv import load_dotenv

load_dotenv()
# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation/performance warnings raised by the cells below.
warnings.filterwarnings("ignore")

# Repository root, read from the environment (a .env file is loaded above).
REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # Without this guard the literal text "None" would be interpolated into
    # every path below and surface later as a confusing "file not found".
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file"
    )
# Paths in this notebook are built by plain string concatenation
# (f'{REPO_PATH}src', f'{REPO_PATH}data...'), so make sure the root ends with
# a path separator. Joining with '' appends one only when it is missing.
REPO_PATH = os.path.join(REPO_PATH, '')
sys.path.insert(0, rf'{REPO_PATH}src')
from utils.main_utils import combload_topic_dfs


### Parameters

In [None]:
# Topic codes: one `{topic}_ARTICLE_SENTIMENT.csv` file is loaded per code, and
# each code is also used as a top-level classifier when resampling sentiment.
TOPICS = ['CRU', 'CWP', 'CEN']

# Futures identifiers: each one maps to a `{future}_High_Frequency.csv` price
# file and to one prepared output file.
FUTURES = ['LCOc1', 'CLc1']

# Sentiment score columns of the news data that are averaged per resample window.
SENTIMENT_COLUMNS: list[str] = [
    'TextBlob_fullStory', 
    'VADER_fullStory'
]

# Window passed to every `.resample(...)` call; also embedded in the output file name.
RESAMPLE_WINDOW: str = '5min'

### Import data


In [None]:
# Load news and sentiment data.
# Paths are assembled with os.path.join instead of hard-coded backslashes so
# the notebook also runs on non-Windows systems.
# NOTE(review): combload_topic_dfs is a project helper; presumably it loads one
# file per topic and, with include_topic=True, adds the 'topic' column used
# below -- confirm against utils.main_utils.
news_df: pd.DataFrame = combload_topic_dfs(
    TOPICS,
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    ),
    include_topic=True
)
news_df.index = pd.to_datetime(news_df.index)

# Build string labels for the two finer-grained classifiers.
news_df['subtopic'] = news_df['topic'] + '_' + news_df['LDA_topic'].astype(str)
news_df['crosstopic'] = 'CT_' + news_df['cross_topic'].astype(str)

# Drop headline-derived columns and the raw ids that the labels above replace.
headline_columns = [col for col in news_df.columns if 'headline' in col]
news_df = news_df.drop(columns=headline_columns + ['LDA_topic', 'cross_topic'])

# Load futures data: one raw high-frequency frame per future, keyed by identifier.
f_dfs: dict[str, pd.DataFrame] = {
    future: pd.read_csv(
        os.path.join(
            REPO_PATH, 'data', 'raw_futures_data', f'{future}_High_Frequency.csv'
        ),
        index_col=0
    ) for future in FUTURES
}

display(news_df.head())

### Resample and add measures to futures data

In [None]:
# Scale factor applied to the per-bar realized volatility. The constants are
# hard-coded; presumably 78 five-minute bars per session and 252 sessions per
# year -- TODO confirm against the instruments' trading hours.
annualization_factor = np.sqrt(78 * 252)


def _bars_from_ticks(ticks: pd.DataFrame) -> pd.DataFrame:
    """Aggregate one raw futures frame into RESAMPLE_WINDOW bars.

    Reads the CLOSE, VOLUME, COUNT and LOGRET columns of ``ticks``. The
    returned REALIZED_VOL is the square root of the summed squared log
    returns per bar (not yet annualized).
    """
    squared_returns = ticks['LOGRET'] ** 2
    close, volume, count, logret = (
        ticks[name].resample(RESAMPLE_WINDOW)
        for name in ('CLOSE', 'VOLUME', 'COUNT', 'LOGRET')
    )
    return pd.DataFrame(
        {
            'CLOSE': close.last(),
            'VOLUME': volume.sum(),
            'COUNT': count.sum(),
            'LOGRET': logret.sum(),
            'REALIZED_VOL': np.sqrt(squared_returns.resample(RESAMPLE_WINDOW).sum())
        }
    )


# Prepare the raw frames in place: datetime index plus log returns of CLOSE.
for df in f_dfs.values():
    df.index = pd.to_datetime(df.index)
    log_close = np.log(df['CLOSE'])
    df['LOGRET'] = log_close.diff()
    df.loc[df.index[0], 'LOGRET'] = 0  # the first row has no previous close

fr_dfs: dict[str, pd.DataFrame] = {
    future: _bars_from_ticks(df) for future, df in f_dfs.items()
}

# Annualize the per-bar realized volatility.
for price_df in fr_dfs.values():
    price_df.index = pd.to_datetime(price_df.index)
    price_df['REALIZED_VOL'] = annualization_factor * price_df['REALIZED_VOL']

display(fr_dfs['LCOc1'].head())

### Resample and treat news sentiment data

In [None]:
# Classification levels (columns of news_df) and the labels found in each.
classifiers = {
    'topic': TOPICS,
    'subtopic': news_df['subtopic'].unique(),
    'crosstopic': news_df['crosstopic'].unique()
}


def _mean_sentiment(level: str, label: str) -> pd.DataFrame:
    """Mean sentiment per RESAMPLE_WINDOW for articles where news_df[level] == label.

    Windows without articles are filled with 0 and every column is prefixed
    with ``'<label>_'``.
    """
    selected = news_df[level] == label
    per_column = {
        col: news_df.loc[selected, col].resample(RESAMPLE_WINDOW).mean()
        for col in SENTIMENT_COLUMNS
    }
    return pd.DataFrame(per_column).fillna(0).add_prefix(f'{label}_')


# One frame per (level, label) pair, in the iteration order of `classifiers`.
resample_dfs = [
    _mean_sentiment(level, label)
    for level, labels in classifiers.items()
    for label in labels
]

# Align all labels on a common time index; windows a label never covers become 0.
resample_df = pd.concat(resample_dfs, axis=1).fillna(0)
resample_df.columns = resample_df.columns.str.replace('_fullStory', '')

# Combine news and futures data. resample_df holds no NaN at this point, so
# dropna() only removes windows where the futures side has missing values.
combined_dfs: dict[str, pd.DataFrame] = {}
for future in FUTURES:
    combined_dfs[future] = resample_df.join(fr_dfs[future]).dropna()

for df in combined_dfs.values():
    display(df.head(2))


### Add sentiment index

In [None]:
DECAY = 288  # EWM span in bars: number of 5-minute intervals in a 24-hour day

# NOTE: dict.copy() is shallow -- `dfs` holds the very same DataFrame objects
# as `combined_dfs`. The *_SI columns added below are therefore also visible
# through `combined_dfs`, which is what the final cell reads and saves.
dfs = combined_dfs.copy()

# Topic Sentiment Index: exponentially weighted mean of every sentiment column.
for key, df in dfs.items():
    # tqdm.pandas only registers `progress_apply` on pandas objects; it does
    # not label the loop bar. Kept in case other cells rely on progress_apply.
    tqdm.pandas(desc=f'Calculating Topic SI for {key}')
    # Pass `desc` to the bar itself so each future's progress is identifiable.
    for col_name in tqdm(resample_df.columns, desc=f'Calculating Topic SI for {key}'):
        df[f'{col_name}_SI'] = df[col_name].ewm(span=DECAY).mean()


### Add temporal features and save dfs


In [None]:
def add_temporal(df: pd.DataFrame) -> None:
    """Add clock and calendar feature columns derived from the index, in place.

    Adds HOUR, MINUTE, MINUTE_OF_DAY (HOUR * 60 + MINUTE), DAY_OF_WEEK and
    DAY_OF_MONTH. The index must expose datetime attributes (``hour``,
    ``minute``, ``dayofweek``, ``day``).
    """
    stamps = df.index

    clock_fields = {'HOUR': 'hour', 'MINUTE': 'minute'}
    for column, attribute in clock_fields.items():
        df[column] = getattr(stamps, attribute)

    df['MINUTE_OF_DAY'] = df['HOUR'].mul(60).add(df['MINUTE'])

    calendar_fields = {'DAY_OF_WEEK': 'dayofweek', 'DAY_OF_MONTH': 'day'}
    for column, attribute in calendar_fields.items():
        df[column] = getattr(stamps, attribute)

def add_lags(df: pd.DataFrame, column: str, lags: int, prefix: str = 'RV_LAG') -> None:
    """Add lagged copies of ``column`` to ``df`` in place.

    Creates ``<prefix>_1`` ... ``<prefix>_<lags>``, where ``<prefix>_k`` is
    ``column`` shifted down by ``k`` rows, so the first ``k`` rows are NaN.

    Args:
        df: Frame to extend; it is modified in place.
        column: Name of the column to lag.
        lags: Number of lag columns to create; values <= 0 add nothing.
        prefix: Prefix of the new column names. The default keeps the
            original ``RV_LAG_k`` names; pass another prefix when lagging a
            different column so earlier lag columns are not overwritten.
    """
    for step in range(1, lags + 1):
        df[f'{prefix}_{step}'] = df[column].shift(step)

def add_target(df: pd.DataFrame, column: str, horizon: int, prefix: str = 'TARGET') -> None:
    """Add forward-looking copies of ``column`` to ``df`` in place.

    Creates ``<prefix>_1`` ... ``<prefix>_<horizon>``, where ``<prefix>_k``
    is ``column`` shifted up by ``k`` rows (the value ``k`` rows ahead), so
    the last ``k`` rows are NaN.

    Args:
        df: Frame to extend; it is modified in place.
        column: Name of the column to take future values from.
        horizon: Number of target columns to create; values <= 0 add nothing.
        prefix: Prefix of the new column names. The default keeps the
            original ``TARGET_k`` names.
    """
    for step in range(1, horizon + 1):
        df[f'{prefix}_{step}'] = df[column].shift(-step)

# Build the output directory with os.path.join (no hard-coded backslashes) and
# create it if needed so to_csv does not fail on a fresh checkout.
prepared_dir = os.path.join(REPO_PATH, 'data', 'prepared_data')
os.makedirs(prepared_dir, exist_ok=True)

# NOTE: the frames in combined_dfs are modified in place. Re-running this cell
# without re-running the earlier cells would compute lags/targets on already
# trimmed data and drop further rows.
for future in FUTURES:
    df = combined_dfs[future]
    add_temporal(df)
    add_lags(df, 'REALIZED_VOL', 5)

    add_target(df, 'REALIZED_VOL', 1)

    # Remove rows left incomplete by the shifts above (the first 5 rows have
    # no full lag history, the last row has no next-bar target).
    df.dropna(inplace=True)

    df.to_csv(os.path.join(prepared_dir, f'{future}_{RESAMPLE_WINDOW}_resampled.csv'))

