In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings

import sys
import os
from dotenv import load_dotenv

# Read variables from a local .env file into the process environment.
load_dotenv()
# Silence every warning category for the whole notebook session.
warnings.filterwarnings("ignore")

# REPO_PATH is used as the path prefix for the helper package and data files.
# It is kept exactly as read from the environment.
REPO_PATH = os.getenv("REPO_PATH")
if REPO_PATH is None:
    # Without this check the prefix would silently become the text "None"
    # and the failure would surface later as a confusing import/file error.
    raise RuntimeError(
        "Environment variable REPO_PATH is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )

# Import main utility functions
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
from utils.main_utils import combload_topic_dfs

### Import data


In [None]:
# Notebook-wide configuration: news topic codes, futures identifiers and the
# bar width used for every resample below.
TOPICS = ['CRU', 'CWP', 'CEN']
FUTURES = ['LCOc1', 'CLc1']
RESAMPLE_WINDOW: str = '5min'

# load news and sentiment data
# The path is assembled with os.path.join so it does not depend on a
# Windows-style backslash separator.
# NOTE(review): combload_topic_dfs is defined outside this notebook; it is
# presumably calling the lambda once per topic to obtain a CSV path - confirm.
news_df = combload_topic_dfs(
    TOPICS,
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    )
)
# Ensure the index is datetime-typed so it can be resampled.
news_df.index = pd.to_datetime(news_df.index)

# load futures data
# One raw high-frequency frame per future, keyed by the future's identifier.
# Paths are built with os.path.join so they work with any path separator.
f_dfs = {
    future: pd.read_csv(
        os.path.join(
            REPO_PATH, 'data', 'raw_futures_data', f'{future}_High_Frequency.csv'
        ),
        index_col=0
    ) for future in FUTURES
}

for df in f_dfs.values():
    # Datetime index is required for the resampling done later.
    df.index = pd.to_datetime(df.index)
    # Log return between consecutive observations of CLOSE.
    df['LOGRET'] = np.log(df['CLOSE']).diff()
    # The first observation has no predecessor, so its diff is NaN;
    # record it as a zero return instead.
    df.loc[df.index[0], 'LOGRET'] = 0

# Scaling constants, applied below in the same left-to-right order as the
# literal factors they replace.
BARS_PER_HOUR = 12            # 5-minute bars in one hour
HOURS_PER_DAY = 24
TRADING_DAYS_PER_YEAR = 252   # presumably trading days per year - confirm

# Aggregate every raw futures frame into RESAMPLE_WINDOW bars.
fr_dfs = {}
for future_name, raw_df in f_dfs.items():
    squared_returns = raw_df['LOGRET'] ** 2
    fr_dfs[future_name] = pd.DataFrame(
        {
            # last observed close inside the bar
            'CLOSE': raw_df['CLOSE'].resample(RESAMPLE_WINDOW).last(),
            # totals over the bar
            'VOLUME': raw_df['VOLUME'].resample(RESAMPLE_WINDOW).sum(),
            'COUNT': raw_df['COUNT'].resample(RESAMPLE_WINDOW).sum(),
            'LOGRET': raw_df['LOGRET'].resample(RESAMPLE_WINDOW).sum(),
            # square root of the summed squared log returns within the bar
            'REALIZED_VOL': np.sqrt(squared_returns.resample(RESAMPLE_WINDOW).sum()),
        }
    )

# Rescale the per-bar quantities.
# NOTE(review): REALIZED_VOL is a square-root quantity but is scaled linearly
# by the number of periods here, and LOGRET uses a different factor - confirm
# that both scalings are intended.
for price_df in fr_dfs.values():
    price_df.index = pd.to_datetime(price_df.index)
    price_df['REALIZED_VOL'] = (
        price_df['REALIZED_VOL'] * BARS_PER_HOUR * TRADING_DAYS_PER_YEAR * HOURS_PER_DAY
    )
    price_df['LOGRET'] = price_df['LOGRET'] * BARS_PER_HOUR * HOURS_PER_DAY

# Sentiment score columns of news_df that are averaged per bar.
# (RESAMPLE_WINDOW is already defined at the top of this cell, so it is not
# redefined here.)
SENTIMENT_COLUMNS: list[str] = [
    'TextBlob_headline',
    'VADER_headline',
    'TextBlob_fullStory',
    'VADER_fullStory'
]

# Mean sentiment per RESAMPLE_WINDOW bar; bars without a value are filled with 0.
# NOTE(review): news_df is not filtered by `topic` here, so every topic key
# receives a frame built from the full news_df - confirm this is intended.
resample_dfs = {
    topic: pd.DataFrame(
        {
            col: news_df[col].resample(RESAMPLE_WINDOW).mean() for col in SENTIMENT_COLUMNS
        }
    ).fillna(0) for topic in TOPICS
}

# Number of news rows falling in each bar. The value does not depend on the
# topic, so it is computed once instead of copying news_df for every topic.
article_counts = pd.Series(1, index=news_df.index).resample(RESAMPLE_WINDOW).sum()

for res_df in resample_dfs.values():
    res_df['article_count'] = article_counts

# Build combined_dfs[future][topic]: the topic's sentiment bars left-joined
# with the future's price bars on the index, keeping only rows without NaN.
combined_dfs = {}
for future in FUTURES:
    price_bars = fr_dfs[future]
    frames_by_topic = {}
    for topic in TOPICS:
        joined = resample_dfs[topic].join(price_bars)
        frames_by_topic[topic] = joined.dropna()
    combined_dfs[future] = frames_by_topic

# Visual sanity check of one future/topic combination.
display(combined_dfs['CLc1']['CEN'])


### Add temporal features and save dfs


In [None]:
def add_temporal(df: pd.DataFrame) -> None:
    """Add calendar/clock feature columns derived from the index, in place.

    Columns written: HOUR, MINUTE, MINUTE_OF_DAY (HOUR * 60 + MINUTE),
    DAY_OF_WEEK and DAY_OF_MONTH. The index must expose the datetime
    attributes ``hour``, ``minute``, ``dayofweek`` and ``day``.

    Returns None; ``df`` itself is modified.
    """
    # Clock components first: MINUTE_OF_DAY is derived from them.
    for column_name, index_attr in (('HOUR', 'hour'), ('MINUTE', 'minute')):
        df[column_name] = getattr(df.index, index_attr)

    df['MINUTE_OF_DAY'] = df['HOUR'] * 60 + df['MINUTE']

    # Calendar components.
    for column_name, index_attr in (('DAY_OF_WEEK', 'dayofweek'), ('DAY_OF_MONTH', 'day')):
        df[column_name] = getattr(df.index, index_attr)

def add_lags(df, column, lags: int) -> None:
    """Add lagged copies of ``column`` to ``df`` in place.

    For every k in 1..lags a column named ``RV_LAG_k`` is written holding
    ``df[column]`` shifted down by k rows, so the first k rows are NaN.
    The ``RV_LAG_`` prefix is fixed regardless of ``column``.

    Returns None; ``df`` itself is modified.
    """
    source = column
    for steps_back in range(1, lags + 1):
        df[f'RV_LAG_{steps_back}'] = df[source].shift(steps_back)

def add_target(df, column, horizon: int) -> None:
    """Add forward-looking target columns for ``column`` to ``df`` in place.

    For every k in 1..horizon a column named ``TARGET_k`` is written holding
    ``df[column]`` shifted up by k rows (the value k rows ahead), so the
    last k rows are NaN.

    Returns None; ``df`` itself is modified.
    """
    source = column
    for steps_ahead in range(1, horizon + 1):
        df[f'TARGET_{steps_ahead}'] = df[source].shift(-steps_ahead)

# Output directory for the prepared CSV files. It is built with os.path.join
# so it does not depend on a Windows-style backslash separator, and created
# up front so to_csv does not fail on a missing folder.
PREPARED_DIR = os.path.join(REPO_PATH, 'data', 'prepared_data')
os.makedirs(PREPARED_DIR, exist_ok=True)

# NOTE: the frames stored in combined_dfs are modified in place (new columns
# are added and rows with NaN are dropped). Re-running this cell without
# first rebuilding combined_dfs recomputes the lag/target columns on the
# already-trimmed frames and drops further rows.
for future in FUTURES:
    for topic in TOPICS:
        df = combined_dfs[future][topic]
        add_temporal(df)
        # five lagged realised-volatility features, one-step-ahead target
        add_lags(df, 'REALIZED_VOL', 5)
        add_target(df, 'REALIZED_VOL', 1)

        # The shifts above leave NaN at the start (lags) and end (target).
        df.dropna(inplace=True)

        df.to_csv(
            os.path.join(
                PREPARED_DIR, f'{future}_{topic}_{RESAMPLE_WINDOW}_resampled.csv'
            )
        )

