In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from arch import arch_model
import yfinance as yf
from tqdm import tqdm

import sys
import os
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) so REPO_PATH can be read
# from the environment below.
load_dotenv()
# Root of the repository. It is concatenated directly with 'src' and
# 'data\...' in this notebook, so the value must already end with a path
# separator.
# NOTE(review): os.getenv returns None when REPO_PATH is unset, which would
# turn the paths below into 'Nonesrc' / 'Nonedata\...' - confirm the .env
# file defines it.
REPO_PATH = os.getenv("REPO_PATH")

# Import main utility functions
# The src directory is put first on sys.path so that `utils` resolves to the
# project module rather than any other installed package of the same name.
sys.path.insert(0, rf'{REPO_PATH}src')
# NOTE(review): star-import hides which names come from utils; none of the
# cells visible here obviously use one - confirm before narrowing it.
from utils import *

### Trading days

In [None]:
# Build a calendar-day frame that flags which days appear in the daily
# LCOc1 file (trading days) and which do not.
daily_df = pd.read_csv(rf'{REPO_PATH}data\time_series\LCOc1.csv')
daily_df.index = pd.to_datetime(daily_df['Date'])

# Every row that exists in the source file is a trading day.
daily_df['TRADING_DAY'] = True

# Expand to a gap-free daily index. Days that were absent from the file are
# inserted as all-NaN rows.
daily_df = daily_df.resample('D').asfreq()

# Flag only the inserted days as non-trading. Filling the whole frame with
# False would also overwrite the price columns (and 'Date') on those days,
# turning missing prices into False/0; here they stay NaN.
daily_df['TRADING_DAY'] = daily_df['TRADING_DAY'].notna()

daily_df['TRADING_DAY']

### Load data

In [None]:
# Load the scored news articles and the high-frequency LCOc1 futures bars.
sentiment_df = pd.read_csv(rf'{REPO_PATH}data\news\SENTIMENT_ALL_NEWS.csv')
futures_df = pd.read_csv(rf'{REPO_PATH}data\time_series\LCOc1_High_Frequency.csv')

# remove all news articles before 2023-04-14
# The filter compares the raw timestamp strings, as before. The result is
# copied and renamed without inplace=True so that the column assignment below
# writes to an independent frame instead of a slice of the raw data
# (avoids SettingWithCopyWarning / chained-assignment ambiguity).
sentiment_df = (
    sentiment_df.loc[sentiment_df['versionCreated'] >= '2023-04-14']
    .copy()
    .rename(columns={'versionCreated': 'Date'})
)

# Normalise article timestamps to naive UTC. utc=True makes the parse succeed
# for naive or mixed-offset strings as well as uniformly tz-aware ones; for
# uniformly tz-aware input the result is unchanged.
sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'], utc=True).dt.tz_convert(None)

# Parse bar timestamps and enforce chronological order with a fresh 0..n-1
# index: the log-return diff below and any positional "first bar at/after t"
# lookup are only meaningful on time-ordered rows. This is a no-op when the
# file is already ascending.
futures_df['Date'] = pd.to_datetime(futures_df['Date'])
futures_df = futures_df.sort_values('Date', kind='stable').reset_index(drop=True)

# Bar-to-bar log returns; the first row has no previous close and is NaN.
futures_df['LOGRET'] = np.log(futures_df['CLOSE']).diff()

# GARCH(1,1) fitted on the non-missing returns.
gm = arch_model(futures_df['LOGRET'].dropna(), vol='GARCH', p=1, q=1)
gm_fit = gm.fit(disp='off')

# The fitted volatility is assigned by index alignment, so rows that were
# dropped before fitting (the first one) remain NaN in 'GARCH'.
futures_df['GARCH'] = gm_fit.conditional_volatility



In [None]:


# Half-width of the event window, counted in rows (bars) of futures_df.
horizon = 100

# Time-ordered copies of the bar timestamps and the volatility series.
# Sorting here (stable, no-op on already ascending data) is what makes the
# binary search below valid.
futures_sorted = futures_df.sort_values('Date', kind='stable')
bar_times = futures_sorted['Date'].to_numpy()
garch_values = futures_sorted['GARCH'].to_numpy(dtype=float)
n_bars = len(garch_values)

# For every article: position of the first bar whose timestamp is >= the
# article's timestamp. One vectorised binary search replaces a full boolean
# scan of futures_df per article. Articles later than the last bar get
# position n_bars instead of raising IndexError.
event_positions = np.searchsorted(
    bar_times, sentiment_df['Date'].to_numpy(), side='left'
)

# Row offsets relative to the event bar: -horizon .. horizon-1.
offsets = np.arange(-horizon, horizon)

vol_dict = {}

for story_id, position in tqdm(
    zip(sentiment_df['storyId'], event_positions), total=sentiment_df.shape[0]
):
    window_rows = position + offsets

    # Offsets that fall before the first or after the last bar are left as
    # NaN. A plain slice would wrap around on a negative start or come back
    # shorter than 2*horizon, which breaks the DataFrame construction below.
    inside = (window_rows >= 0) & (window_rows < n_bars)
    window = np.full(offsets.shape, np.nan)
    window[inside] = garch_values[window_rows[inside]]

    # Keyed by storyId: a repeated storyId keeps its last window.
    vol_dict[story_id] = window

# One row per story, one column per offset around the event.
res_df = pd.DataFrame(vol_dict, index=range(-horizon, horizon)).T

display(res_df)

In [None]:
# Overlay every story's volatility window and the cross-story average.
fig, ax = plt.subplots(figsize=(12, 8))

# After transposing, each column is one story and each row one offset
# around the event.
windows = res_df.T
windows.plot(ax=ax, legend=False, linewidth=0.2)

# Average across stories at each offset, drawn on top in black.
windows.mean(axis=1).plot(ax=ax, color='black', linewidth=2, label='Mean')

# Dashed vertical marker at offset 0 (the event bar).
ax.axvline(x=0, linestyle='--', linewidth=0.5, color='black')

In [None]:
# Split stories into strongly negative / strongly positive groups using
# symmetric polarity thresholds of -0.5 and 0.5.
polarity = sentiment_df['polarity']
negative = sentiment_df.loc[polarity < -0.5, 'storyId']
positive = sentiment_df.loc[polarity > 0.5, 'storyId']

print('number of negative news:', len(negative))
print('number of positive news:', len(positive))

# Average volatility window per group, drawn in the order
# negative -> positive -> all stories.
for group_windows, line_color, line_label in (
    (res_df.loc[negative], 'red', 'Negative'),
    (res_df.loc[positive], 'green', 'Positive'),
    (res_df, 'black', 'Mean'),
):
    group_windows.T.mean(axis=1).plot(lw=1, color=line_color, label=line_label)

# Dashed vertical marker at offset 0 (the event bar).
plt.axvline(0, color='black', lw=0.5, ls='--')

plt.legend()
# NOTE(review): the x values are row offsets into the high-frequency file;
# the 'minutes' unit assumes one row per minute - confirm.
plt.xlabel('Time [minutes]')
plt.ylabel('GARCH volatility')