In [None]:
from julia import Main

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import stats
from arch import arch_model
from typing import List, Dict, Tuple

import sys
from textblob import TextBlob
import yfinance as yf

from sent_utils import *

# Evaluate sent_index.jl inside the Julia `Main` module; the data-loading cell
# below hands `Main.sent_index` to textblob_sentiment_df.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory containing sent_index.jl — confirm.
Main.include(r'sent_index.jl')

# Import main utility functions
# The src directory is pushed to the front of sys.path before the star-import.
# NOTE(review): absolute, machine-specific path; presumably main_utils lives
# there — confirm, and consider making the location configurable.
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
from main_utils import *

### Sentiment Analysis Testing

Sentiment analysis functions should take a pandas Series of text as input and return a pandas Series of the same length containing the sentiment of each input text. The sentiment should be a float between -1 and 1, where -1 is negative, 0 is neutral, and 1 is positive.

In [None]:
# Read the TG and NYT news files
df_TG = load_news_df('TG_CrudeANDOil.csv')
df_NYT = load_news_df('NYT_CrudeANDOil.csv')

# Keep only the NYT rows whose 'headline' value is a str
headline_is_str = df_NYT['headline'].apply(lambda value: isinstance(value, str))
df_NYT = df_NYT[headline_is_str]

# Build one sentiment frame per source, using the Julia sent_index and frequency 'D'
df_sent_TG = textblob_sentiment_df(df_TG, Main.sent_index, frequency='D')
df_sent_NYT = textblob_sentiment_df(df_NYT, Main.sent_index, frequency='D')

# Preview the first rows of both sentiment frames
display(df_sent_TG.head(), df_sent_NYT.head())

In [None]:
# Load the CL=F price history and index it by timezone-naive timestamps.
# NOTE(review): the joins below are index-based, so this presumably has to match
# the index of the sentiment frames — confirm against textblob_sentiment_df.
price_df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\time_series\YahooFinance\CL=F_20years.csv')

price_df.index = pd.to_datetime(price_df['Date']).dt.tz_localize(None)
price_df['Log Return'] = np.log(price_df['Adj Close']).diff()
# Rolling volatility: standard deviation of the log returns over a 21-row window
price_df['Volatility'] = price_df['Log Return'].rolling(21).std()

# GARCH(1,1) fitted on the log returns; its conditional volatility becomes a column
garch = arch_model(price_df['Log Return'].dropna(), vol='Garch', p=1, q=1)
res = garch.fit(disp='off')

price_df['GARCH'] = res.conditional_volatility

# Price columns attached to each sentiment frame
price_cols = ['Volatility', 'Adj Close', 'Volume', 'GARCH', 'Log Return']

# Left-join prices onto each sentiment frame, forward-fill the gaps and drop
# rows that are still incomplete. `.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling.
combined_df_TG = df_sent_TG.join(price_df[price_cols], how='left')
combined_df_TG = combined_df_TG.ffill().dropna()

combined_df_NYT = df_sent_NYT.join(price_df[price_cols], how='left')
combined_df_NYT = combined_df_NYT.ffill().dropna()

display(combined_df_TG)

x_label = 'SV'
y_label = 'GARCH'


def _fit_and_plot(df, source, x_col, y_col, ax):
    """Regress ``y_col`` on ``x_col`` for one source, print the fit and draw it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    source : str
        Short source tag used in the printed correlation line.
    x_col, y_col : str
        Column names of the regressor and the response.
    ax : matplotlib.axes.Axes
        Axes receiving the scatter plot and the fitted line.

    Returns
    -------
    The ``scipy.stats.linregress`` result for this frame.
    """
    fit = stats.linregress(df[x_col], df[y_col])
    print(f'p-value: {fit.pvalue:.7f}')
    print(fit.rvalue)
    print(fit.slope)
    print(f'Correlation {source}: {df[x_col].corr(df[y_col])}')

    df.plot(kind='scatter', x=x_col, y=y_col, s=2, ax=ax)
    # The fitted line is drawn over this frame's own x values, so each panel
    # shows the regression that belongs to the data in that panel.
    ax.plot(df[x_col], fit.slope * df[x_col] + fit.intercept, color='red')
    ax.grid(alpha=0.2)
    ax.set_title(f'{x_col} vs {y_col}')
    return fit


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 5))

_fit_and_plot(combined_df_TG, 'TG', x_label, y_label, ax1)
print()

# The NYT fit is unpacked into the notebook namespace last, so these names hold
# the NYT values after the cell has run.
slope, intercept, r_value, p_value, std_err = _fit_and_plot(combined_df_NYT, 'NYT', x_label, y_label, ax2)


# Persist both combined frames for the regression step
combined_df_NYT.to_csv(r'C:\Users\joneh\master_thesis\data\regression\combined_df_NYT.csv')
combined_df_TG.to_csv(r'C:\Users\joneh\master_thesis\data\regression\combined_df_TG.csv')


In [None]:
# GARCH on the primary axis, smoothed sentiment on a secondary (twin) y-axis
fig, ax1 = plt.subplots(figsize=(10, 5), dpi=130)
ax2 = ax1.twinx()

lw = 1

combined_df_TG['GARCH'].plot(ax=ax1, color='blue', label='GARCH', linewidth=lw)

# 52-row rolling mean of SV for each source
sv_smooth_TG = combined_df_TG['SV'].rolling(52).mean()
sv_smooth_NYT = combined_df_NYT['SV'].rolling(52).mean()
sv_smooth_TG.plot(ax=ax2, color='red', label='SV TG', linewidth=lw)
sv_smooth_NYT.plot(ax=ax2, color='green', label='SV NYT', linewidth=lw)

# Gather the handles of both axes into one shared legend
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='best')

# Rows present in both sources: TG SV and GARCH next to NYT SV
sv_garch_TG = combined_df_TG[['SV', 'GARCH']]
comp = sv_garch_TG.join(
    combined_df_NYT['SV'],
    how='inner',
    lsuffix='_TG',
    rsuffix='_NYT',
)

# Row-wise average of the two sentiment columns
comp['mean'] = comp[['SV_TG', 'SV_NYT']].mean(axis='columns')

print(comp.corr())