In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import stats
from arch import arch_model

import sys
from textblob import TextBlob
import yfinance as yf

from sent_utils import *

# Import main utility functions
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
from main_utils import *

### Sentiment Analysis Testing

Sentiment analysis functions should take a pandas Series as input and return a pandas Series of the same length containing the sentiment of each input text. The sentiment should be a float between -1 and 1, where -1 is negative, 0 is neutral, and 1 is positive.

In [None]:
# Read the news articles CSV
df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\news\TheGuardian\TG_CrudeANDOil.csv')

# Discard the metadata columns that are not used further down
unused_columns = [
    'type', 'sectionId', 'sectionName', 'webPublicationDate', 'webUrl',
    'apiUrl', 'isHosted', 'pillarId', 'pillarName',
]
df = df.drop(columns=unused_columns)


def _title_sentiment(title):
    """Return TextBlob's sentiment tuple for one title, wrapped in a Series."""
    return pd.Series(TextBlob(title).sentiment)


# Score every title; the two resulting columns are stored as polarity / subjectivity
title_scores = df['webTitle'].apply(_title_sentiment)
df[['polarity', 'subjectivity']] = title_scores

# Index rows by timezone-naive timestamps taken from the 'datetime' column
naive_timestamps = pd.to_datetime(df['datetime']).dt.tz_localize(None)
df.index = naive_timestamps

display(df.head())

# aggregate scores to daily frequency
sent_df = aggregate_score(df, ['polarity', 'subjectivity'], frequency='D')


In [None]:
price_df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\time_series\YahooFinance\CL=F_20years.csv')

sent_df['polarity'].plot()


def _decayed_sentiment(scores, tau=7):
    """Return the exponentially decayed running sum of ``scores``.

    For every position t the result is

        SV[t] = sum_{i=0..t} scores[i] * exp(-(t - i) / tau)

    i.e. the current score plus all earlier scores, each down-weighted by
    how many steps ago it occurred. The sum starts at the very first
    observation (i = 0).

    The value is computed with the equivalent recursion
    ``SV[t] = scores[t] + exp(-1 / tau) * SV[t - 1]``, which needs a single
    pass over the data instead of re-summing the whole history at each step.

    Parameters
    ----------
    scores : array-like of float
        Sentiment scores in chronological order.
    tau : float, default 7
        Decay constant, in steps of the series.

    Returns
    -------
    numpy.ndarray
        Array of the same length as ``scores``. A NaN score makes that
        value and every later value NaN.
    """
    decay = np.exp(-1 / tau)
    values = np.asarray(scores, dtype=float)
    decayed = np.empty_like(values)
    running = 0.0
    for t, score in enumerate(values):
        # carry the decayed history forward one step and add today's score
        running = score + decay * running
        decayed[t] = running
    return decayed


sent_df['SV'] = _decayed_sentiment(sent_df['polarity'])

In [None]:
# Index prices by timezone-naive timestamps parsed from the 'Date' column
price_df.index = pd.to_datetime(price_df['Date']).dt.tz_localize(None)
# Log return = first difference of log adjusted close (first row is NaN)
price_df['Log Return'] = np.log(price_df['Adj Close']).diff()
# Calculate volatility: rolling standard deviation over 21 observations
price_df['Volatility'] = price_df['Log Return'].rolling(21).std()

# add GARCH(1,1) model, fitted on the non-missing log returns
garch = arch_model(price_df['Log Return'].dropna(), vol='Garch', p=1, q=1)
res = garch.fit(disp='off')

# Assignment aligns on the index, so rows excluded from the fit stay NaN
# (assumes conditional_volatility keeps the index of the fitted series -- TODO confirm)
price_df['GARCH'] = res.conditional_volatility

# Keep every sentiment row and attach the price columns where the index matches
combined_df = sent_df.join(price_df[['Volatility', 'Adj Close', 'Volume', 'GARCH', 'Log Return']], how='left')
# Carry the last known values forward over unmatched rows, then drop the
# leading rows that still have gaps. `.ffill()` is used because
# `fillna(method=...)` is deprecated in recent pandas releases.
combined_df = combined_df.ffill().dropna()

display(combined_df)

# Columns to compare: y_label is regressed on x_label
x_label = 'SV'
y_label = 'GARCH'

x_values = combined_df[x_label]
y_values = combined_df[y_label]

# Simple linear regression of y on x
regression = stats.linregress(x_values, y_values)
slope, intercept, r_value, p_value, std_err = regression
print(f'p-value: {p_value:.7f}')
print(r_value)
print(slope)

# Scatter of the raw points with the fitted line drawn on top
fig, ax = plt.subplots(figsize=(7, 5))
combined_df.plot(kind='scatter', x=x_label, y=y_label, s=2, ax=ax)
fitted_line = slope * x_values + intercept
ax.plot(x_values, fitted_line, color='red')
ax.grid(alpha=0.2)
ax.set_title(f'{x_label} vs {y_label}')


# correlation
pairwise_corr = x_values.corr(y_values)
print(f'Correlation: {pairwise_corr}')


In [None]:
# Plot the last 300 rows of GARCH volatility and SV on a shared x-axis with
# separate y-axes (left: GARCH, right: SV).
fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()

# The legend label names the series that is actually drawn (the GARCH column)
combined_df['GARCH'].iloc[-300:].plot(ax=ax1, color='blue', label='GARCH')
combined_df['SV'].iloc[-300:].plot(ax=ax2, color='red', label='SV')

ax1.set_ylabel('GARCH')
ax2.set_ylabel('SV')

# combine labels: each axis tracks its own handles, so merge them into one legend
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc=0)