In [None]:
from sent_utils import *

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from arch import arch_model
import yfinance as yf

import sys
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Import main utility functions
# NOTE(review): absolute, machine-specific path is prepended to sys.path here;
# on another machine this directory will not exist — consider a path relative
# to the repository instead.
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')
from main_utils import *

### Sentiment Analysis Testing

Sentiment analysis functions should take a pandas Series of texts as input and return a pandas Series of the same length containing the sentiment of each input text. Each sentiment value should be a float between -1 and 1, where -1 is negative, 0 is neutral, and 1 is positive.

In [None]:
# Load the combined news archive
news_df = load_df('news', 'CombinedArchive.csv')

# Keep only the rows whose headline is a string (drops non-text entries).
# .copy() makes the filtered frame independent of the loaded one, so adding
# sentiment columns to it later is not an assignment into a slice.
news_df = news_df[news_df['headline'].apply(lambda x: isinstance(x, str))].copy()

### Sentiment analysis

In [None]:
# Score every headline with the textblob_sentiment helper and store the
# result next to the headline it belongs to.
headline_scores = textblob_sentiment(news_df['headline'])
news_df['textblob'] = headline_scores

# Alternative scorers, currently disabled:
#   ChatGPT (pre-computed scores read from disk)
#     cgpt = pd.read_csv('chatgpt_sentiment.csv')
#     news_df['ChatGPT'] = cgpt
#   FinBERT (applied headline by headline)
#     news_df['finbert'] = news_df['headline'].apply(FinBERT_sentiment)

display(news_df)

### Aggregation and Sentiment index

In [None]:
# Resample data: aggregate the per-headline TextBlob score per source
# at the chosen sampling frequency.
sample_freq = 'd'

df_sent_Oil = aggregate_score(news_df[news_df['source'] == 'TG'], ['textblob'], frequency=sample_freq)
df_sent_NG = aggregate_score(news_df[news_df['source'] == 'NG'], ['textblob'], frequency=sample_freq)

# Apply the sentiment index to the *absolute* aggregated score, i.e. the
# index is driven by sentiment magnitude, not by its sign.
# NOTE(review): the meaning of the second SI_bai argument (7.0) is not visible
# in this notebook — confirm against its definition.
df_sent_Oil['SI_bai_Oil'] = SI_bai(abs(df_sent_Oil['textblob']), 7.0)
df_sent_NG['SI_bai_NG'] = SI_bai(abs(df_sent_NG['textblob']), 7.0)

# Disabled: FinBERT-based index
# df_sent['SI_bai_finbert'] = SI_bai(df_sent['finbert'], 7.0)

# Plot the 100-period rolling mean of both indices (left axis) against the
# crude oil futures price (right axis).
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
ax2 = ax.twinx()
df_sent_Oil['SI_bai_Oil'].rolling(100).mean().plot(ax=ax, label='Oil')
df_sent_NG['SI_bai_NG'].rolling(100).mean().plot(ax=ax, label='Natural Gas')

# auto_adjust=False keeps the 'Adj Close' column regardless of the installed
# yfinance version's default.
df = yf.download('CL=F', period='20Y', auto_adjust=False)['Adj Close']
if isinstance(df, pd.DataFrame):
    # Some yfinance versions return one column per ticker; reduce to a Series
    # so the explicit label below is honoured.
    df = df.squeeze('columns')

df.plot(ax=ax2, color='black', label='Oil Price')

ax.set_title('Sentiment indices (100-period rolling mean) vs. crude oil price')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment index')
ax2.set_ylabel('CL=F adjusted close')

# The two y-axes keep separate legend entries; merge them into one legend so
# the labels assigned above are actually shown.
index_handles, index_labels = ax.get_legend_handles_labels()
price_handles, price_labels = ax2.get_legend_handles_labels()
ax.legend(index_handles + price_handles, index_labels + price_labels, loc='upper left')

# Correlation between the oil and the natural gas sentiment index
print(
    'Correlation Oil vs. Natural Gas sentiment index:',
    df_sent_Oil['SI_bai_Oil'].corr(df_sent_NG['SI_bai_NG']),
)

### Save sentiment indices

In [None]:
# Enter filename here:
file_name = 'sentiment_index.csv'
# Enter relative path for saving the file:
relative_path = 'data/time_series'

# `df_sent` is not created anywhere earlier in the notebook, so build it here
# from the two indices computed in the aggregation cell; concat aligns them on
# their index and yields one column per index.
# NOTE(review): confirm that consumers of this file expect the columns
# 'SI_bai_Oil' and 'SI_bai_NG'.
df_sent = pd.concat([df_sent_Oil['SI_bai_Oil'], df_sent_NG['SI_bai_NG']], axis=1)

# index=True keeps the index (presumably the resampled dates) in the file.
df_sent.to_csv(save_path(relative_path, file_name), index=True)

In [None]:
# Load the stored CL=F price history and index it by timezone-naive dates.
# NOTE(review): absolute, machine-specific path — consider loading this file
# through a path relative to the repository.
price_df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\time_series\CL=F_20years.csv')
price_df.index = pd.to_datetime(price_df['Date']).dt.tz_localize(None)

# calculate log-returns
price_df['Log Return'] = np.log(price_df['Adj Close']).diff()

# Calculate volatility as the 21-observation rolling standard deviation
price_df['Volatility'] = price_df['Log Return'].rolling(21).std()

# add GARCH(1,1) model; the first log return is NaN (from diff) and is dropped
# before fitting, the fitted volatility is aligned back on the index.
garch = arch_model(price_df['Log Return'].dropna(), vol='Garch', p=1, q=1)
res = garch.fit(disp='off')
price_df['GARCH'] = res.conditional_volatility

# Join the oil sentiment frame with the resampled price features.
# The aggregation cell produces `df_sent_Oil` / `SI_bai_Oil`; the previously
# referenced `df_sent` / `SI_bai_textblob` are not defined in this notebook.
combined_df = df_sent_Oil.join(
    price_df[['Volatility', 'Adj Close', 'Volume', 'GARCH', 'Log Return']].resample(sample_freq).mean(),
    how='left'
)

# Carry the last known values forward over gaps, then drop the leading rows
# that still have no value. (.ffill() replaces the deprecated
# fillna(method='ffill').)
combined_df = combined_df.ffill().dropna()

x_label = 'SI_bai_Oil'
y_label = 'GARCH'

slope, intercept, r_value, p_value, std_err = stats.linregress(combined_df[x_label], combined_df[y_label])
print(f'p-value: {p_value:.10f}')
# linregress returns the correlation coefficient r; R2 is its square.
print(f'R2: {r_value ** 2:.4f}')
print(f'Slope: {slope:.4f}')
print(f'Correlation TG: {combined_df[x_label].corr(combined_df[y_label]):.4f}')

# Scatter with fitted regression line plus marginal distributions.
# Note: jointplot returns a seaborn JointGrid, not a matplotlib Figure.
fig = sns.jointplot(
    data=combined_df,
    x=x_label,
    y=y_label,
    scatter_kws={'s':5},
    kind='reg',
    line_kws={'color':'red'},
)

### Save data

In [None]:
# Name of the output file:
file_name = 'combined_df_sentiment.csv'
# Relative path of the folder the file is written to:
relative_path = 'data/model_input'

# Resolve the destination first, then write the combined frame.
# NOTE(review): index=False leaves the frame's index out of the file —
# confirm the index is not needed by whatever reads this file.
output_path = save_path(relative_path, file_name)
combined_df.to_csv(output_path, index=False)