In [None]:
from sent_utils import * # has to be first to avoid conflict with Julia load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from arch import arch_model
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Load environment variables (optionally from a local .env file) and read the
# repository root used to build every data/source path in this notebook.
load_dotenv()
repo_path = os.getenv("REPO_PATH")
if repo_path is None:
    # Fail here with an actionable message instead of an opaque TypeError at
    # the first string concatenation below.
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )
plt.style.use('science')

# Import main utility functions
# repo_path is concatenated directly, so it is expected to end with a path separator.
sys.path.insert(0, repo_path + r'src_HF')

from utils.main_utils import *
from utils.text_utils import *

### Sentiment Analysis Testing

Sentiment analysis functions should take a pandas Series as input and return a pandas Series of the same length containing the sentiment of each input text. The sentiment should be a float between -1 and 1, where -1 is negative, 0 is neutral, and 1 is positive.

In [None]:
# Topic code that selects which EIKON news export is analysed.
topic = 'CRU'

# Read the line-delimited JSON export for this topic into a DataFrame.
news_file = repo_path + rf'data\news_data\EIKON_{topic}_NEWS_COMPLETE.json'
text_df = pd.read_json(news_file, orient='records', lines=True)

# Preview the first two rows, then report the (rows, columns) shape.
display(text_df.head(2))
print(text_df.shape)

### Sentiment analysis with Textblob and VADER


In [None]:
# Score two text fields with both libraries: the 'text' column feeds the
# *_headline scores and the 'fullStory' column feeds the *_fullStory scores.
headline_text = text_df['text']
story_text = text_df['fullStory']

text_df['TextBlob_headline'] = add_textblob_polarity(headline_text)
text_df['VADER_headline'] = add_vader_compound(headline_text)

text_df['TextBlob_fullStory'] = add_textblob_polarity(story_text)
text_df['VADER_fullStory'] = add_vader_compound(story_text)

# Preview the frame with the four new score columns.
display(text_df.head(2))

### Save data

In [None]:
# Drop the raw text columns for storage efficiency.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
text_df = text_df.drop(columns=['text', 'fullStory'], errors='ignore')

# Persist the per-article sentiment scores for the current topic.
text_df.to_csv(repo_path + rf'data\sentiment_data\{topic}_ARTICLE_SENTIMENT.csv', index=False)

### Correlation between Textblob and VADER

In [None]:

# 2x2 grid of pairwise scatter plots between the four sentiment scores.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
marker_opts = {'alpha': 0.5, 's': 3}

# Left column: headline score vs. full-story score from the same library.
text_df.plot.scatter(x='TextBlob_headline', y='TextBlob_fullStory', ax=axs[0, 0], **marker_opts)
text_df.plot.scatter(x='VADER_headline', y='VADER_fullStory', ax=axs[1, 0], **marker_opts)

# Right column: TextBlob vs. VADER on the same text field.
text_df.plot.scatter(x='TextBlob_headline', y='VADER_headline', ax=axs[0, 1], **marker_opts)
text_df.plot.scatter(x='TextBlob_fullStory', y='VADER_fullStory', ax=axs[1, 1], **marker_opts)


### Distribution

In [None]:
# Which scored text field to plot. The scoring cell only creates
# '<library>_headline' and '<library>_fullStory' columns, so the bare
# 'TextBlob' / 'VADER' column names used here before raised a KeyError on a
# fresh run. Set text_source to 'headline' to plot the headline scores instead.
text_source = 'fullStory'
textblob_col = f'TextBlob_{text_source}'
vader_col = f'VADER_{text_source}'

fig, ax = plt.subplots(figsize=(9, 6))

# Overlay the score distributions of both libraries.
text_df[textblob_col].plot.hist(bins=100, alpha=0.5, label='TextBlob', ax=ax)
text_df[vader_col].plot.hist(bins=100, alpha=0.5, label='VADER', ax=ax)

# Equal-weight blend of the two scores, stored as a new column.
text_df['compound'] = text_df[vader_col] * 0.5 + text_df[textblob_col] * 0.5

text_df['compound'].plot.hist(bins=100, alpha=0.5, label='compound', ax=ax)

ax.legend(fontsize=14)
ax.set_xlabel('Sentiment score', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)

# NOTE(review): the target has no file extension, so matplotlib falls back to
# its default output format; confirm the intended file name inside 'images'.
fig.savefig(repo_path + r'src_HF\3 Sentiment Analysis\images', dpi=200, bbox_inches='tight')

### Save sentiment data

In [None]:
# Enter filename here:
file_name = 'SENTIMENT_ALL_NEWS.csv'  # plain string: the f-prefix had no placeholders
# Enter relative path for saving the file:
relative_path = 'data/news'

# NOTE(review): sentiment_df is not assigned in any cell shown in this notebook;
# presumably it is left over from an earlier version or comes from a star
# import. Confirm its origin, otherwise this fails after Restart & Run All.
sentiment_df.to_csv(save_path(relative_path, file_name), index=False)

### Aggregation and Sentiment index

Calculates the daily average sentiment score.
```python 
aggregate_score()
``` 
$$
SV_t = \frac{1}{N_t}\sum_{i=1}^{N_t} PV_{it}
$$

Calculates the close sentiment and applies it to the first day the market is open.
```python 
merge_sentiment()
```
$$
SV_{t,new} =\frac{\sum_{k=0}^{K}0.9^k\cdot SV_{t-k}}{\sum_{k=0}^{K}0.9^k}
$$

Creates a sentiment index.
```python 
SI_bai()
``` 

$$
SI_t = SV_t + \sum_{i=1}^{t-1} SV_i\cdot e^{-\frac{t-i}{\beta}}
$$


### Aggregate to the sampling frequency

In [None]:
# Resampling settings: pandas frequency alias and the score columns to aggregate.
sample_freq = 'h'
score_columns = ['polarity', 'subjectivity']

# Use each article's creation timestamp as the row index before aggregating.
article_times = pd.to_datetime(sentiment_df['versionCreated'])
sentiment_df.index = article_times

# Aggregate the selected score columns at the chosen frequency.
df_sent = aggregate_score(sentiment_df, score_columns, frequency=sample_freq)

display(df_sent)

In [None]:
# Load the price series from the repository data folder, using the same
# repo_path convention as the other data cells instead of a user-specific
# absolute path, so the notebook runs on other machines.
price_df = pd.read_csv(repo_path + r'data\time_series\CLc1_High_Frequency.csv')
price_df.index = pd.to_datetime(price_df['Date']).dt.tz_localize(None)

# Strip the timezone from the sentiment index as well so both indices are
# timezone-naive when joined.
df_sent.index = df_sent.index.tz_localize(None)

# calculate log-returns
price_df['LOGRET'] = np.log(price_df['CLOSE']).diff()

# add GARCH(1,1) model
garch = arch_model(price_df['LOGRET'].dropna(), vol='Garch', p=1, q=1)
res = garch.fit(disp='off')
# Assignment aligns on the index; the first row (no log-return) stays NaN.
price_df['GARCH'] = res.conditional_volatility

# Attach the market columns to the aggregated sentiment rows and keep only
# rows where every column is available.
combined_df = df_sent.join(
    price_df[['GARCH', 'CLOSE', 'LOGRET', 'VOLUME']],
    how='left'
)

combined_df = combined_df.dropna()

display(combined_df)

# add sentiment index
combined_df['SI_BAI'] = SI_bai(combined_df['polarity'], beta=7)

# Columns compared in the regression and joint plot below.
x_label = 'count'
y_label = 'GARCH'

slope, intercept, r_value, p_value, std_err = stats.linregress(combined_df[x_label], combined_df[y_label])
print(f'p-value: {p_value:.10f}')
# linregress returns the correlation coefficient r; R-squared is its square.
print(f'R2: {r_value**2:.4f}')
print(f'Slope: {slope:.4f}')
print(f'Correlation: {combined_df[x_label].corr(combined_df[y_label]):.4f}')

# Joint distribution with a fitted regression line; the extra keyword
# arguments are forwarded to the regression plot on the joint axes.
fig = sns.jointplot(
    data=combined_df,
    x=x_label,
    y=y_label,
    scatter_kws={'s':5},
    kind='reg',
    line_kws={'color':'red'},
)

### Save sentiment indices

In [None]:
# Enter filename here:
file_name = 'sentiment_index.csv'
# Enter relative path for saving the file:
relative_path = 'data/time_series'

# Resolve the destination through save_path, then write df_sent with its index.
index_csv = save_path(relative_path, file_name)
df_sent.to_csv(index_csv, index=True)

### Save data

In [None]:
# Enter filename here:
file_name = 'combined_df_sentiment.csv'
# Enter relative path for saving the file:
relative_path = 'data/model_input'

# Resolve the destination through save_path, then write combined_df without its index.
# NOTE(review): combined_df inherits its index from df_sent; with index=False
# that index is not written. Confirm the model input does not need it.
model_input_csv = save_path(relative_path, file_name)
combined_df.to_csv(model_input_csv, index=False)