In [None]:
from sent_utils import * # has to be first to avoid conflict with Julia load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from arch import arch_model
import sys
import warnings
from textblob import TextBlob

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides library diagnostics raised by later cells;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Import main utility functions
# Put the project's source folder first on the module search path so that the
# `utils.main_utils` import below can resolve (machine-specific absolute path).
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src_HF')

from utils.main_utils import *

### Sentiment Analysis Testing

Sentiment analysis functions should take a pandas Series as input and return a pandas Series of the same length containing the sentiment of the input text. The sentiment should be a float between -1 and 1, where -1 is negative, 0 is neutral, and 1 is positive.

In [None]:
# Load data from database
# Reads the news CSV export into `news_df`.
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
news_df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\news\EIKON_ALL_NEWS.csv')

# Render the loaded frame for a visual sanity check.
display(news_df)

### Sentiment analysis with textblob

Apply a sentiment analysis score to each headline. A score of 0.0 is assigned where there is none.

In [None]:
def _textblob_scores(text):
    """Return TextBlob's ``(polarity, subjectivity)`` pair for one headline.

    TextBlob only accepts strings, so any other value (for example the NaN
    pandas uses for a missing headline) is scored as neutral ``(0.0, 0.0)``
    instead of raising a ``TypeError``.
    """
    if not isinstance(text, str):
        return 0.0, 0.0
    scores = TextBlob(text).sentiment
    return scores.polarity, scores.subjectivity


# Score every headline once and build the result frame in a single step;
# this avoids constructing one pd.Series per row inside .apply().
sentiment = pd.DataFrame(
    [_textblob_scores(text) for text in news_df['text']],
    columns=['polarity', 'subjectivity'],
    index=news_df.index,
)

# Attach the two score columns next to the original news columns (aligned on the index).
sentiment_df = pd.concat([news_df, sentiment], axis=1)
display(sentiment_df)

### Save sentiment data

In [None]:
# Name of the CSV that will hold the scored headlines:
file_name = 'SENTIMENT_ALL_NEWS.csv'
# Relative folder handed to save_path() together with the file name:
relative_path = 'data/news'

# Write the scored headlines to disk; the row index is not written.
sentiment_df.to_csv(save_path(relative_path, file_name), index=False)

### Aggregation and Sentiment index

Calculates the daily average sentiment score.
```python 
aggregate_score()
``` 
$$
SV_t = \frac{1}{N_t}\sum_{i=1}^{N_t} PV_{it}
$$

Calculates the close sentiment and applies it to the first day the market is open.
```python 
merge_sentiment()
```
$$
SV_{t,new} =\frac{\sum_{k=0}^{K}0.9^k\cdot SV_{t-k}}{\sum_{k=0}^{K}0.9^k}
$$

Creates a sentiment index.
```python 
SI_bai()
``` 

$$
SI_t = SV_t + \sum_{i=1}^{t-1} SV_i\cdot e^{-\frac{t-i}{\beta}}
$$


### Aggregate to the sampling frequency

In [None]:
# Resample data
# Frequency string forwarded to aggregate_score (presumably a pandas offset alias, 'h' = hourly — TODO confirm).
sample_freq = 'h'

# Index the scored headlines by the timestamps parsed from the 'versionCreated' column.
# NOTE: this re-indexes `sentiment_df` in place.
sentiment_df.index = pd.to_datetime(sentiment_df['versionCreated'])

# aggregate_score is a project helper (not defined in this notebook); it receives the
# two score columns and the frequency. Its exact output columns are not visible here.
df_sent = aggregate_score(sentiment_df, ['polarity', 'subjectivity'], frequency=sample_freq)

display(df_sent)

In [None]:
# Load the price series (machine-specific absolute path).
price_df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\time_series\CLc1_High_Frequency.csv')
# Index prices by the parsed 'Date' column with any timezone information removed.
price_df.index = pd.to_datetime(price_df['Date']).dt.tz_localize(None)

# Remove timezone information from the sentiment index too, so both frames join on naive timestamps.
# NOTE(review): tz_localize(None) keeps the wall-clock time as-is; confirm both sources share a timezone.
df_sent.index = df_sent.index.tz_localize(None)

# calculate log-returns
price_df['LOGRET'] = np.log(price_df['CLOSE']).diff()

# add GARCH(1,1) model
# The first log-return is NaN (result of diff) and is dropped before fitting.
# NOTE(review): the returns are fitted unscaled and warnings are silenced globally in the
# setup cell, so any fit diagnostics would not be shown — confirm the fit converges.
garch = arch_model(price_df['LOGRET'].dropna(), vol='Garch', p=1, q=1)
res = garch.fit(disp='off')
# Assigned by index alignment; rows without a fitted value stay NaN.
price_df['GARCH'] = res.conditional_volatility

# Left join on the index: keep every sentiment period and attach price columns where timestamps match.
combined_df = df_sent.join(
    price_df[['GARCH', 'CLOSE', 'LOGRET', 'VOLUME']], 
    how='left'
)

# Keep only the periods in which every column has a value.
combined_df = combined_df.dropna()

display(combined_df)

# add sentiment index
# SI_bai is a project helper (not defined in this notebook); beta=7 is presumably its decay parameter — TODO confirm.
combined_df['SI_BAI'] = SI_bai(combined_df['polarity'], beta=7)

# Columns of combined_df used as regressor (x) and response (y).
x_label = 'count'
y_label = 'GARCH'

# Simple linear regression of y on x. Note that `r_value` is the Pearson
# correlation coefficient r, not the coefficient of determination.
slope, intercept, r_value, p_value, std_err = stats.linregress(combined_df[x_label], combined_df[y_label])
print(f'p-value: {p_value:.10f}')
# R^2 of a simple linear regression is the squared correlation coefficient.
print(f'R2: {r_value ** 2:.4f}')
print(f'Slope: {slope:.4f}')
print(f'Correlation: {combined_df[x_label].corr(combined_df[y_label]):.4f}')

# Scatter plot with fitted regression line (red) and marginal distributions.
fig = sns.jointplot(
    data=combined_df,
    x=x_label,
    y=y_label,
    scatter_kws={'s':5},
    kind='reg',
    line_kws={'color':'red'},
)

### Save sentiment indices

In [None]:
# Enter filename here:
file_name = 'sentiment_index.csv'
# Enter relative path for saving the file:
relative_path = 'data/time_series'

# Write the aggregated sentiment frame, including its index, as the first CSV column.
# NOTE(review): the SI_BAI column is added to `combined_df`, not `df_sent`, so this file
# holds the aggregated scores only — confirm that is intended.
df_sent.to_csv(save_path(relative_path, file_name), index=True)

### Save data

In [None]:
# Enter filename here:
file_name = 'combined_df_sentiment.csv'
# Enter relative path for saving the file:
relative_path = 'data/model_input'

# Write the combined sentiment/price frame without its index.
# NOTE(review): the index of `combined_df` is the timestamp key used for the join, so
# index=False drops the timestamps from the saved file — confirm downstream code does not need them.
combined_df.to_csv(save_path(relative_path, file_name), index=False)