In [None]:
from sent_utils import * # has to be first to avoid conflict with Julia load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from arch import arch_model
import yfinance as yf

import sys
import warnings

# Suppress all warnings, from every library, for the rest of the session.
# NOTE(review): this also hides warnings that may be informative — confirm it is intended.
warnings.filterwarnings("ignore")

# Put the project's src directory first on the module search path before the
# star imports below (presumably where main_utils / db_utils live — verify).
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
sys.path.insert(0, r'c:\Users\joneh\master_thesis\src')

from main_utils import *        # main utilities
from db_utils import *          # database utilities

### Sentiment Analysis Testing

Sentiment analysis functions should take a pandas Series of texts as input and return a pandas Series of the same length containing the sentiment of each input text. The sentiment should be a float between -1 and 1, where -1 is negative, 0 is neutral, and 1 is positive.

In [None]:
# Read the filtered news table from the database and discard the
# 'Unnamed: 0' column in a single, non-mutating chain.
news_df = news_db_load('news_filtered').drop(columns=['Unnamed: 0'])

display(news_df)

### Sentiment analysis

Applies a sentiment score to each headline. A neutral score of 0.0 is used where none is available.

In [None]:
# TextBlob: score all headlines in one call and store the result as a new column.
headline_scores = textblob_sentiment(news_df['headline'])
news_df['textblob'] = headline_scores

# FinBERT alternative (currently disabled):
# news_df['finbert'] = news_df['headline'].apply(FinBERT_sentiment)

display(news_df)

### Aggregation and Sentiment index

Calculates the daily average sentiment score.
```python 
aggregate_score()
``` 
$$
SV_t = \frac{1}{N_t}\sum_{i=1}^{N_t} PV_{it}
$$

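Carries the sentiment from days when the market is closed over to the first day the market is open again, as a decay-weighted average over that open day and the $K$ closed days immediately before it.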
```python 
merge_sentiment()
```
$$
SV_{t,new} =\frac{\sum_{k=0}^{K}0.9^k\cdot SV_{t-k}}{\sum_{k=0}^{K}0.9^k}
$$

Creates a sentiment index.
```python 
SI_bai()
``` 

$$
SI_t = SV_t + \sum_{i=1}^{t-1} SV_i\cdot e^{-\frac{t-i}{\beta}}
$$


### Aggregate to daily

In [None]:
# Frequency code handed to aggregate_score ('d' for the daily aggregation of this section).
sample_freq = 'd'

# Aggregate the per-headline TextBlob scores at the chosen frequency.
df_sent = aggregate_score(
    news_df,
    ['textblob'],
    frequency=sample_freq,
)

display(df_sent)

In [None]:
def merge_sentiment(
        sentiment_df: pd.DataFrame,
        market_days: pd.Series,
        decay: float = 0.9,
    ) -> pd.DataFrame:
    """Fold closed-market sentiment into the next open market day.

    For every open day that directly follows a run of one or more closed
    days, the sentiment of that open day is replaced by the decay-weighted
    average over the open day itself (weight ``decay**0``) and the preceding
    closed days (weight ``decay**k`` for the day ``k`` rows earlier)::

        SV_new = sum_k decay**k * SV_{t-k} / sum_k decay**k

    Parameters
    ----------
    sentiment_df : pd.DataFrame
        Numeric sentiment scores, one column per sentiment measure, indexed
        by date. Each column is processed independently.
    market_days : pd.Series
        Boolean flags indexed by date: True when the market is open.
        Dates missing from this series are treated as closed.
    decay : float, default 0.9
        Per-row decay factor applied to older days.

    Returns
    -------
    pd.DataFrame
        One row per open day that follows a closed run, with the same
        columns as ``sentiment_df``. Open days that follow another open day
        and trailing closed days with no later open day are not included.
        A missing (NaN) score anywhere in a window makes that window's
        result NaN for the affected column.
    """
    # Align both inputs on the sorted union of their dates. The market flag is
    # kept separate from the scores so it is never averaged as a sentiment column.
    all_days = sentiment_df.index.union(market_days.index)
    scores = sentiment_df.reindex(all_days).to_numpy(dtype=float)
    is_open = market_days.reindex(all_days, fill_value=False).to_numpy(dtype=bool)

    n_cols = scores.shape[1]
    merged_rows = []
    open_positions = []
    run_start = None  # position of the first closed day in the current closed run

    for pos, open_today in enumerate(is_open):
        if not open_today:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            # Window = closed run + this open day; the newest row gets exponent 0.
            window = scores[run_start:pos + 1]
            weights = decay ** np.arange(len(window) - 1, -1, -1, dtype=float)
            merged_rows.append(weights @ window / weights.sum())
            open_positions.append(pos)
            run_start = None

    # Explicit reshape keeps the (0, n_cols) shape when nothing was merged.
    data = np.asarray(merged_rows, dtype=float).reshape(len(merged_rows), n_cols)
    return pd.DataFrame(
        data,
        index=all_days[np.asarray(open_positions, dtype=int)],
        columns=sentiment_df.columns,
    )

In [None]:
# Load the price history and index it by timezone-naive timestamps.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
price_df = pd.read_csv(r'C:\Users\joneh\master_thesis\data\time_series\CL=F_20years.csv')
price_df.index = pd.to_datetime(price_df['Date']).dt.tz_localize(None)

# calculate log-returns
price_df['Log Return'] = np.log(price_df['Adj Close']).diff()

# Calculate volatility as the rolling standard deviation over 21 rows
price_df['Volatility'] = price_df['Log Return'].rolling(21).std()

# add GARCH(1,1) model
garch = arch_model(price_df['Log Return'].dropna(), vol='Garch', p=1, q=1)
res = garch.fit(disp='off')
price_df['GARCH'] = res.conditional_volatility

# Left join keeps every sentiment row; rows without price data get NaN price columns.
combined_df = df_sent.join(
    price_df[['Volatility', 'Adj Close', 'Volume', 'GARCH', 'Log Return']], 
    how='left'
)

# return series showing if market is open or not with true/false
market_days = combined_df['Adj Close'].notna()

combined_df = combined_df.dropna()

# Pass the full sentiment series, not the dropna()'d frame: dropna() has already
# removed the closed-market rows, so their sentiment would otherwise be unavailable
# for the weighted carry-over onto the next open day.
updated_open_days = merge_sentiment(df_sent[['textblob']], market_days)

display(updated_open_days)

# DataFrame.update aligns on index/columns and only overwrites with non-NA values.
combined_df.update(updated_open_days)

display(combined_df)

# add sentiment index
combined_df['SI_BAI'] = SI_bai(combined_df['textblob'], beta=7)

x_label = 'SI_BAI'
y_label = 'GARCH'

slope, intercept, r_value, p_value, std_err = stats.linregress(combined_df[x_label], combined_df[y_label])
print(f'p-value: {p_value:.10f}')
# linregress returns the correlation coefficient r; R2 is its square.
print(f'R2: {r_value ** 2:.4f}')
print(f'Slope: {slope:.4f}')
print(f'Correlation TG: {combined_df[x_label].corr(combined_df[y_label]):.4f}')

# Joint regression plot of the sentiment index against GARCH volatility.
fig = sns.jointplot(
    data=combined_df, 
    x=x_label, 
    y=y_label,
    scatter_kws={'s':5},
    kind='reg',
    line_kws={'color':'red'},
)

### Save sentiment indices

In [None]:
# Output file name for the aggregated sentiment scores:
file_name = 'sentiment_index.csv'
# Folder, relative to the project, that the file is written to:
relative_path = 'data/time_series'

# Resolve the destination first, then write the frame with its index included.
sentiment_out_path = save_path(relative_path, file_name)
df_sent.to_csv(sentiment_out_path, index=True)

### Save data

In [None]:
# Output file name for the combined sentiment / price data:
file_name = 'combined_df_sentiment.csv'
# Folder, relative to the project, that the file is written to:
relative_path = 'data/model_input'

# Resolve the destination first, then write the frame.
# NOTE(review): index=False leaves the index out of the file — confirm that
# downstream consumers do not need it.
combined_out_path = save_path(relative_path, file_name)
combined_df.to_csv(combined_out_path, index=False)