In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from arch import arch_model
from tqdm import tqdm

from scipy import stats
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from BERT_utils import *

### Load training data

In [None]:
# Input locations, the columns discarded right after loading, and the
# checkpoint identifier shared by the classifier and its tokenizer.
HEADLINES_CSV = r'C:\Users\joneh\master_thesis\data\news\TheGuardian\TG_CrudeANDOil.csv'
PRICES_CSV = r'C:\Users\joneh\master_thesis\data\time_series\YahooFinance\CL=F_20years.csv'
HEADLINE_COLUMNS_TO_DROP = [
    'id', 'webPublicationDate', 'type', 'sectionId', 'sectionName',
    'webUrl', 'apiUrl', 'isHosted', 'pillarId', 'pillarName',
]
PRICE_COLUMNS_TO_DROP = ['Date', 'Open', 'High', 'Low', 'Close']
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Headlines: reduce 'datetime' to plain calendar dates, then drop the
# metadata columns that are not used afterwards.
df_headlines = (
    pd.read_csv(HEADLINES_CSV)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']).dt.date)
    .drop(columns=HEADLINE_COLUMNS_TO_DROP)
)

# Prices: index the rows by calendar date (taken from the 'Date' column) so
# they can be looked up by date later, then drop the listed columns.
df_prices = pd.read_csv(PRICES_CSV)
df_prices.index = pd.to_datetime(df_prices['Date']).dt.date
df_prices = df_prices.drop(columns=PRICE_COLUMNS_TO_DROP)

# Pre-trained sequence classifier configured with three labels.
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

# Tokenizer loaded from the same checkpoint as the classifier.
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)


### Create and add GARCH(1,1) volatility labels to headlines

In [None]:
# Fit a GARCH(1,1) model with normally distributed errors.
# NOTE(review): the series passed in is the 'Adj Close' column itself rather
# than a return series -- confirm this is the intended model input.
garch_model = arch_model(
    df_prices['Adj Close'],
    vol='Garch',
    p=1,
    q=1,
    dist='Normal',
)
train_fit = garch_model.fit(disp='off')

# Derived columns added to df_prices:
#   garch_vol - conditional volatility of the fitted model
#   log_vol   - first difference of log(garch_vol)
#   log_ret   - first difference of log('Adj Close')
df_prices = df_prices.assign(
    garch_vol=train_fit.conditional_volatility,
    log_vol=lambda prices: np.log(prices['garch_vol']).diff(),
    log_ret=lambda prices: np.log(prices['Adj Close']).diff(),
)

# Attach the price-derived series to every headline by looking the headline's
# calendar date up in df_prices (which is indexed by date). Dates that have no
# matching price row come back as NaN.
for label in ['garch_vol', 'log_vol', 'log_ret', 'Adj Close', 'Volume']:
    df_headlines[label] = df_headlines['datetime'].map(df_prices[label])

# Fill the gaps: garch_vol carries the previous value forward, log_vol and
# log_ret take the next available value.
# The filled Series is assigned back to the column instead of calling
# fillna(method=..., inplace=True) on a column selection: that form is a
# chained in-place update, is deprecated in recent pandas releases, and does
# not modify df_headlines when Copy-on-Write is enabled.
df_headlines['garch_vol'] = df_headlines['garch_vol'].ffill()
df_headlines['log_vol'] = df_headlines['log_vol'].bfill()
df_headlines['log_ret'] = df_headlines['log_ret'].bfill()

# Drop every row that still contains a NaN in any column ('Adj Close' and
# 'Volume' are not filled above, so headlines without a price row go here).
df_headlines.dropna(inplace=True)

# Score every headline title with VADER; results are keyed by the row's
# index label so they can be aligned back onto df_headlines.
sia = SentimentIntensityAnalyzer()

res = {
    idx: sia.polarity_scores(row['webTitle'])
    for idx, row in tqdm(df_headlines.iterrows(), total=len(df_headlines))
}

# Keep only the 'compound' score; the assignment aligns on index labels.
df_headlines['sia'] = pd.DataFrame.from_dict(res, orient='index')['compound']

# FinBERT per-headline scoring (currently disabled):
# df_headlines['FinBERT'] = get_BERT_sentiment_per_headline(
#     df_headlines['webTitle'],
#     model=model,
#     tokenizer=tokenizer
# )

display(df_headlines)

In [None]:
# Columns to compare: x is the regressor, y is the response.
x_label = 'sia'
y_label = 'Volume'

x_values = df_headlines[x_label]
y_values = df_headlines[y_label]

# Simple linear regression of y on x; report p-value, r and slope.
slope, intercept, r_value, p_value, std_err = stats.linregress(x_values, y_values)
print(f'p-value: {p_value:.7f}')
print(r_value)
print(slope)

# Scatter of the raw points with the fitted regression line drawn in red.
fig, ax = plt.subplots(figsize=(7, 5))
df_headlines.plot.scatter(x=x_label, y=y_label, s=2, ax=ax)
ax.plot(x_values, slope * x_values + intercept, color='red')
ax.grid(alpha=0.2)
ax.set_title(f'{x_label} vs {y_label}')

# Correlation between the two columns as computed by pandas.
print(f'Correlation: {x_values.corr(y_values)}')


### Initialize pre-trained model

FinBERT (`yiyanghkust/finbert-tone`) is loaded as a pre-trained model. It was pre-trained on financial-domain text and fine-tuned for financial sentiment analysis.

In [None]:
# Checkpoint identifier shared by the classifier and its tokenizer.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'

# Load the pre-trained sequence classifier, configured with three labels.
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

# Load the tokenizer from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)

### Train model

In [None]:
# Collect every headline title as a plain Python list.
batch_of_sentences = df_headlines['webTitle'].to_list()

# Tokenize the whole list in one call, with padding and truncation enabled,
# returning PyTorch tensors.
inputs = tokenizer(
    batch_of_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

display(inputs)

# ---------------------------------------------------------------------------
# Fine-tuning loop (disabled).
# TODO: `training_dataset` is not defined in the cells shown here; it has to
# be built (presumably from `inputs` plus labels -- confirm) before this code
# can be re-enabled.
# ---------------------------------------------------------------------------

# # Create a dataloader
# train_dataloader = DataLoader(training_dataset, batch_size=32)

# # Prepare optimizer
# optimizer = AdamW(model.parameters(), lr=1e-5)

# num_epochs = 5

# # Training loop
# model.train()
# for epoch in range(num_epochs):
#     for batch in train_dataloader:
#         optimizer.zero_grad()
#         input_ids = batch['input_ids']
#         attention_mask = batch['attention_mask']
#         labels = batch['labels']
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

# # Save your model
# model.save_pretrained('models/tuned_BERT')