In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.sentiment import SentimentIntensityAnalyzer

### Load data

In [None]:
# Read the combined dataset and index the rows by their parsed timestamp.
# The 'datetime' column itself is left in the frame.
df = pd.read_csv('data/complete_data.csv')
df = df.set_index(pd.to_datetime(df['datetime']))

### Sentiment analysis

In [None]:

# Score every headline with VADER and store the compound score in df['sentiment'].
#
# VADER is a rule-based scorer meant to be run on raw text: its rules look at
# negations, intensifiers, punctuation and capitalisation around each word.
# The previous version of this cell removed stop words (which strips words such
# as "not" and "very"), kept only the first sentence of each headline via a
# trailing `[0]` (which also raised IndexError on an empty headline), and then
# scored every remaining word on its own and summed the results. Scoring the
# untouched headline once keeps the context VADER needs and yields a compound
# score bounded to [-1, 1].
sia = SentimentIntensityAnalyzer()

# Missing titles are scored as empty text (compound 0.0) instead of crashing.
headlines = df['webTitle'].fillna('').astype(str)

sentiments = headlines.apply(lambda title: sia.polarity_scores(title)['compound'])

# add sentiment to df
df['sentiment'] = sentiments

# Preview only; the full frame is too large to be a useful cell output.
display(df.head())

In [None]:
# List the distinct values of the categorical columns.
for column in ('type', 'sectionId'):
    print(column)
    print(df[column].unique())

# Distribution of headline sentiment, leaving out exactly-zero scores so the
# neutral spike does not dominate the histogram.
df.loc[df['sentiment'] != 0, 'sentiment'].plot.hist(bins=100)


In [None]:

# Mean headline sentiment per calendar day; days with no headlines become 0.
df_sent = (
    df['sentiment']
    .resample('D')
    .mean()
    .fillna(0)
    .to_frame()
)

# Both frames are re-indexed by 'YYYY-MM-DD' strings so they can be joined on
# the calendar date alone.
df_sent.index = df_sent.index.strftime('%Y-%m-%d')

# Download prices for the 'CL=F' ticker and keep only the adjusted close.
# NOTE(review): confirm that the installed yfinance version accepts
# period='21Y' and still returns an 'Adj Close' column.
stock_data = yf.download('CL=F', period='21Y', ignore_tz=True)[['Adj Close']]
stock_data.index = stock_data.index.strftime('%Y-%m-%d')

# Log return between consecutive price rows.
# NOTE(review): np.log gives NaN / -inf for non-positive prices; check whether
# this series ever contains any.
stock_data['log_ret'] = np.log(stock_data).diff()

# Outer-join on the date string, then keep only dates that have a sentiment
# value (this drops price rows outside the sentiment date range).
data = (
    stock_data
    .join(df_sent, how='outer')
    .dropna(subset=['sentiment'])
)

# Carry the last known adjusted close forward over dates without a price row.
# 'log_ret' is deliberately left as NaN on those dates.
data['Adj Close'] = data['Adj Close'].ffill()

display(data)



In [None]:


# Rolling window length, in rows of `data`.
# NOTE(review): 252 is the conventional number of trading days per year, but
# `data` appears to hold one row per calendar day (prices are forward-filled
# over non-trading days), so this window spans less than a year. Confirm intent.
ROLLING_WINDOW = 252

# Compute each rolling series once and reuse it for both the plot and the table.
rolling_price_std = data['Adj Close'].rolling(ROLLING_WINDOW).std()
rolling_sentiment = data['sentiment'].rolling(ROLLING_WINDOW).mean()

# Twin-axis line plot: rolling price dispersion (left, red) against
# rolling mean sentiment (right, blue).
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

rolling_price_std.plot(ax=ax1, color='r')
rolling_sentiment.plot(ax=ax2, color='b')
ax1.set_ylabel('Rolling std of Adj Close', color='r')
ax2.set_ylabel('Rolling mean sentiment', color='b')

# Columns keep their source names: 'Adj Close' here is the rolling standard
# deviation of the price level and 'sentiment' is the rolling mean sentiment.
# Rows with any NaN (window warm-up, dates without a log return) are dropped.
vol = pd.concat(
    [rolling_price_std, rolling_sentiment, data['log_ret']],
    axis=1,
).dropna()

display(vol.corr())

# get p-value
# The result used to be silently discarded because this call was not the last
# expression in the cell; print it explicitly.
# NOTE(review): both inputs are overlapping rolling statistics, so consecutive
# observations are strongly dependent; treat this p-value with caution.
pearson_r, p_value = pearsonr(vol['Adj Close'], vol['sentiment'])
print(f'Pearson r = {pearson_r:.4f}, p-value = {p_value:.4g}')

vol.plot(kind='scatter', x='log_ret', y='sentiment', c='Adj Close', cmap='coolwarm', alpha=0.7, figsize=(12, 6))
