In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Pull REPO_PATH (and any other settings) from a local .env file into the environment
load_dotenv()
repo_path = os.getenv("REPO_PATH")
if not repo_path:
    # Fail early with a clear message instead of a `NoneType + str` TypeError
    # at the first path concatenation.
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )
# Later cells build paths as `repo_path + 'sub\dir\file'`, so guarantee that the
# root ends with a path separator (a no-op when it already does).
repo_path = os.path.join(repo_path, '')

# NOTE(review): the 'science' style is presumably registered by the
# `scienceplots` import above -- keep that import even though it looks unused.
plt.style.use('science')

# Import main utility functions
sys.path.insert(0, os.path.join(repo_path, 'src_HF'))
# NOTE(review): the wildcard import hides which helpers this notebook relies on;
# switch to explicit names once they are known.
from utils import *

### Load data

In [None]:
# Topic tag; it selects the input CSV here and names the figures saved below
topic = 'CWP'

# Load `<topic>_ARTICLE_SENTIMENT.csv` from data/sentiment_data.
# os.path.join is used instead of hard-coded backslashes so the path resolves
# on any operating system, not only on Windows.
sent_df = pd.read_csv(
    os.path.join(
        repo_path, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    )
)

# Quick look at the first two rows to confirm the load worked
display(sent_df.head(2))

In [None]:
# Scatter the full-story VADER score against the full-story FinBERT score
sent_df.plot.scatter(
    x='VADER_fullStory',
    y='FinBERT_fullStory',
    s=1,
    figsize=(10, 10),
)

# Correlation coefficient between the two score columns
# (Series.corr with its default method)
correlation = sent_df['VADER_fullStory'].corr(
    other=sent_df['FinBERT_fullStory']
)
print(f'Correlation between VADER and FinBERT: {correlation}')

### Correlation between sentiment tool results

In [None]:
# 2x2 grid of scatter plots comparing sentiment columns pairwise; each panel is
# annotated with the correlation coefficient of the two columns it shows.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# Column pairs in the order the grid is filled (row by row, left to right)
plot_pairs = [
    ('TextBlob_headline', 'TextBlob_fullStory'),   # Top-left subplot
    ('VADER_headline', 'VADER_fullStory'),         # Top-right subplot
    ('TextBlob_headline', 'VADER_headline'),       # Bottom-left subplot
    ('TextBlob_fullStory', 'VADER_fullStory')      # Bottom-right subplot
]

# `axs.flat` walks the axes row by row, matching the order of `plot_pairs`
for ax, (x_col, y_col) in zip(axs.flat, plot_pairs):
    sent_df.plot.scatter(x=x_col, y=y_col, alpha=0.5, ax=ax, s=3)

    # Write the correlation coefficient in the top-left corner of the panel
    corr = sent_df[x_col].corr(sent_df[y_col])
    ax.text(
        0.05, 0.95, f'Corr: {corr:.2f}',
        transform=ax.transAxes,  # position in axes fractions, not data units
        fontsize=14, verticalalignment='top',
        bbox=dict(facecolor='white', alpha=1)
    )

    ax.tick_params(axis='both', which='major', labelsize=14)
    # Show column names with spaces instead of underscores
    ax.set_xlabel(x_col.replace('_', ' '), fontsize=16)
    ax.set_ylabel(y_col.replace('_', ' '), fontsize=16)

fig.tight_layout(pad=3.0)
# os.path.join keeps the output path valid on any OS (the previous hard-coded
# backslashes only resolved on Windows).
fig.savefig(
    os.path.join(
        repo_path, 'src_HF', '4 Sentiment Analysis', 'images',
        f'{topic}_sentiment_correlation.png'
    ),
    dpi=200
)

### Distributions

In [None]:
# Overlaid histograms of the TextBlob score, the VADER score and their average.
# NOTE(review): `text_df` is not created in any cell visible in this notebook;
# unless it comes from the `utils` star-import, this cell raises NameError on a
# fresh kernel. Confirm where it should be loaded from (or port to `sent_df`).
fig, ax = plt.subplots(figsize=(9, 6))

text_df['TextBlob'].plot.hist(bins=100, alpha=0.5, label='TextBlob', ax=ax)
text_df['VADER'].plot.hist(bins=100, alpha=0.5, label='VADER', ax=ax)

# Equal-weight average of the two scores, stored as a new `compound` column
text_df['compound'] = text_df['VADER'] * 0.5 + text_df['TextBlob'] * 0.5

text_df['compound'].plot.hist(bins=100, alpha=0.5, label='compound', ax=ax)

ax.legend(fontsize=14)
ax.set_xlabel('Sentiment score', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)

# Save to a named file inside the images folder; the previous target was the
# folder path itself, with no file name.
# NOTE(review): this folder is '3 Sentiment Analysis' while the correlation
# figure is saved under '4 Sentiment Analysis' -- confirm which one is current.
fig.savefig(
    os.path.join(
        repo_path, 'src_HF', '3 Sentiment Analysis', 'images',
        f'{topic}_sentiment_distribution.png'
    ),
    dpi=200, bbox_inches='tight'
)