In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import scienceplots
import warnings

import os
import sys
from dotenv import load_dotenv

# Pull REPO_PATH (the repository root) from the local .env file / environment.
load_dotenv()
REPO_PATH = os.getenv("REPO_PATH")
if not REPO_PATH:
    # os.getenv returns None when the variable is unset; interpolating that
    # into a path would silently produce locations such as "Nonesrc".
    raise RuntimeError(
        "REPO_PATH is not set. Define it in your .env file (or environment) "
        "so that it points at the repository root."
    )

# The 'science' style is expected to be made available by the `scienceplots`
# import at the top of the notebook.
plt.style.use('science')
# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation notices from pandas/seaborn.
warnings.filterwarnings("ignore")

# Import main utility functions
# os.path.join works whether or not REPO_PATH ends with a path separator.
sys.path.insert(0, os.path.join(REPO_PATH, 'src'))
from utils.main_utils import combload_topic_dfs

### Load data

In [None]:
TOPICS = ['CRU', 'CWP', 'CEN']


def sentiment_csv_path(topic):
    """Return the path of the article-sentiment CSV for ``topic``.

    The path is assembled with ``os.path.join`` so that it is valid on every
    operating system (the previous hard-coded backslashes only worked on
    Windows) and regardless of whether ``REPO_PATH`` ends with a separator.

    Parameters
    ----------
    topic : str
        Topic code, e.g. ``'CRU'``.

    Returns
    -------
    str
        ``<REPO_PATH>/data/sentiment_data/<topic>_ARTICLE_SENTIMENT.csv``
    """
    return os.path.join(
        REPO_PATH, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    )


# combload_topic_dfs is a project helper; presumably it loads one CSV per
# topic through the path callable and combines them into a single frame --
# confirm against utils.main_utils.
df = combload_topic_dfs(TOPICS, sentiment_csv_path)

# Quick sanity check: first rows and overall shape.
display(df.head(2))
display(df.shape)

### Correlation between sentiment tool results

In [None]:
TOPIC = 'CRU'

# Two-colour palette; reused by the histograms in the "Distributions" cell.
colors = sns.color_palette('twilight', n_colors=2)

# Keep only the rows that belong to the selected topic.
sent_df = df[df['topic'] == TOPIC]

# (x column, y column) for each panel. Panels are filled in row-major order,
# so the position comments below follow the order of this list.
plot_pairs = [
    ('VADER_headline', 'VADER_fullStory'),        # Top-left subplot
    ('TextBlob_headline', 'TextBlob_fullStory'),  # Top-right subplot
    ('TextBlob_headline', 'VADER_headline'),      # Bottom-left subplot
    ('TextBlob_fullStory', 'VADER_fullStory'),    # Bottom-right subplot
]

fig, axs = plt.subplots(2, 2, figsize=(10, 6), dpi=200)

# axs.flat walks the 2x2 grid row by row, matching the order of plot_pairs.
for idx, (ax, (x_col, y_col)) in enumerate(zip(axs.flat, plot_pairs)):

    # Top row (same tool, headline vs. full story) uses the palette colours;
    # bottom row (tool vs. tool) is drawn in gray.
    sent_df.plot.scatter(
        x=x_col, y=y_col,
        alpha=0.5,
        ax=ax,
        s=3,
        color=colors[idx] if idx < 2 else 'darkgray'
    )

    # Correlation between the two score columns (Series.corr default method),
    # shown in a boxed label in the panel's top-left corner.
    corr = sent_df[x_col].corr(sent_df[y_col])
    ax.text(
        0.05, 0.95, f'Corr: {corr:.2f}',
        transform=ax.transAxes,
        fontsize=14, verticalalignment='top',
        bbox=dict(facecolor='white', alpha=1)
    )

    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.set_xlabel(x_col.replace('_', ' '), fontsize=16)
    ax.set_ylabel(y_col.replace('_', ' '), fontsize=16)


fig.tight_layout(pad=3.0)

# Build the output path portably: a literal backslash is not a directory
# separator outside Windows and would end up inside the file name there.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', f'{TOPIC}_sentiment_correlation.png'))

### Distributions

In [None]:
# Histograms of the four sentiment score columns for the current topic.
# Relies on `sent_df`, `colors` and `TOPIC` from the correlation cell above.
fig, axs = plt.subplots(2, 2, figsize=(10, 6), dpi=200)
axs = axs.flatten()

cols = ['VADER_headline', 'TextBlob_headline', 'VADER_fullStory', 'TextBlob_fullStory']

for i, col in enumerate(cols):
    # 100 bins over the fixed score range [-1, 1]; colours alternate between
    # the two palette entries (even index = VADER, odd index = TextBlob).
    sns.histplot(
        sent_df[col],
        ax=axs[i],
        label=col,
        binrange=(-1, 1),
        bins=100,
        stat='frequency',
        color=colors[i % 2]
    )
    axs[i].set_xlabel('Sentiment score', fontsize=14)
    # The plotted statistic is stat='frequency' (count divided by bin width),
    # not a normalised density, so the axis is labelled accordingly.
    axs[i].set_ylabel('Frequency', fontsize=14)

    # Write the column name (tool and text part) in the top-left corner
    axs[i].text(
        0.08, 0.9, col.replace('_', ' '),
        transform=axs[i].transAxes,
        fontsize=14, verticalalignment='center',
    )

    # Add the mean score just below the column name
    axs[i].text(
        0.08, 0.8, f'Mean: {sent_df[col].mean():.3f}',
        transform=axs[i].transAxes,
        fontsize=14, verticalalignment='center'
    )

fig.tight_layout()

# Explicit .png extension keeps this consistent with the correlation figure
# instead of depending on matplotlib's default save format.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', f'{TOPIC}_sentiment_distributions.png'))