In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment,
# then read the repository root prefix that every path in this notebook uses.
load_dotenv()
REPO_PATH = os.getenv("REPO_PATH")
if REPO_PATH is None:
    # Fail here with a clear message: otherwise the f-string paths below would
    # silently contain the literal text 'None' and fail later in confusing ways.
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# The 'science' style is expected to be registered by the scienceplots import.
plt.style.use('science')

# Import main utility functions
# os.path.join yields the same path as plain concatenation when REPO_PATH ends
# with a separator, and still works when it does not.
sys.path.insert(0, os.path.join(REPO_PATH, 'src'))
from utils.main_utils import combload_topic_dfs

### Load data

In [None]:
# Topic codes passed to the loader; each one maps to its own sentiment CSV.
TOPICS = ['CRU', 'CWP', 'CEN']

# combload_topic_dfs receives the topic list and a callable that maps a topic
# code to its CSV path.
# NOTE(review): presumably it loads one frame per topic and concatenates them
# with a 'topic' column - confirm in utils.main_utils.
df = combload_topic_dfs(
    TOPICS,
    # Build the path with os.path.join instead of hard-coded backslashes so the
    # notebook is not tied to Windows path separators.
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'sentiment_data', f'{topic}_ARTICLE_SENTIMENT.csv'
    ),
)

# Quick sanity check: first rows and overall (rows, columns) shape.
display(df.head(2))
display(df.shape)

### Correlation between sentiment tool results

In [None]:
# Topic whose rows are selected for the plots in this cell; the distribution
# cell further down reuses TOPIC and sent_df.
TOPIC = 'CWP'

# .copy() makes sent_df an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning on a slice of df.
sent_df = df[df['topic'] == TOPIC].copy()

# (x column, y column) for each scatter plot. The pairs fill the 2x2 grid in
# row-major order, so the position comments follow that order.
plot_pairs = [
    ('TextBlob_headline', 'TextBlob_fullStory'),   # Top-left subplot
    ('VADER_headline', 'VADER_fullStory'),         # Top-right subplot
    ('TextBlob_headline', 'VADER_headline'),       # Bottom-left subplot
    ('TextBlob_fullStory', 'VADER_fullStory')      # Bottom-right subplot
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10), dpi=200)

# axs.flat walks the axes row by row, matching the order of plot_pairs.
for ax, (x_col, y_col) in zip(axs.flat, plot_pairs):
    sent_df.plot.scatter(x=x_col, y=y_col, alpha=0.5, ax=ax, s=3)

    # Annotate the correlation coefficient (Series.corr default method) in the
    # top-left corner, positioned in axes-relative coordinates.
    corr = sent_df[x_col].corr(sent_df[y_col])
    ax.text(
        0.05, 0.95, f'Corr: {corr:.2f}',
        transform=ax.transAxes,
        fontsize=14, verticalalignment='top',
        bbox=dict(facecolor='white', alpha=1)
    )

    # Larger tick and axis labels; underscores in column names become spaces.
    ax.tick_params(axis='both', which='major', labelsize=14)
    ax.set_xlabel(x_col.replace('_', ' '), fontsize=16)
    ax.set_ylabel(y_col.replace('_', ' '), fontsize=16)


fig.tight_layout(pad=3.0)
# One PNG per topic; no dpi is passed here, so the save uses the default.
fig.savefig(
    os.path.join(
        REPO_PATH, 'src_HF', '4 Sentiment Analysis', 'images',
        f'{TOPIC}_sentiment_correlation.png',
    )
)

### Distributions

In [None]:
# Overlaid histograms of the TextBlob, VADER and combined sentiment scores for
# the topic selected in the correlation cell (relies on TOPIC and sent_df).
fig, ax = plt.subplots(figsize=(9, 6))

# Equal-weight average of the two tools' scores. assign() returns a new frame
# instead of writing into a filtered slice of df, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
sent_df = sent_df.assign(
    compound=sent_df['VADER'] * 0.5 + sent_df['TextBlob'] * 0.5
)

# Plot order is kept (TextBlob, VADER, compound) so colours and overlay order
# are unchanged; each legend label equals its column name.
for score_col in ('TextBlob', 'VADER', 'compound'):
    sent_df[score_col].plot.hist(bins=100, alpha=0.5, label=score_col, ax=ax)

ax.legend(fontsize=14)
ax.set_xlabel('Sentiment score', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)

# The previous target was the bare '...\3 Sentiment Analysis\images' path with
# no file name, so the figure did not land inside the images folder. Save a
# topic-named PNG next to the correlation figure instead.
# NOTE(review): the folder number was aligned with the correlation cell
# ('4 Sentiment Analysis') - confirm this is the intended directory.
fig.savefig(
    os.path.join(
        REPO_PATH, 'src_HF', '4 Sentiment Analysis', 'images',
        f'{TOPIC}_sentiment_distribution.png',
    ),
    dpi=200,
    bbox_inches='tight',
)