In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scienceplots
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Root folder of the repository. Other cells build file paths with
# `repo_path + r'...'`, so the value is expected to end with a path separator.
repo_path = os.getenv('REPO_PATH')
if repo_path is None:
    # Fail here with an actionable message instead of an opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" further down.
    raise EnvironmentError(
        "Environment variable REPO_PATH is not set; define it in the .env file "
        "or in the shell before running this notebook."
    )

# The 'science' style is registered by importing `scienceplots`.
plt.style.use('science')

# Make the project's helper package importable. os.path.join works whether or
# not REPO_PATH carries a trailing separator.
sys.path.insert(0, os.path.join(repo_path, 'src_HF'))

# NOTE(review): star imports hide where names come from. The cells below rely on
# `clean_token_series`, `ignore_words` and `create_wordcloud`, which presumably
# come from these two modules - confirm and switch to explicit imports.
from utils.main_utils import *
from utils.text_utils import *

### Import data

In [None]:
# Location of the news dump (JSON Lines: one record per line). Built with
# os.path.join so the path resolves on any OS, not only where '\' is the
# path separator.
news_path = os.path.join(repo_path, 'data', 'news_data', 'EIKON_CRU_NEWS_COMPLETE.json')
text_df = pd.read_json(news_path, lines=True, orient='records')

# `clean_token_series` (project helper) returns two aligned results for the
# article text in 'fullStory'; they are stored as the raw and the cleaned
# token columns respectively.
text_df['tokenized'], text_df['tokenized_cleaned'] = clean_token_series(text_df['fullStory'])

display(text_df)

### Create a word dataframe

In [None]:
# One row per token: flatten the per-article lists of cleaned tokens and drop
# every token listed in `ignore_words`.
word_series = text_df['tokenized_cleaned'].explode()
word_series = word_series[~word_series.isin(ignore_words)]

# Frequency table with columns 'word' and 'count', most frequent word first.
# The column names are set explicitly instead of renaming whatever
# `value_counts()` + `reset_index()` happen to produce: pandas >= 2.0 names the
# result 'count' and the index after the source series, so renaming
# 'index' / 'tokenized_cleaned' would leave no 'word' column at all.
word_df = (
    word_series
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)

# Score every distinct word on its own with both sentiment tools.
sia = SentimentIntensityAnalyzer()
word_df['sia'] = word_df['word'].apply(lambda x: sia.polarity_scores(x)['compound'])

word_df['textblob'] = word_df['word'].apply(lambda x: TextBlob(x).sentiment.polarity)
display(word_df)

In [None]:
# Column of `word_df` holding the score used for bucketing ('sia' or 'textblob').
sentiment_id = 'sia'
scores = word_df[sentiment_id]

# Four sentiment bands. Scores in [-0.15, 0.15] are treated as neutral and
# belong to no band. The +/-0.5 edges are included in the "slightly" bands so
# that a word scoring exactly +/-0.5 is not silently dropped between two bands.
conditions = [
    scores < -0.5,
    (scores >= -0.5) & (scores < -0.15),
    (scores > 0.15) & (scores <= 0.5),
    scores > 0.5,
]

# Panel titles, in the same order as `conditions`.
condition_names = [
    'Negative sentiment',
    'Slightly Negative sentiment',
    'Slightly Positive sentiment',
    'Positive sentiment'
]

fig, axs = plt.subplots(2, 2, facecolor=None, figsize=(12, 12))

# One word cloud per band, filling the 2x2 grid row by row.
for ax, mask, title in zip(axs.flatten(), conditions, condition_names):
    filtered_word_series = word_df.loc[mask, 'word']
    # Number of distinct words that fall into this band.
    print(len(filtered_word_series))
    wordcloud = create_wordcloud(filtered_word_series)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(title, fontsize=19, loc='left', y=1.05)
    ax.axis("off")

fig.tight_layout(pad=4)

# Built with os.path.join so the target resolves on any OS.
fig.savefig(
    os.path.join(
        repo_path, 'src_HF', '2 Data Cleaning and EDA', 'images', 'sentiment_wordcloud.png'
    )
)

In [None]:
# Each row of `word_df` is one distinct word; a constant 1 lets a group-wise sum
# count how many words share a given pair of scores.
word_df['word_count'] = 1

# Number of words per (sia, textblob) score pair. `numeric_only=True` keeps the
# sum to the numeric columns; without it the string column 'word' is "summed"
# too, i.e. every word of a group is concatenated into one long string.
word_df_grouped = (
    word_df
    .groupby(['sia', 'textblob'])
    .sum(numeric_only=True)
    .reset_index()
)

# Drop the pair where both tools return 0 (no sentiment found by either).
is_scored = (word_df_grouped['sia'] != 0) | (word_df_grouped['textblob'] != 0)
word_df_grouped = word_df_grouped[is_scored]

# A bare expression in the middle of a cell renders nothing; display explicitly.
display(word_df_grouped)

# Scatter of the two scores; marker colour and marker size both encode the
# number of words at that point.
fig, ax = plt.subplots(figsize=(10, 7))
word_df_grouped.plot.scatter(
    x='sia',
    y='textblob',
    c='word_count',
    s='word_count',
    colormap='viridis',
    ax=ax,
)

# Increase label sizes.
ax.set_xlabel('SIA', fontsize=15)
ax.set_ylabel('TextBlob', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.grid(alpha=0.3)

# The colorbar is created by pandas; fetch it from the scatter collection to
# restyle it.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)
cbar.set_label('Word count', fontsize=15)

# Built with os.path.join so the target resolves on any OS.
fig.savefig(
    os.path.join(
        repo_path, 'src_HF', '2 Data Cleaning and EDA', 'images', 'sentiment_comparison.png'
    )
)