In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Root of the repository; every data and source path below is built from it.
REPO_PATH = os.getenv('REPO_PATH')
if not REPO_PATH:
    # Fail early with a clear message: without this check a missing variable
    # would be formatted into paths as the literal text "None".
    raise EnvironmentError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# NOTE(review): the 'science' style is presumably registered by the
# `scienceplots` import above -- confirm if that import is ever removed.
plt.style.use('science')

# Make the project's helper package (src_HF/utils) importable. os.path.join
# copes with REPO_PATH values both with and without a trailing separator.
sys.path.insert(0, os.path.join(REPO_PATH, 'src_HF'))
from utils.text_utils import clean_token_series, create_wordcloud, IGNORE_WORDS
from utils.sentiment_utils import add_textblob_polarity, add_vader_compound

### Import data

In [None]:
# Topic code used to select the news dump and to label saved figures.
TOPIC = 'CRU'

# Build the path with os.path.join so it works on every OS (the previous
# hard-coded backslashes only resolved on Windows) and regardless of whether
# REPO_PATH ends with a path separator.
news_path = os.path.join(
    REPO_PATH, 'data', 'news_data', f'EIKON_{TOPIC}_NEWS_COMPLETE.json'
)

# The file is read as JSON Lines: one record per line.
text_df = pd.read_json(
    news_path,
    lines=True,
    orient='records'
)

# clean_token_series yields two series for the story text: the tokens and a
# cleaned version of them (exact cleaning rules live in utils.text_utils).
text_df['tokenized'], text_df['tokenized_cleaned'] = clean_token_series(text_df['fullStory'])

display(text_df)

### Create a word dataframe

In [None]:
# Flatten the per-article token lists into one long series of words.
word_series = text_df['tokenized_cleaned'].explode()

# Drop words on the project's ignore list.
word_series = word_series[~word_series.isin(IGNORE_WORDS)]

# One row per distinct word with its number of occurrences.
# Naming the index and the counts explicitly keeps the column layout
# ('word', 'count') independent of the pandas version: value_counts() labels
# its result differently before and after pandas 2.0, and a rename keyed on
# 'tokenized_cleaned' would label the *counts* column 'word' on older versions.
word_df = (
    word_series
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)

# Score every distinct word with both sentiment scorers.
word_df['vader'] = add_vader_compound(word_df['word'])
word_df['textblob'] = add_textblob_polarity(word_df['word'])

display(word_df)

In [None]:
# Which score column of word_df drives the word clouds ('textblob' or 'vader').
SENTIMENT_ID = 'textblob'

scores = word_df[SENTIMENT_ID]

# Sentiment bands, in the left-to-right order of the figure panels.
# Scores of exactly -0.5 / +0.5 belong to the strong bands so that every
# non-neutral word lands in exactly one band; with strict inequalities on both
# sides of the threshold those boundary scores matched no band and the words
# were silently left out of the figure and the printed shares.
conditions = [
    scores <= -0.5,
    (scores > -0.5) & (scores < 0.0),
    (scores > 0.0) & (scores < 0.5),
    scores >= 0.5,
]

condition_names = [
    'Negative sentiment',
    'Slightly Negative sentiment',
    'Slightly Positive sentiment',
    'Positive sentiment'
]

# Total number of distinct words; denominator for all printed shares.
N = len(word_df)

# Neutral words (score exactly 0) get no panel, only a summary line.
n = int((scores == 0.0).sum())
print(f'Neutral sentiment: {n}, ({n/N:.2%})')

fig, axs = plt.subplots(1, 4, facecolor=None, figsize=(12, 4), dpi=200)

# One word cloud per band; panels are intentionally left untitled.
for ax, mask, name in zip(axs.flatten(), conditions, condition_names):
    filtered_word_series = word_df.loc[mask, 'word']
    n = len(filtered_word_series)
    print(f'{name}: {n}, ({n/N:.2%})')
    wordcloud = create_wordcloud(filtered_word_series, height=700)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")

fig.tight_layout(pad=0)

# Portable output path (the previous backslash path only worked on Windows);
# create the folder so a fresh checkout can run the notebook end to end.
os.makedirs('images', exist_ok=True)
fig.savefig(os.path.join('images', f'{SENTIMENT_ID}_{TOPIC}_word_sentiment_wordcloud.png'))

In [None]:
# Number of distinct words sharing each (VADER, TextBlob) score pair.
# size() counts the rows of every group directly, so word_df is not mutated
# with a helper column and the text column is not "summed" (string-joined)
# as a side effect of a blanket .sum().
word_df_grouped = (
    word_df
    .groupby(['vader', 'textblob'])
    .size()
    .reset_index(name='word_count')
)

# Drop the point where both scorers are neutral (both scores equal 0).
is_non_neutral = (word_df_grouped['vader'] != 0) | (word_df_grouped['textblob'] != 0)
word_df_grouped = word_df_grouped[is_non_neutral]

# DataFrame.plot.scatter returns a matplotlib Axes (not a Figure).
ax = word_df_grouped.plot.scatter(
    x='vader',
    y='textblob',
    figsize=(10, 7),
    c='word_count',
    s='word_count',
    colormap='twilight_shifted',
)

# Increase label sizes.
ax.set_xlabel('VADER', fontsize=15)
ax.set_ylabel('TextBlob', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.grid(alpha=0.3)

# The colour bar is attached to the scatter collection drawn by pandas.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)
cbar.set_label('Word count', fontsize=15)

# Portable output path (the previous backslash path only worked on Windows);
# create the folder so a fresh checkout can run the notebook end to end.
os.makedirs('images', exist_ok=True)
ax.figure.savefig(os.path.join('images', 'sentiment_comparison.png'))

### Headline and story length distribution

In [None]:
def _alnum_tokens(tokens):
    """Return, in order, the tokens for which ``str.isalnum()`` is true."""
    return [token for token in tokens if token.isalnum()]


# Tokenise the 'text' column (presumably the headline -- confirm against the
# data schema) the same way the full story was tokenised earlier.
headline_tokens, headline_tokens_cleaned = clean_token_series(text_df['text'])
text_df['tokenized_h'] = headline_tokens
text_df['tokenized_cleaned_h'] = headline_tokens_cleaned

# Keep only purely alphanumeric tokens, which removes punctuation tokens from
# both the headline and the full-story token lists.
text_df['tokenized_h_words'] = text_df['tokenized_h'].apply(_alnum_tokens)
text_df['tokenized_words'] = text_df['tokenized'].apply(_alnum_tokens)

In [None]:
# Upper limits (exclusive) on the word counts shown, so a few very long items
# do not stretch the histograms.
MAX_HEADLINE_WORDS = 80
MAX_STORY_WORDS = 1500

word_counts_headline = text_df['tokenized_h_words'].apply(len)
word_counts_story = text_df['tokenized_words'].apply(len)

# Apply the limits independently to each series.
word_counts_headline = word_counts_headline[word_counts_headline < MAX_HEADLINE_WORDS]
word_counts_story = word_counts_story[word_counts_story < MAX_STORY_WORDS]

# The two filtered series are aligned on the article index; an article that
# passed only one of the limits gets NaN in the other column.
wc_df = pd.DataFrame({
    'headline': word_counts_headline,
    'story': word_counts_story
})

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=200)

# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
colormap = plt.get_cmap('twilight', 256)
palette = [colormap(i) for i in (70, 180)]

# (axes, wc_df column, x-label prefix, bar colour) for each panel.
panels = (
    (ax1, 'headline', 'Headline ', palette[0]),
    (ax2, 'story', 'Full text ', palette[1]),
)

for ax, column, label, color in panels:
    wc_df[column].plot(
        kind='hist',
        bins=75,
        color=color,
        edgecolor='black',
        ax=ax,
    )
    ax.set_xlabel(label + 'word count', fontsize=13)
    ax.set_ylabel('Frequency', fontsize=13)
    ax.grid(alpha=0.3)
    ax.set_axisbelow(True)  # draw the grid behind the bars

# Create the output folder so a fresh checkout can run the notebook end to end.
os.makedirs('images', exist_ok=True)
fig.savefig('images/news_word_count_dist.png', bbox_inches='tight')