In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scienceplots

import os
import sys
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
REPO_PATH =  os.getenv('REPO_PATH')

# Fail early with a clear message: without this check a missing variable
# would be formatted as the literal text "None" inside every path below.
if not REPO_PATH:
    raise RuntimeError(
        "REPO_PATH is not set; define it in the environment or in a .env file."
    )

# Make the repository's `src` directory importable. os.path.join works
# whether or not REPO_PATH ends with a path separator.
sys.path.insert(0, os.path.join(REPO_PATH, 'src'))
from utils.main_utils import combload_topic_dfs, apply_nb_style
from utils.text_utils import clean_token_series, create_word_df
from utils.plot_utils import create_sent_wc

# Project-wide notebook styling, followed by the 'science' Matplotlib style
# (presumably registered by the `scienceplots` import above - confirm).
apply_nb_style()
plt.style.use('science')

### Import data

In [None]:
# Topic codes; each one selects a news file and gets its own word table.
TOPICS = ['CRU', 'CWP', 'CEN']

# Load the news for every topic. The second argument maps a topic code to the
# path of its JSON file; os.path.join keeps the path valid on any OS and does
# not depend on REPO_PATH ending with a separator.
text_df = combload_topic_dfs(
    TOPICS,
    lambda topic: os.path.join(
        REPO_PATH, 'data', 'news_data', f'EIKON_{topic}_NEWS_COMPLETE.json'
    ),
)

# With include_raw=True the helper returns two values, stored here as the
# 'tokenized' and 'tokenized_cleaned' columns for the full story text.
text_df['tokenized'], text_df['tokenized_cleaned'] = clean_token_series(
    text_df['fullStory'],
    include_raw=True
)

# One word-level table per topic, keyed by topic code.
word_dfs = {topic: create_word_df(text_df, topic) for topic in TOPICS}

### Sentiment wordcloud

In [None]:
# Absolute score cut-off separating negative / neutral / positive words.
SENT_THRESHOLD = 0.25

# One word-cloud figure per (analyzer, topic) pair, each saved as a PNG.
for analyzer in ['vader', 'textblob']:
    for topic in TOPICS:
        df = word_dfs[topic]
        scores = df[analyzer]

        # Negative, neutral and positive masks, in that order. The neutral
        # band is inclusive so that a score of exactly +/-SENT_THRESHOLD is
        # assigned to a bucket instead of matching none of the conditions.
        conditions = [
            scores < -SENT_THRESHOLD,
            (scores >= -SENT_THRESHOLD) & (scores <= SENT_THRESHOLD),
            scores > SENT_THRESHOLD,
        ]

        fig = create_sent_wc(df, topic, analyzer, conditions)

        # Forward slash works on every OS (and matches the other figure
        # paths in this notebook); a backslash is only a separator on Windows.
        fig.savefig(
            f'images/{analyzer}_{topic}_word_sentiment_wordcloud.png'
        )

In [None]:
# Stack the per-topic word tables into a single frame.
comb_df = pd.concat(word_dfs.values(), ignore_index=True)

# Every row counts as one word occurrence.
comb_df['word_count'] = 1

# Number of words per (vader, textblob) score pair. Only `word_count` is
# summed: aggregating every column would also try to "sum" the non-numeric
# ones, which is wasteful at best and an error at worst.
word_df_grouped = (
    comb_df
    .groupby(['vader', 'textblob'])['word_count']
    .sum()
    .reset_index()
)

# Drop the pair where both analyzers score exactly zero.
word_df_grouped = word_df_grouped[
    (word_df_grouped['vader'] != 0) | (word_df_grouped['textblob'] != 0)
]

# DataFrame.plot.scatter returns the Axes it drew on; marker size and colour
# both encode the word count.
ax = word_df_grouped.plot.scatter(
    x='vader',
    y='textblob',
    figsize=(10, 7),
    c='word_count',
    s='word_count',
    colormap='twilight_shifted',
)

ax.set_xlabel('VADER', fontsize=15)
ax.set_ylabel('TextBlob', fontsize=15)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.grid(alpha=0.3)

# The colorbar is attached to the scatter collection created above.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)
cbar.set_label('Word count', fontsize=15)

# Forward slash avoids the invalid '\s' escape of the previous literal and
# works on every OS.
fig = ax.figure
fig.savefig('images/sentiment_comparison.png')

### Headline and story length distribution

In [None]:
# Upper bounds (in words) used to trim the long tails before plotting.
HEADLINE_MAX_WORDS = 100
STORY_MAX_WORDS = 3000


def _alnum_tokens(tokens):
    """Return only the tokens made up entirely of alphanumeric characters."""
    return [word for word in tokens if word.isalnum()]


# Tokenize the 'text' column (the headlines, going by the variable names used
# below); include_raw=True yields both the raw and the cleaned token lists.
text_df['tokenized_h'], text_df['tokenized_cleaned_h'] = clean_token_series(
    text_df['text'],
    include_raw=True
)

# Keep alphanumeric tokens only, so the lengths below count words.
text_df['tokenized_h_words'] = text_df['tokenized_h'].apply(_alnum_tokens)
text_df['tokenized_words'] = text_df['tokenized'].apply(_alnum_tokens)

word_counts_headline = text_df['tokenized_h_words'].apply(len)
word_counts_story = text_df['tokenized_words'].apply(len)

wc_df = pd.DataFrame({
    'headline': word_counts_headline,
    'story': word_counts_story
})

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=200)

# plt.get_cmap replaces the deprecated (and since removed) plt.cm.get_cmap.
colormap = plt.get_cmap('twilight', 256)
palette = [colormap(i) for i in (70, 180)]

# One histogram per panel: (axes, column, cut-off, colour, x-label prefix).
panels = (
    (ax1, 'headline', HEADLINE_MAX_WORDS, palette[0], 'Headline '),
    (ax2, 'story', STORY_MAX_WORDS, palette[1], 'Full text '),
)

for ax, column, max_words, color, label in panels:
    # Only rows strictly below the cut-off are drawn.
    wc_df.loc[wc_df[column] < max_words, column].plot(
        kind='hist',
        bins=75,
        color=color,
        edgecolor='black',
        ax=ax,
    )
    ax.set_xlabel(label + 'word count', fontsize=13)
    ax.set_ylabel('Frequency', fontsize=13)
    ax.grid(alpha=0.3)
    ax.set_axisbelow(True)  # draw the grid behind the bars

fig.savefig('images/news_word_count_dist.png', bbox_inches='tight')