In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import eikon as ek
import scienceplots
import seaborn as sns

import os
import sys
from dotenv import load_dotenv

# --- Environment and global configuration -----------------------------------
# Load variables from a local .env file (if any) into the process environment,
# then read the two settings this notebook needs from there.
load_dotenv()
ek.set_app_key(os.getenv('EIKON_API_KEY'))

# REPO_PATH is concatenated directly with sub-paths below (no separator is
# inserted), so it is expected to end with a path separator.
REPO_PATH = os.getenv('REPO_PATH')
if REPO_PATH is None:
    # Without this guard the f-strings below would silently build paths such
    # as 'Nonesrc_HF' and the failure would only surface later as an opaque
    # import or file-not-found error.
    raise RuntimeError(
        "Environment variable REPO_PATH is not set; define it in the "
        "environment or in the .env file before running this notebook."
    )

plt.style.use('science')

# NOTE(review): this silences *all* warnings for the session, including
# deprecation warnings from pandas / seaborn / matplotlib.
warnings.filterwarnings("ignore")

# Make the project's `utils` package importable.
sys.path.insert(0, rf'{REPO_PATH}src_HF')
from utils.main_utils import load_json, combload_topic_dfs

### Load and combine news data, and remove duplicates

In [None]:
# Topic tags whose news files are loaded and combined.
TOPICS: list[str] = ['CRU', 'CWP', 'CEN']

# Plot settings read from the repository's JSON configuration file.
CONFIG: dict[str, str] = load_json(rf'{REPO_PATH}src_HF\plot_config.json')


def url(topic: str) -> str:
    """Return the path of the complete news JSON file for ``topic``.

    Args:
        topic: Topic tag, e.g. one of ``TOPICS``.

    Returns:
        ``REPO_PATH`` followed by the location of that topic's news file.
    """
    return rf'{REPO_PATH}data\news_data\EIKON_{topic}_NEWS_COMPLETE.json'


# Presumably loads every topic's file, concatenates them and drops duplicates
# (per the section heading) -- confirm in utils.main_utils.
df = combload_topic_dfs(TOPICS, url)

display(df.head(2))

### News frequency

In [None]:
# Weekly number of articles per topic, drawn as a stacked bar chart.
fig, ax = plt.subplots(figsize=(10, 4), dpi=200)

# Work on a date-indexed, chronologically sorted copy so `df` stays untouched.
ti_df = df.copy()
ti_df.index = pd.to_datetime(ti_df['date'])
ti_df = ti_df.sort_index()
ti_df['count'] = 1  # one row == one article

# One weekly count series per topic, in order of first appearance. `keys`
# labels the resulting columns, so the topic list is computed only once.
topics = list(ti_df['topic'].unique())
freq_df = pd.concat(
    [
        # 'W' is the canonical weekly alias; the lowercase spelling is
        # deprecated in newer pandas releases.
        ti_df.loc[ti_df['topic'] == topic, 'count'].resample('W').sum()
        for topic in topics
    ],
    axis=1,
    keys=topics,
)

freq_df.plot(
    kind='bar',
    # one twilight-palette colour per topic
    color=sns.color_palette('twilight', n_colors=len(freq_df.columns)),
    stacked=True,
    width=0.8,
    ax=ax,
)

ax.legend(
    title='Topic tag:',
    fontsize=13,
    title_fontsize=14,
    loc='upper left',
    ncols=3,
    fancybox=True,
    frameon=False,
)

# Bar plots place one tick per bar; keep only every third week and label it
# with year-month to avoid overlapping labels.
tick_step = 3
tick_positions = np.arange(0, len(freq_df), tick_step)
ax.set_xticks(tick_positions)
ax.set_xticklabels(freq_df.index[tick_positions].strftime('%Y-%m'), rotation=45)

# Leave headroom above the tallest bar so the legend does not overlap the data.
ax.set_ylim(top=ax.get_ylim()[1] * 1.3)
ax.set_xlabel('Date', fontsize=15)
ax.set_ylabel('Number of articles', fontsize=15)

# Horizontal gridlines, drawn behind the bars.
ax.yaxis.grid(True, alpha=0.4)
ax.set_axisbelow(True)

# Hide major and minor tick marks on the x axis (labels are kept).
ax.tick_params(axis='x', which='both', bottom=False, top=False)

fig.savefig('images/news_freq.png', bbox_inches='tight')

### Seasonality of news frequency

In [None]:

# Distribution of article counts by weekday (left) and by hour of day (right).
count_df = df.copy()
# `resample(on='date')` needs a datetime column; this is a no-op when the
# column is already datetime and mirrors the conversion done for the weekly plot.
count_df['date'] = pd.to_datetime(count_df['date'])
count_df['count'] = 1  # one row == one article

# Articles per calendar day / per hour. Only `count` is aggregated: summing the
# whole frame would also "sum" the non-numeric columns.
daily_df = count_df.resample('D', on='date')[['count']].sum()
hourly_df = count_df.resample('h', on='date')[['count']].sum()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), dpi=200)

# pandas `dayofweek` encoding: Monday=0 ... Sunday=6.
dayofweek_names = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday',
}

# NOTE(review): newer seaborn releases warn about `palette` without `hue`; the
# notebook's global warning filter hides that -- confirm against the installed
# seaborn version.
sns.boxplot(
    data=daily_df,
    x=daily_df.index.dayofweek,
    y='count',
    # Pin all seven categories in Monday..Sunday order so the hard-coded tick
    # labels below always line up with the boxes.
    order=list(dayofweek_names),
    ax=ax1,
    palette='twilight',
)

sns.boxplot(
    data=hourly_df,
    x=hourly_df.index.hour,
    y='count',
    ax=ax2,
    palette='twilight',
)

ax1.set_xticklabels(list(dayofweek_names.values()), rotation=30)  # weekday names instead of 0..6
ax1.tick_params(axis='both', which='major', labelsize=11)  # tick label size
ax1.xaxis.set_ticks_position('none')  # hide the x tick marks

label_size = 16

ax1.set_xlabel('Day of the week', fontsize=label_size)
ax1.set_ylabel('Number of articles', fontsize=label_size)

ax2.set_xlabel('Hour of day at UTC+0', fontsize=label_size)
ax2.set_ylabel('Number of articles', fontsize=label_size)

ax2.xaxis.labelpad = 30  # push the x label down a little

fig.savefig(
    rf'{REPO_PATH}src_HF\2 Data Cleaning and EDA\images\news_weekly.png',
    bbox_inches='tight',
)