In [None]:
# --- Imports: standard library, third-party, then project-local -------------
import os
import sys
import warnings

import eikon as ek
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
# pandas is used as `pd` throughout this notebook; import it explicitly rather
# than relying on it leaking in through the star import at the end of this cell.
import pandas as pd
from dotenv import load_dotenv

from plot_utils import plot_news_frequency

# --- Environment / configuration --------------------------------------------
# Read EIKON_API_KEY and REPO_PATH from a local .env file (if present).
load_dotenv()
ek.set_app_key(os.getenv('EIKON_API_KEY'))

repo_path = os.getenv('REPO_PATH')
if repo_path is None:
    # Fail early with a clear message instead of an opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" further down.
    raise RuntimeError(
        "Environment variable REPO_PATH is not set; define it in the "
        "environment or in a .env file before running this notebook."
    )

# NOTE: this silences *all* warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Make the project's `src_HF` package importable. os.path.join works whether
# or not REPO_PATH ends with a path separator.
sys.path.insert(0, os.path.join(repo_path, 'src_HF'))
from utils.main_utils import *

### Load and combine news data, and remove duplicates

In [None]:
# Raw headline exports, one CSV per topic code. Paths are built with
# os.path.join so they do not depend on Windows-style backslashes or on
# REPO_PATH ending with a separator.
raw_news_dir = os.path.join(repo_path, 'data', 'raw_news_headlines')
article_sources = {
    'CRU': os.path.join(raw_news_dir, 'EIKON_CRU_NEWS.csv'),
    'CWP': os.path.join(raw_news_dir, 'EIKON_CWP_NEWS.csv'),
    'CEN': os.path.join(raw_news_dir, 'EIKON_CEN_NEWS.csv'),
}

# Per-topic plot settings; read below via config[topic]['fullname'] / ['color'].
config = load_json(os.path.join(repo_path, 'src_HF', 'plot_config.json'))

# Read every source file and tag its rows with the topic code it came from.
df_list = [
    pd.read_csv(csv_path).assign(topic=topic)
    for topic, csv_path in article_sources.items()
]

df = pd.concat(df_list)

print('Number of duplicates: ' + str(df['storyId'].duplicated().sum()))

# De-duplicate on storyId, keeping the most recent version of each story:
# after sorting newest-first, drop_duplicates retains the first (newest) row.
# sort_index() then restores the original row-label order (labels repeat
# across source files because the frames were concatenated as-is).
df = (
    df.sort_values('versionCreated', ascending=False)
    .drop_duplicates('storyId')
    .sort_index()
)

display(df)

### News frequency

In [None]:

# Set the font *before* creating the figure so that text artists created
# together with the axes (tick and axis labels) pick it up as well, not only
# the ones created afterwards (e.g. the legend).
plt.rcParams['font.family'] = 'Arial'

fig, ax = plt.subplots(figsize=(10, 6), dpi=200)

# Index the articles by their version timestamp so they can be resampled.
ti_df = df.copy()
ti_df.index = pd.to_datetime(ti_df['versionCreated'])
ti_df = ti_df.sort_index()
ti_df['count'] = 1

# Compute the topic order once; it drives the column order, the column labels
# and the bar colours, which therefore stay aligned with each other.
topics = ti_df['topic'].unique()

# One monthly article count series per topic, joined side by side.
# NOTE(review): lowercase 'm' is a legacy spelling of the month-end alias;
# newer pandas releases prefer 'ME' - confirm against the pinned pandas version.
freq_df = pd.concat(
    [
        ti_df.loc[ti_df['topic'] == topic, 'count'].resample('m').sum()
        for topic in topics
    ],
    axis=1,
)
freq_df.columns = [config[topic]['fullname'] for topic in topics]

display(freq_df)

freq_df.plot(
    kind='bar',
    color=[config[topic]['color'] for topic in topics],
    stacked=True,
    ax=ax,
)

ax.legend(
    title='News Topic:',
    fontsize=11,
    title_fontsize=12,
    loc='upper left',
    ncols=3,
    fancybox=True,
    frameon=False,
)

# Bars are drawn at integer positions 0..n-1; label each with its year-month.
ax.set_xticks(np.arange(len(freq_df)))
ax.set_xticklabels(freq_df.index.strftime('%Y-%m'), rotation=45)

# Add 20% headroom so the legend does not overlap the tallest bars.
ax.set_ylim(top=ax.get_ylim()[1] * 1.2)
ax.set_xlabel('Date', fontsize=13)
ax.set_ylabel('Number of articles', fontsize=13)

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
fig.savefig('images/news_freq.png', bbox_inches='tight')

### Headline analysis

In [None]:

# Headlines with this many words or more are excluded from the histogram.
MAX_WORDS = 75

# Number of whitespace-separated tokens per headline.
word_counts = df['text'].str.split().str.len()

# Keep only headlines shorter than MAX_WORDS words.
word_counts = word_counts[word_counts < MAX_WORDS]

fig, ax = plt.subplots(figsize=(8, 5))

# Word counts are integers, so use unit-width bins with integer edges
# (0, 1, ..., MAX_WORDS). A plain `bins=75` spreads 75 bins over [min, max],
# which gives non-integer bin widths and spurious gaps in the bars.
word_counts.plot(
    kind='hist',
    bins=np.arange(MAX_WORDS + 1),
    color='crimson',
    edgecolor='black',
    ax=ax,
)

ax.set_xlabel('Word count', fontsize=13)
ax.set_ylabel('Frequency', fontsize=13)
ax.grid(alpha=0.3)
ax.set_axisbelow(True)  # draw the grid behind the bars

# savefig does not create missing directories.
os.makedirs('images', exist_ok=True)
fig.savefig('images/news_word_count_dist.png', bbox_inches='tight', dpi=150)


### News source

In [None]:
# Count how often each `sourceCode` value occurs (most frequent first) and
# keep the ten most frequent ones.
source_counts = df['sourceCode'].value_counts()
top10_sources = source_counts.iloc[:10]

print(top10_sources)

### Save combined news df

In [None]:
# Name of the output CSV file:
file_name = 'EIKON_ALL_NEWS.csv'
# Directory (relative path) the file is saved under:
relative_path = 'data/news'

# Resolve the destination with the project helper, then write the combined
# frame without its row index.
output_file = save_path(relative_path, file_name)
df.to_csv(output_file, index=False)