In [None]:
import eikon as ek
import pandas as pd
import datetime as dt
from textblob import TextBlob
from tqdm import tqdm
from bs4 import BeautifulSoup

import os
import sys
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can see them.
load_dotenv()
# The Eikon app key is read from the environment rather than hard-coded.
ek.set_app_key(os.getenv('EIKON_API_KEY'))
# Root folder of the repository. It is joined to sub-paths by plain string
# concatenation (see the sys.path line below), so the value has to end with a
# path separator. os.getenv() returns None when REPO_PATH is unset, in which
# case that concatenation raises TypeError.
repo_path = os.getenv('REPO_PATH')

# Put <repo_path>src_HF first on the import path; presumably this is where the
# `utils` package imported on the next line lives - TODO confirm.
sys.path.insert(0, repo_path + r'src_HF')
# NOTE(review): the wildcard import hides which helper names this notebook
# actually takes from utils.main_utils.
from utils.main_utils import *

def get_story_text(id: str, text_dict: dict[str, str]) -> None:
    """Download one news story and store its plain text in ``text_dict``.

    The story is fetched with ``ek.get_news_story``, parsed with
    BeautifulSoup's ``html.parser`` and reduced to its text content via
    ``get_text()``.

    Parameters
    ----------
    id : str
        Story id to fetch. The name shadows the built-in ``id`` inside this
        function; it is kept so that keyword callers keep working.
    text_dict : dict[str, str]
        Mapping of story id -> story text. It is updated in place with the
        entry for ``id``.

    Returns
    -------
    None
        The result is delivered through ``text_dict``.

    Raises
    ------
    Any exception raised by ``ek.get_news_story`` propagates to the caller
    unchanged; nothing is written to ``text_dict`` in that case.
    """
    story_html = ek.get_news_story(id)
    text_dict[id] = BeautifulSoup(story_html, 'html.parser').get_text()

In [None]:
headline_topic = 'CRU'

# Input (headlines) and output (accumulated full stories) files. Both are
# anchored at repo_path so the cell does not depend on the kernel's working
# directory, and os.path.join is used so a missing trailing separator on
# repo_path does not corrupt the path.
headline_path = os.path.join(repo_path, 'data', 'raw_news_headlines', f'EIKON_{headline_topic}_NEWS.csv')
story_path = os.path.join(repo_path, 'data', 'raw_news_stories', f'EIKON_{headline_topic}_NEWS_FULL.csv')

# load headlines
headline_df = pd.read_csv(headline_path)

# check if file exists, if not create it (header only)
if not os.path.exists(story_path):
    story_dir = os.path.dirname(story_path)
    if story_dir:
        # to_csv does not create missing parent folders
        os.makedirs(story_dir, exist_ok=True)
    blank_df = pd.DataFrame(columns=['date', 'versionCreated', 'text', 'storyId', 'sourceCode', 'fullStory'])
    blank_df.to_csv(story_path, index=False)

# load existing story data
existing_df = pd.read_csv(story_path)
existing_ids = existing_df['storyId']

# get all story ids not in the existing_ids series
storie_ids = headline_df[~headline_df['storyId'].isin(existing_ids)]['storyId']

print(f'Number of stories alredy downloaded/failed: ({len(existing_ids)}/{len(headline_df)})')

text_dict = dict()  # story id -> story text, or the sentinel 'error'
nexcept = 0         # number of stories that failed with a non-rate-limit error

# `story_id` instead of `id`: a module-level loop variable called `id` would
# replace the built-in id() for the rest of the kernel session.
for story_id in tqdm(storie_ids):
    try:
        get_story_text(story_id, text_dict)

    except ek.EikonError as e:
        # Detect the rate limit by error code rather than by the exact message
        # text; the string check is a fallback in case `code` is not an int.
        if getattr(e, 'code', None) == 429 or str(e).startswith('Error code 429'):
            print('Daily request limit reached')
            break  # Break if daily request limit is reached

        # Any other Eikon error: record a sentinel so the story is written to
        # the output file and not requested again on the next run.
        text_dict[story_id] = 'error'
        nexcept += 1


In [None]:
# One row per story fetched in this session, indexed by story id.
text_df = pd.DataFrame.from_dict(text_dict, orient='index', columns=['text'])

# Preview only the first rows instead of rendering the whole frame.
display(text_df.head())

# map the text to the headlines; headlines without a newly fetched story get
# NaN here and are dropped below, so only this session's rows are appended.
headline_df['fullStory'] = headline_df['storyId'].map(text_df['text'])
new_story_df = headline_df.dropna(subset=['fullStory'])
complete_df = pd.concat([existing_df, new_story_df])

# Overwrite the story file with previously stored rows plus the new ones.
# os.path.join keeps the path valid whether or not repo_path ends with a
# separator and avoids hard-coded backslashes.
complete_df.to_csv(
    os.path.join(repo_path, 'data', 'raw_news_stories', f'EIKON_{headline_topic}_NEWS_FULL.csv'),
    index=False,
)

In [None]:
# Reload the saved story file to check what was written. The path is derived
# from repo_path and headline_topic instead of a hard-coded, user-specific
# absolute path, so the cell works on other machines and for other topics.
pd.read_csv(os.path.join(repo_path, 'data', 'raw_news_stories', f'EIKON_{headline_topic}_NEWS_FULL.csv'))