In [None]:
import pandas as pd

import os
import sys
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment (python-dotenv).
load_dotenv()
# REPO_PATH is concatenated directly with sub-paths (no separator is inserted),
# so its value must already end with a path separator. os.getenv returns None
# when the variable is unset, which would yield the literal path 'Nonesrc' below.
REPO_PATH =  os.getenv('REPO_PATH')
# Put '<REPO_PATH>src' first on the import path; presumably this is where the
# `utils` package imported below lives.
sys.path.insert(0, rf'{REPO_PATH}src')

from utils.main_utils import load_json

### Merge headlines with stories

In [None]:
# --- Configuration -----------------------------------------------------
# Topic code interpolated into every input/output file name of this notebook.
TOPIC = 'CEN'

# Only headlines whose 'date' compares greater than this value are kept.
CUTOFF_DATE = '2023-04-12'

# --- Raw inputs ----------------------------------------------------------
# Paths are built by plain concatenation onto REPO_PATH and use backslash
# separators.
headlines_path = rf'{REPO_PATH}data\raw_news_headlines\EIKON_{TOPIC}_NEWS.csv'
stories_path = rf'{REPO_PATH}data\raw_news_stories\EIKON_{TOPIC}_NEWS_FULL.json'

# Headline table; later steps read its 'storyId' and 'date' columns.
text_df = pd.read_csv(headlines_path)

# Lookup used to map each 'storyId' to its full story text.
story_dict = load_json(stories_path)

def remove_nan(df, stories=None) -> pd.DataFrame:
    """Attach the full story text to each headline and drop rows without one.

    Args:
        df: Frame with a 'storyId' column.
        stories: Mapping from story id to story text. When omitted, the
            module-level ``story_dict`` is used.

    Returns:
        A new frame with an added 'fullStory' column, restricted to rows
        whose 'storyId' was found in the mapping (ids that are missing map
        to NaN and are dropped). The input frame is left untouched.
    """
    if stories is None:
        stories = story_dict
    # `assign` builds a new frame rather than writing a column onto `df`,
    # which may itself be a filtered slice of another frame (this avoids
    # SettingWithCopyWarning and hidden mutation of the caller's data).
    with_story = df.assign(fullStory=df['storyId'].map(stories))
    return with_story.dropna(subset=['fullStory'])

def remove_float(df) -> pd.DataFrame:
    """Keep only rows whose 'fullStory' is a string, then sort them by date.

    Args:
        df: Frame with 'fullStory' and 'date' columns.

    Returns:
        A new frame containing only the rows whose 'fullStory' value is a
        ``str``, with 'date' converted via ``pd.to_datetime``, sorted by
        'date' ascending and re-indexed from 0. The input frame is not
        modified.
    """
    # The mask result must actually be applied: previously the filtered
    # frame was computed and thrown away, so non-string stories survived.
    # astype(bool) keeps the mask a boolean indexer even for an empty frame.
    is_text = df['fullStory'].map(lambda value: isinstance(value, str)).astype(bool)
    return (
        df[is_text]
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .sort_values(by='date')
        .reset_index(drop=True)
    )

def drop_duplicates(df) -> pd.DataFrame:
    """Report and remove rows that repeat an earlier 'fullStory' value.

    Prints how many rows are duplicates, then returns a frame in which only
    the first occurrence of each 'fullStory' value remains. The original
    index labels of the surviving rows are preserved.
    """
    n_repeats = df['fullStory'].duplicated().sum()
    print(f'Duplicates removed: {n_repeats}')
    deduplicated = df.drop_duplicates(subset='fullStory')
    return deduplicated

# Cleaning stages, run in insertion order. Each key is the label printed
# together with the number of rows left once that stage has been applied.
operations = {
    'Stories after cutoff': lambda df: df[df['date'] > CUTOFF_DATE],
    'Stories after removing NaN': remove_nan,
    'Stories after removing error': lambda df: df[df['fullStory'] != 'error'],
    'Stories after removing float': remove_float,
    'Stories after removing duplicates': drop_duplicates,
}

# Feed the output of each stage into the next and log the surviving row count.
for label, stage in operations.items():
    text_df = stage(text_df)
    print(f'{label}: {len(text_df)}')

# Renumber rows 0..n-1 after all the filtering, then preview the result.
text_df = text_df.reset_index(drop=True)
display(text_df.head())


### Save data

In [None]:
# Write the cleaned stories as line-delimited JSON: one record (row) per line.
output_path = rf'{REPO_PATH}data\news_data\EIKON_{TOPIC}_NEWS_COMPLETE.json'
text_df.to_json(output_path, orient='records', lines=True)

print(f'Saved {len(text_df)} unique stories to json.')