In [1]:
import pandas as pd
import numpy as np
from dateutil import parser

# News Data Processing

In [2]:
def convert_to_month_day_year(datetime_str):
    """Reformat a date/time value as a ``Month DD, YYYY`` string.

    The argument is coerced to ``str`` and handed to ``dateutil``'s parser,
    so any layout that parser understands is accepted. Only the calendar
    date is kept in the result; any time-of-day component is discarded by
    the output format.

    Args:
        datetime_str: Value whose string form is a parseable date/time.

    Returns:
        str: The date rendered with ``'%B %d, %Y'`` (month name, two-digit
        day, four-digit year), e.g. ``"July 17, 2020"``.

    Errors raised by the parser for unparseable input are not caught here
    and propagate to the caller.
    """
    return parser.parse(str(datetime_str)).strftime('%B %d, %Y')

In [3]:
# Load the CNBC headlines, keeping only the headline text and its timestamp,
# and discard every row in which either field is missing.
cnbc_df = pd.read_csv('cnbc_headlines.csv', usecols=['Headlines', 'Time']).dropna()
print(cnbc_df.shape)

# Rewrite each timestamp into the "Month DD, YYYY" form produced by
# convert_to_month_day_year.
cnbc_df = cnbc_df.assign(Time=cnbc_df['Time'].apply(convert_to_month_day_year))
cnbc_df.head(2)


(2800, 2)




Unnamed: 0,Headlines,Time
0,Jim Cramer: A better way to invest in the Covi...,"July 17, 2020"
1,Cramer's lightning round: I would own Teradyne,"July 17, 2020"


In [4]:
# Load the Guardian headlines. The read is restricted to the same two columns
# the CNBC and Reuters loaders request, so an unexpected extra column in the
# CSV can neither cause dropna() to discard rows over an irrelevant field nor
# leak into the combined frame built later.
guardian_df = pd.read_csv('guardian_headlines.csv', usecols=['Headlines', 'Time'])
# Drop rows with a missing headline or timestamp (reassigned rather than
# mutated in place so re-running the cell starts from the freshly read data).
guardian_df = guardian_df.dropna()
print(guardian_df.shape)

# Rewrite each timestamp into the "Month DD, YYYY" form produced by
# convert_to_month_day_year.
guardian_df['Time'] = guardian_df['Time'].apply(convert_to_month_day_year)
guardian_df.head(2)

(17800, 2)


Unnamed: 0,Time,Headlines
0,"July 18, 2020",Johnson is asking Santa for a Christmas recovery
1,"July 18, 2020",‘I now fear the worst’: four grim tales of wor...


In [5]:
# Read the Reuters headlines (headline text + timestamp only).
reuters_raw = pd.read_csv('reuters_headlines.csv', usecols=['Headlines', 'Time'])

# Keep only complete rows and report how many remain.
reuters_df = reuters_raw.dropna()
print(reuters_df.shape)

# Rewrite each timestamp into the "Month DD, YYYY" form produced by
# convert_to_month_day_year; assign() returns a new frame with 'Time' replaced.
reuters_df = reuters_df.assign(
    Time=reuters_df['Time'].apply(convert_to_month_day_year)
)
reuters_df.head(2)

(32770, 2)


Unnamed: 0,Headlines,Time
0,TikTok considers London and other locations fo...,"July 18, 2020"
1,Disney cuts ad spending on Facebook amid growi...,"July 18, 2020"


In [6]:
import os
import json 


# Folders that hold the JSON news files; every '*.json' file found in them
# contributes one parsed entry to `data`.
folder_paths = ['./2018_1', './2018_2', './2018_3', './2018_4', './2018_5']

data = []

for folder_path in folder_paths:
    for filename in os.listdir(folder_path):
        # Skip anything that is not a JSON file.
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        # Pin the encoding: without it open() falls back to the platform's
        # locale default, which can fail on or garble non-ASCII text.
        # NOTE(review): assumes the files are UTF-8, the standard JSON
        # interchange encoding - confirm against the data source.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            json_data = json.load(json_file)
        data.append(json_data)

# json_normalize flattens nested keys into dotted column names (hence
# 'thread.published'). Keep only the title and publication time and drop
# incomplete rows in one chained expression instead of mutating a
# column-subset frame in place.
json_news_df = pd.json_normalize(data)[['title', 'thread.published']].dropna()
print(json_news_df.shape)

(306242, 2)


In [8]:
# Align the JSON-derived frame with the CSV sources: adopt the shared
# 'Headlines' / 'Time' column names, then rewrite the timestamps into the
# "Month DD, YYYY" form produced by convert_to_month_day_year.
json_news_df = json_news_df.rename(
    columns={'title': 'Headlines', 'thread.published': 'Time'}
)
json_news_df['Time'] = json_news_df['Time'].apply(convert_to_month_day_year)
json_news_df.head(2)

Unnamed: 0,Headlines,Time
0,EMERGING MARKETS-Mexican peso seesaws over dol...,"January 26, 2018"
1,"Migrants must visit Nazi concentration camps, ...","January 10, 2018"


In [18]:
# Combine all the dfs
# The frames are stacked row-wise and matched on column name, so the differing
# column order between sources does not matter. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every source keeps its own labels
# and the same label would refer to several unrelated rows.
news_df = pd.concat(
    [json_news_df, reuters_df, guardian_df, cnbc_df],
    axis=0,
    ignore_index=True,
)
print(news_df.shape)
news_df.head(2)

(359612, 2)


Unnamed: 0,Headlines,Time
0,EMERGING MARKETS-Mexican peso seesaws over dol...,"January 26, 2018"
1,"Migrants must visit Nazi concentration camps, ...","January 10, 2018"


In [19]:
# Drop rows that are exact repeats of an earlier row (same headline and same
# date), keeping the first occurrence, and report how many were removed.
before_rows = len(news_df)
news_df = news_df.drop_duplicates()
after_rows = len(news_df)

print(f"Removed {before_rows - after_rows} rows as duplicates. There are now {after_rows} rows.")

Removed 39074 rows as duplicates. There are now 320538 rows.
