In [27]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from dateutil import parser

# News Data Processing

In [26]:
def convert_to_month_day_year(datetime_str, reverse_date=False):
    """Format a date/time value as a ``'%B %d, %Y'`` string (e.g. "July 18, 2020").

    Args:
        datetime_str: Value holding a date or timestamp. It is passed through
            ``str()`` first and then handed to ``dateutil.parser.parse``.
        reverse_date: Accepted for call compatibility; this implementation
            does not read it.

    Returns:
        The parsed date rendered with ``strftime('%B %d, %Y')``.
    """
    return parser.parse(str(datetime_str)).strftime('%B %d, %Y')

In [None]:
# Load the CNBC headlines (headline text + timestamp only) and discard any
# row where either field is missing.
cnbc_df = pd.read_csv('cnbc_headlines.csv', usecols=['Headlines', 'Time']).dropna()
print(cnbc_df.shape)

# Rewrite every timestamp as a "Month DD, YYYY" string.
cnbc_df = cnbc_df.assign(Time=cnbc_df['Time'].apply(convert_to_month_day_year))
cnbc_df.head(2)

In [None]:
# Load the Guardian headlines. Reading only 'Headlines' and 'Time' matches the
# CNBC and Reuters loaders and keeps any extra column in the CSV from leaking
# into the combined frame (or causing rows to be dropped for unrelated NaNs).
guardian_df = pd.read_csv('guardian_headlines.csv', usecols=['Headlines', 'Time'])
guardian_df = guardian_df.dropna()
print(guardian_df.shape)

# Rewrite every timestamp as a "Month DD, YYYY" string.
guardian_df['Time'] = guardian_df['Time'].apply(convert_to_month_day_year)
guardian_df.head(2)

(17800, 2)


Unnamed: 0,Time,Headlines
0,"July 18, 2020",Johnson is asking Santa for a Christmas recovery
1,"July 18, 2020",‘I now fear the worst’: four grim tales of wor...


In [None]:
# Load the Reuters headlines, keeping only the headline text and timestamp,
# then drop rows with a missing value in either column.
reuters_df = (
    pd.read_csv('reuters_headlines.csv', usecols=['Headlines', 'Time'])
    .dropna()
)
print(reuters_df.shape)

# Rewrite every timestamp as a "Month DD, YYYY" string.
reuters_df = reuters_df.assign(
    Time=lambda frame: frame['Time'].apply(convert_to_month_day_year)
)
reuters_df.head(2)

(32770, 2)


Unnamed: 0,Headlines,Time
0,TikTok considers London and other locations fo...,"July 18, 2020"
1,Disney cuts ad spending on Facebook amid growi...,"July 18, 2020"


In [None]:
# Folders that hold the 2018 news dump as individual .json files.
folder_paths = ['./2018_1', './2018_2', './2018_3', './2018_4', './2018_5']

data = []

for folder_path in folder_paths:
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the resulting frame reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            # Decode as UTF-8 explicitly: without it, open() falls back to the
            # platform's locale encoding, which can fail on or garble
            # non-ASCII headlines.
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data.append(json.load(json_file))

# Flatten the nested JSON records and keep only the headline and its
# publication timestamp. Chaining .dropna() yields a fresh frame rather than
# mutating a column selection in place, so later column assignments on
# json_news_df do not trigger SettingWithCopyWarning.
json_news_df = pd.json_normalize(data)[['title', 'thread.published']].dropna()
print(json_news_df.shape)

(306242, 2)


In [None]:
# Bring the JSON-sourced frame onto the shared 'Headlines' / 'Time' schema and
# rewrite each timestamp as a "Month DD, YYYY" string.
json_news_df = (
    json_news_df
    .rename(columns={'title': 'Headlines', 'thread.published': 'Time'})
    .assign(Time=lambda frame: frame['Time'].apply(convert_to_month_day_year))
)
json_news_df.head(2)

Unnamed: 0,Headlines,Time
0,EMERGING MARKETS-Mexican peso seesaws over dol...,"January 26, 2018"
1,"Migrants must visit Nazi concentration camps, ...","January 10, 2018"


In [None]:
# Stack the four headline sources row-wise into one frame. Each source keeps
# its own index labels, so labels may repeat in the combined frame.
headline_frames = [json_news_df, reuters_df, guardian_df, cnbc_df]
news_df = pd.concat(headline_frames)
print(news_df.shape)
news_df.head(2)

(359612, 2)


Unnamed: 0,Headlines,Time
0,EMERGING MARKETS-Mexican peso seesaws over dol...,"January 26, 2018"
1,"Migrants must visit Nazi concentration camps, ...","January 10, 2018"


In [None]:
# Drop rows that are exact duplicates of an earlier row and report the count.
before_rows = len(news_df)
news_df = news_df.drop_duplicates()

print(f"Removed {before_rows - news_df.shape[0]} rows as duplicates. There are now {news_df.shape[0]} rows.")

Removed 39074 rows as duplicates. There are now 320538 rows.


In [None]:
# Persist the combined, de-duplicated headlines; the index is not written.
news_df.to_csv(path_or_buf="./news.csv", index=False)

# Macro-Econ Data

In [28]:
# Load the raw macro-economic indicators table and report its row count.
econ_df = pd.read_csv(filepath_or_buffer="DATA.csv")
econ_row_count = econ_df.shape[0]
print(f"There are {econ_row_count} rows")
econ_df.head(3)

There are 241 rows


Unnamed: 0,DATE,UNRATE(%),CONSUMER CONF INDEX,PPI-CONST MAT.,CPIALLITEMS,INFLATION(%),MORTGAGE INT. MONTHLY AVG(%),MED HOUSEHOLD INCOME,CORP. BOND YIELD(%),MONTHLY HOME SUPPLY,% SHARE OF WORKING POPULATION,GDP PER CAPITA,QUARTERLY REAL GDP,QUARTERLY GDP GROWTH RATE (%),CSUSHPISA
0,01-05-2022,3.6,106.4,352.857,123.3228,8.581511,5.23,,4.13,8.4,,74737,19699.465,-0.144227,120.724
1,01-04-2022,3.6,107.3,343.73,121.97817,8.258629,4.9825,,3.76,8.4,,74737,19699.465,-0.144227,121.813
2,01-03-2022,3.6,107.2,345.852,121.301004,8.542456,4.172,,3.43,7.0,,73289,19727.918,-0.395692,122.888


In [31]:
def convert_to_time_reverse(datetime_str):
    """Convert a day-first ``DD-MM-YYYY`` date string to ``'%B %d, %Y'`` form.

    Args:
        datetime_str: Date string matching ``'%d-%m-%Y'``, e.g. "01-05-2022".

    Returns:
        The same date formatted like "May 01, 2022".

    Raises:
        ValueError: If ``datetime_str`` does not match ``'%d-%m-%Y'``.
    """
    return datetime.strptime(datetime_str, '%d-%m-%Y').strftime('%B %d, %Y')

In [36]:
# Raw DATA.csv column header -> descriptive column name used in the output.
# NOTE(review): 'average mortage interest rate' (sic) is kept exactly as
# spelled because it becomes a header in econ_data.csv; confirm nothing
# downstream reads that name before correcting the spelling.
ECON_COLUMN_NAMES = {
    'DATE': 'date',
    'UNRATE(%)': 'unemployment rate (%)',
    'CONSUMER CONF INDEX': 'consumer confidence index',
    'PPI-CONST MAT.': 'producers purchase index',
    'CPIALLITEMS': 'consumer price index',
    'INFLATION(%)': 'inflation rate (%)',
    'MORTGAGE INT. MONTHLY AVG(%)': 'average mortage interest rate',
    'MED HOUSEHOLD INCOME': 'median household income',
    'CORP. BOND YIELD(%)': 'corporate bond yield',
    'MONTHLY HOME SUPPLY': 'monthly home supply',
    '% SHARE OF WORKING POPULATION': 'working population (%)',
    'GDP PER CAPITA': 'gdp per capita',
    'QUARTERLY REAL GDP': 'quarterly real GDP',
    'QUARTERLY GDP GROWTH RATE (%)': 'quarterly gdp growth rate (%)',
    'CSUSHPISA': 'national home price index',
}

# rename() hands back a new frame, so econ_df keeps its original headers.
imp_econ_df = econ_df.rename(columns=ECON_COLUMN_NAMES)

# Reformat 'date' from DD-MM-YYYY strings to "Month DD, YYYY" strings.
imp_econ_df['date'] = imp_econ_df['date'].apply(convert_to_time_reverse)
print(f"There are {imp_econ_df.shape[0]} rows")
imp_econ_df.head(3)

There are 241 rows


Unnamed: 0,date,unemployment rate (%),consumer confidence index,producers purchase index,consumer price index,inflation rate (%),average mortage interest rate,median household income,corporate bond yield,monthly home supply,working population (%),gdp per capita,quarterly real GDP,quarterly gdp growth rate (%),national home price index
0,"May 01, 2022",3.6,106.4,352.857,123.3228,8.581511,5.23,,4.13,8.4,,74737,19699.465,-0.144227,120.724
1,"April 01, 2022",3.6,107.3,343.73,121.97817,8.258629,4.9825,,3.76,8.4,,74737,19699.465,-0.144227,121.813
2,"March 01, 2022",3.6,107.2,345.852,121.301004,8.542456,4.172,,3.43,7.0,,73289,19727.918,-0.395692,122.888


In [37]:
# Persist the renamed, date-normalized indicators; the index is not written.
imp_econ_df.to_csv(path_or_buf="econ_data.csv", index=False)