#### Import Packages

In [1]:
import pandas as pd
import numpy as np

from class_data.data import Data
from utils.system import *

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by pandas/numpy in the cells below.
warnings.filterwarnings('ignore')

#### Format Data

In [2]:
# --- US Daily News Index ---
print("Loading in US Daily News Index...")
# Read the raw CSV from the data directory.
us_news_data = pd.read_csv(get_data() / 'All_Daily_Policy_Data.csv')
# Wrap the raw frame in a Data object and apply format_dep() in a single chain.
us_news = Data(data=us_news_data, name='daily_us_news_index').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
us_news.to_parquet(
    get_format_data() / 'us_news.parquet.brotli',
    compression='brotli',
)

Loading in US Daily News Index...


In [2]:
# --- 5-Year Breakeven Inflation Rate ---
print("Loading in 5-Year Breakeven Inflation Rate Index...")
# Read the raw CSV from the data directory.
ir_data = pd.read_csv(get_data() / 'T5YIE.csv')
# Wrap the breakeven inflation frame in a Data object and apply format_dep().
ir = Data(data=ir_data, name='ir').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
ir.to_parquet(
    get_format_data() / 'ir.parquet.brotli',
    compression='brotli',
)

Loading in 5-Year Breakeven Inflation Rate Index...


In [2]:
# --- Financial Stress Index ---
print("Loading in FSI Index...")
# Read the raw CSV from the data directory.
fsi_data = pd.read_csv(get_data() / 'fsi.csv')
# Wrap the financial stress frame in a Data object and apply format_dep().
fsi = Data(data=fsi_data, name='fsi').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
fsi.to_parquet(
    get_format_data() / 'fsi.parquet.brotli',
    compression='brotli',
)

Loading in FSI Index...


In [2]:
# --- AI Google Trend Index ---
print("Loading in AI Google Trend...")
# Read the CSV (skipping its first line), wrap it in a Data object and apply
# format_dep(); the name is bound only once, to the formatted result.
ai_google_trend = Data(
    data=pd.read_csv(get_data() / 'ai_google_trend.csv', skiprows=1),
    name='ai_google_trend',
).format_dep()
# Write the formatted result as a brotli-compressed parquet file.
ai_google_trend.to_parquet(
    get_format_data() / 'ai_google_trend.parquet.brotli',
    compression='brotli',
)

Loading in AI Google Trend...


In [2]:
# --- ESG Google Trend Index ---
print("Loading in ESG Google Trend...")
# Read the CSV (skipping its first line), wrap it in a Data object and apply
# format_dep(); the name is bound only once, to the formatted result.
esg_google_trend = Data(
    data=pd.read_csv(get_data() / 'esg_google_trend.csv', skiprows=1),
    name='esg_google_trend',
).format_dep()
# Write the formatted result as a brotli-compressed parquet file.
esg_google_trend.to_parquet(
    get_format_data() / 'esg_google_trend.parquet.brotli',
    compression='brotli',
)

Loading in ESG Google Trend...


In [3]:
# --- Recession Attention Index ---
print("Loading in Recession Attention Index...")
# Read the CSV, wrap it in a Data object and apply format_dep();
# the name is bound only once, to the formatted result.
recession = Data(
    data=pd.read_csv(get_data() / 'Recession_Attention.csv'),
    name='recession_attention',
).format_dep()
# Write the formatted result as a brotli-compressed parquet file.
recession.to_parquet(
    get_format_data() / 'recession.parquet.brotli',
    compression='brotli',
)

Loading in Recession Attention Index...


In [2]:
# --- EPU Data ---
print("Loading in EPU Data...")
# Read the Excel workbook from the data directory.
epu_data = pd.read_excel(get_data() / 'US_Policy_Uncertainty_Data.xlsx')
# Wrap the EPU frame in a Data object and apply format_dep().
epu = Data(data=epu_data, name='epu_data').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
epu.to_parquet(
    get_format_data() / 'epu.parquet.brotli',
    compression='brotli',
)

Loading in EPU Data...


In [2]:
# --- Categorical EPU Data ---
print("Loading in Categorical EPU Data...")
# Read the Excel workbook from the data directory.
epu_cat_data = pd.read_excel(get_data() / 'Categorical_EPU_Data.xlsx')
# Wrap the categorical EPU frame in a Data object and apply format_dep().
epu_cat = Data(data=epu_cat_data, name='categorical_epu_data').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
epu_cat.to_parquet(
    get_format_data() / 'epu_cat.parquet.brotli',
    compression='brotli',
)

Loading in Categorical EPU Data...


In [2]:
# --- Biodiversity Index ---
print("Loading in Biodiversity Index...")
# Read the raw CSV from the data directory.
bio_data = pd.read_csv(get_data() / 'google_biodiversity_attention_index.csv')
# Wrap the biodiversity frame in a Data object and apply format_dep().
bio = Data(data=bio_data, name='biodiversity_index').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
bio.to_parquet(
    get_format_data() / 'bio_index.parquet.brotli',
    compression='brotli',
)

Loading in Biodiversity Index...


In [4]:
# --- CC Embeddings ---
print("Loading in CC Embeddings...")
# Read the parquet document file stored under the 'cc' sub-folder.
cc_emb_data = pd.read_parquet(
    get_data() / 'cc' / 'doc.pq'
)
# Wrap the raw frame in a Data object, then apply format_emb().
cc_emb_format = Data(
    data=cc_emb_data,
    name='cc',
)
cc_emb = cc_emb_format.format_emb()
# Write the formatted result as a brotli-compressed parquet file.
cc_emb.to_parquet(
    get_format_data() / 'cc_emb.parquet.brotli',
    compression='brotli',
)

Loading in CC Embeddings...


In [5]:
# --- NYT Embeddings ---
print("Loading in NYT Embeddings...")
# Point a Data object at the 'doc_*' files in the 'nyt' folder and combine
# them with concat_files().
nyt_emb_data = Data(
    folder_path=get_data() / 'nyt',
    file_pattern='doc_*',
)
nyt_emb = nyt_emb_data.concat_files()
# Wrap the combined data in a second Data object, then apply format_emb().
nyt_emb_format = Data(
    data=nyt_emb,
    name='nyt',
)
nyt_emb = nyt_emb_format.format_emb()
# Write the formatted result as a brotli-compressed parquet file.
nyt_emb.to_parquet(
    get_format_data() / 'nyt_emb.parquet.brotli',
    compression='brotli',
)

Loading in NYT Embeddings...


In [2]:
# --- Daily Topic Attention Data ---
print("Loading in Daily Topic Attention Data...")
# Read the raw CSV from the data directory.
topic_attention_data = pd.read_csv(get_data() / 'Daily_Topic_Attention_Theta.csv')
# Wrap the topic attention frame in a Data object and apply format_dep().
topic_attention = Data(data=topic_attention_data, name='topic_attention').format_dep()
# Write the formatted result as a brotli-compressed parquet file.
topic_attention.to_parquet(
    get_format_data() / 'topic_attention.parquet.brotli',
    compression='brotli',
)

Loading in Daily Topic Attention Data...


In [2]:
# --- Daily Multiple WSJ Articles ---
print("Loading in Daily WSJ Articles...")
# Point a Data object at the 'doc_*' files in the 'wsj_multiple' folder and
# combine them with concat_files().
wsj_art_data = Data(folder_path=get_data() / 'wsj_multiple', file_pattern='doc_*')
wsj_art = wsj_art_data.concat_files()
# Format WSJ articles
wsj_art_format = Data(data=wsj_art, name='wsj')
wsj_art = wsj_art_format.format_article()
# Export Data in 5 consecutive row-wise pieces.
# The split is done with explicit iloc slices instead of np.array_split(wsj_art, 5):
# splitting a DataFrame through numpy relies on DataFrame.swapaxes, which pandas has
# deprecated (the resulting FutureWarning is hidden by the global warnings filter).
# Chunk sizes follow the np.array_split rule: the first `extra` chunks get one more row.
n_chunks = 5
base, extra = divmod(len(wsj_art), n_chunks)
sizes = np.array([base + 1] * extra + [base] * (n_chunks - extra), dtype=int)
stops = np.cumsum(sizes)
starts = stops - sizes
chunks = [wsj_art.iloc[int(a):int(b)] for a, b in zip(starts, stops)]
# to_parquet does not create missing folders, so make sure the target exists.
# NOTE(review): assumes get_format_data() returns a pathlib.Path — confirm.
art_dir = get_format_data() / 'art'
art_dir.mkdir(parents=True, exist_ok=True)
for i, df in enumerate(chunks, 1):
    print(i)
    df.to_parquet(art_dir / f'wsj_art_{i}.parquet.brotli', compression='brotli')

Loading in Daily WSJ Articles...
1
2
3
4
5
