## Import Packages

In [None]:
import pandas as pd
import os
import logging
# Only let ERROR-level (and above) records from the 'ray' logger through
logging.getLogger('ray').setLevel(logging.ERROR)
# TensorFlow C++ log-level variable, set before the project imports below
# (presumably those pull in TensorFlow — TODO confirm)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from class_model.model import Model
# NOTE(review): wildcard import — presumably the source of get_format_data() and
# get_reports() used throughout this notebook; confirm and consider importing them by name
from utils.system import *

import warnings
# Silences every Python warning for the rest of the notebook
warnings.filterwarnings('ignore')

## Load Data

#### Daily Index

In [104]:
# All formatted daily inputs are parquet files in the folder returned by get_format_data()
format_dir = get_format_data()

nyt_emb = pd.read_parquet(format_dir / 'nyt_emb.parquet.brotli')                  # NYT embeddings
wsj_emb = pd.read_parquet(format_dir / 'wsj_emb.parquet.brotli')                  # WSJ embeddings
us_news = pd.read_parquet(format_dir / 'us_news.parquet.brotli')                  # US News index
topic_attention = pd.read_parquet(format_dir / 'topic_attention.parquet.brotli')  # Topic Attention

In [105]:
# Index-aligned left joins; rows with any missing value are dropped after each join.

# NYT: US News index + NYT embeddings, then Topic Attention joined on top
nyt_us_news_d = us_news.merge(nyt_emb, how='left', left_index=True, right_index=True).dropna()
nyt_merge_d = topic_attention.merge(nyt_us_news_d, how='left', left_index=True, right_index=True).dropna()

# WSJ: US News index + WSJ embeddings, then Topic Attention joined on top
wsj_us_news_d = us_news.merge(wsj_emb, how='left', left_index=True, right_index=True).dropna()
wsj_merge_d = topic_attention.merge(wsj_us_news_d, how='left', left_index=True, right_index=True).dropna()

#### Monthly Index

In [88]:
# Formatted monthly inputs live in the same folder returned by get_format_data()
format_dir = get_format_data()

epu_cat = pd.read_parquet(format_dir / 'epu_cat.parquet.brotli')      # EPU category data
bio_index = pd.read_parquet(format_dir / 'bio_index.parquet.brotli')  # Biodiversity index

In [89]:
# Aggregate the embeddings to month-end means so they line up with the monthly targets
nyt_emb_m = nyt_emb.resample('M').mean()
wsj_emb_m = wsj_emb.resample('M').mean()

# NYT: EPU categories + monthly NYT embeddings, then the biodiversity index joined on top
# (index-aligned left joins, incomplete rows dropped after each join)
nyt_epu_cat_m = epu_cat.merge(nyt_emb_m, how='left', left_index=True, right_index=True).dropna()
nyt_merge_m = bio_index.merge(nyt_epu_cat_m, how='left', left_index=True, right_index=True).dropna()

# WSJ: EPU categories + monthly WSJ embeddings, then the biodiversity index joined on top
wsj_epu_cat_m = epu_cat.merge(wsj_emb_m, how='left', left_index=True, right_index=True).dropna()
wsj_merge_m = bio_index.merge(wsj_epu_cat_m, how='left', left_index=True, right_index=True).dropna()

## Parallelized Linear Regression Experiments (X-Variable = Embedding, Y-Variable = Index)

In [4]:
# Alpha candidates passed as alpha_tune: 0, five small powers of ten,
# then every multiple of 0.5 from 0.5 up to and including 30.5
small_alphas = [0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
half_step_alphas = [half_steps / 2 for half_steps in range(1, 62)]
grid = small_alphas + half_step_alphas

#### NYT (Daily Index)

In [5]:
# Tune 'oos_l2_ols' for NYT: X = columns whose name starts with 'nyt',
# Y = every US News column followed by every Topic Attention column
nyt_beta_col = nyt_merge_d.filter(regex='^nyt').columns
y_col = [*us_news.columns.tolist(), *topic_attention.columns.tolist()]

nyt_model = Model(
    data=nyt_merge_d,
    d_to_m=True,   # presumably a daily-to-monthly switch inside Model — TODO confirm
    beta_col=nyt_beta_col,
    y_col=y_col,
    split=0.5,     # presumably the in-sample / out-of-sample split fraction — TODO confirm
    alpha_tune=grid,
)
nyt_col_result = nyt_model.tune_multiple_y('oos_l2_ols')

2023-12-08 21:05:10,987	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=11096)[0m 
[36m(process_column pid=11096)[0m   return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
[36m(pid=24436)[0m [32m [repeated 15x across cluster][0m
[36m(process_column pid=27960)[0m   return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T[32m [repeated 16x across cluster][0m
[36m(process_column pid=9948)[0m   return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T[32m [repeated 16x across cluster][0m
[36m(process_column pid=236)[0m   return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T[32m [repeated 14x across cluster][0m
[36m(process_column pid=9948)[0m   return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T[32m [repeated 2x across cluster][0m
[36m(process_column pid=11096)[0m   return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T[32m [repeated 9x across cluster][0m
[36m(process_column pid=9948)[0m   return l

In [6]:
# Turn the tuning output into a results table (the displayed tables below show
# the columns 'Y Variable', 'alpha', 'is_r2' and 'oos_r2')
nyt_table_d = nyt_model.table_multiple_y(nyt_col_result)

In [7]:
# Persist the daily NYT results. The file name must be 'nyt_emb_d_results.csv': that is the
# file the following cell reads back, and it follows the '<source>_emb_<freq>_results.csv'
# pattern of the other result files (the previous name 'nyt_emb_results_d.csv' did not match).
nyt_table_d.to_csv(get_reports() / 'results' / 'nyt_emb_d_results.csv', index=False)

In [8]:
# Reload the daily NYT results from the 'results' folder under get_reports(),
# replacing the in-memory table
nyt_table_d = pd.read_csv(get_reports() / 'results' / 'nyt_emb_d_results.csv')

#### Analyze Result and Statistics

In [20]:
# Rows whose 'Y Variable' is one of the US News columns; displayed as the cell output
nyt_is_us_news_row = nyt_table_d['Y Variable'].isin(us_news.columns.tolist())
nyt_daily_pol_table = nyt_table_d.loc[nyt_is_us_news_row]
nyt_daily_pol_table

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
0,daily_pol,30.5,0.547236,0.0


In [21]:
# Rows whose 'Y Variable' is one of the Topic Attention columns
nyt_is_topic_row = nyt_table_d['Y Variable'].isin(topic_attention.columns.tolist())
nyt_topic_table = nyt_table_d.loc[nyt_is_topic_row]

In [46]:
# Percentage of topic rows with a strictly positive in-sample / out-of-sample R^2
nyt_topic_n = len(nyt_topic_table)
nyt_topic_is_pos = int((nyt_topic_table['is_r2'] > 0).sum())
nyt_topic_oos_pos = int((nyt_topic_table['oos_r2'] > 0).sum())
nyt_topic_is_pct = round(nyt_topic_is_pos / nyt_topic_n * 100, 2)
nyt_topic_oos_pct = round(nyt_topic_oos_pos / nyt_topic_n * 100, 2)
print(f"pct is_r2 > 0: {nyt_topic_is_pct}")
print(f"pct oos_r2 > 0: {nyt_topic_oos_pct}")

# Mean R^2 over the topic rows, scaled to percent
nyt_topic_is_r2 = round(nyt_topic_table['is_r2'].mean() * 100, 2)
nyt_topic_oos_r2 = round(nyt_topic_table['oos_r2'].mean() * 100, 2)
print(f"mean is_r2: {nyt_topic_is_r2}")
print(f"mean oos_r2: {nyt_topic_oos_r2}")

pct is_r2 > 0: 100.0
pct oos_r2 > 0: 22.78
mean is_r2: 52.52
mean oos_r2: 2.76


#### NYT (Monthly Index)

In [90]:
# Tune 'oos_l2_ols' for NYT on the monthly frame: X = columns whose name starts with 'nyt',
# Y = every EPU category column followed by every biodiversity index column
nyt_beta_col = nyt_merge_m.filter(regex='^nyt').columns
y_col = [*epu_cat.columns.tolist(), *bio_index.columns.tolist()]

nyt_model = Model(
    data=nyt_merge_m,
    d_to_m=False,  # data is already monthly; presumably disables daily-to-monthly handling — TODO confirm
    beta_col=nyt_beta_col,
    y_col=y_col,
    split=0.5,     # presumably the in-sample / out-of-sample split fraction — TODO confirm
    alpha_tune=grid,
)
nyt_col_result = nyt_model.tune_multiple_y('oos_l2_ols')

2023-12-10 19:42:33,527	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=13380)[0m 
[36m(pid=24112)[0m [32m [repeated 12x across cluster][0m


In [91]:
# Turn the monthly NYT tuning output into a results table
nyt_table_m = nyt_model.table_multiple_y(nyt_col_result)

In [92]:
# Persist the monthly NYT results (index not written); read back by the next cell
nyt_table_m.to_csv(get_reports() / 'results' / 'nyt_emb_m_results.csv', index=False)

In [93]:
# Reload the monthly NYT results from disk, replacing the in-memory table
nyt_table_m = pd.read_csv(get_reports() / 'results' / 'nyt_emb_m_results.csv')

#### Analyze Result and Statistics

In [94]:
# Rows whose 'Y Variable' is one of the biodiversity index columns; displayed as the cell output
nyt_is_bio_row = nyt_table_m['Y Variable'].isin(bio_index.columns.tolist())
nyt_bio_index_table = nyt_table_m.loc[nyt_is_bio_row]
nyt_bio_index_table

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
12,bio,0.5,0.238194,0.152328


In [95]:
# Rows whose 'Y Variable' is one of the EPU category columns
nyt_is_epu_cat_row = nyt_table_m['Y Variable'].isin(epu_cat.columns.tolist())
nyt_epu_cat_table = nyt_table_m.loc[nyt_is_epu_cat_row]

In [96]:
# Percentage of EPU category rows with a strictly positive in-sample / out-of-sample R^2
nyt_epu_cat_n = len(nyt_epu_cat_table)
nyt_epu_cat_is_pos = int((nyt_epu_cat_table['is_r2'] > 0).sum())
nyt_epu_cat_oos_pos = int((nyt_epu_cat_table['oos_r2'] > 0).sum())
nyt_epu_cat_is_pct = round(nyt_epu_cat_is_pos / nyt_epu_cat_n * 100, 2)
nyt_epu_cat_oos_pct = round(nyt_epu_cat_oos_pos / nyt_epu_cat_n * 100, 2)
print(f"pct is_r2 > 0: {nyt_epu_cat_is_pct}")
print(f"pct oos_r2 > 0: {nyt_epu_cat_oos_pct}")

# Mean R^2 over the EPU category rows, scaled to percent
nyt_epu_cat_is_r2 = round(nyt_epu_cat_table['is_r2'].mean() * 100, 2)
nyt_epu_cat_oos_r2 = round(nyt_epu_cat_table['oos_r2'].mean() * 100, 2)
print(f"mean is_r2: {nyt_epu_cat_is_r2}")
print(f"mean oos_r2: {nyt_epu_cat_oos_r2}")

pct is_r2 > 0: 100.0
pct oos_r2 > 0: 58.33
mean is_r2: 70.75
mean oos_r2: 14.45


#### WSJ (Daily Index)

In [10]:
# Tune 'oos_l2_ols' for WSJ: X = columns whose name starts with 'wsj',
# Y = every US News column followed by every Topic Attention column
wsj_beta_col = wsj_merge_d.filter(regex='^wsj').columns
y_col = [*us_news.columns.tolist(), *topic_attention.columns.tolist()]

wsj_model = Model(
    data=wsj_merge_d,
    d_to_m=True,   # presumably a daily-to-monthly switch inside Model — TODO confirm
    beta_col=wsj_beta_col,
    y_col=y_col,
    split=0.5,     # presumably the in-sample / out-of-sample split fraction — TODO confirm
    alpha_tune=grid,
)
wsj_col_result = wsj_model.tune_multiple_y('oos_l2_ols')

2023-12-08 21:11:00,095	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=35500)[0m 
[36m(pid=18936)[0m [32m [repeated 15x across cluster][0m


In [11]:
# Turn the daily WSJ tuning output into a results table
wsj_table_d = wsj_model.table_multiple_y(wsj_col_result)

In [12]:
# Persist the daily WSJ results (index not written); read back by the next cell
wsj_table_d.to_csv(get_reports() / 'results' / 'wsj_emb_d_results.csv', index=False)

In [13]:
# Reload the daily WSJ results from disk, replacing the in-memory table
wsj_table_d = pd.read_csv(get_reports() / 'results' / 'wsj_emb_d_results.csv')

#### Analyze Result and Statistics

In [39]:
# Rows whose 'Y Variable' is one of the US News columns; displayed as the cell output
wsj_is_us_news_row = wsj_table_d['Y Variable'].isin(us_news.columns.tolist())
wsj_daily_pol_table = wsj_table_d.loc[wsj_is_us_news_row]
wsj_daily_pol_table

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
0,daily_pol,1.0,0.662995,0.395172


In [40]:
# Rows whose 'Y Variable' is one of the Topic Attention columns
wsj_is_topic_row = wsj_table_d['Y Variable'].isin(topic_attention.columns.tolist())
wsj_topic_table = wsj_table_d.loc[wsj_is_topic_row]

In [48]:
# Percentage of topic rows with a strictly positive in-sample / out-of-sample R^2
wsj_topic_n = len(wsj_topic_table)
wsj_topic_is_pos = int((wsj_topic_table['is_r2'] > 0).sum())
wsj_topic_oos_pos = int((wsj_topic_table['oos_r2'] > 0).sum())
wsj_topic_is_pct = round(wsj_topic_is_pos / wsj_topic_n * 100, 2)
wsj_topic_oos_pct = round(wsj_topic_oos_pos / wsj_topic_n * 100, 2)
print(f"pct is_r2 > 0: {wsj_topic_is_pct}")
print(f"pct oos_r2 > 0: {wsj_topic_oos_pct}")

# Mean R^2 over the topic rows, scaled to percent
wsj_topic_is_r2 = round(wsj_topic_table['is_r2'].mean() * 100, 2)
wsj_topic_oos_r2 = round(wsj_topic_table['oos_r2'].mean() * 100, 2)
print(f"mean is_r2: {wsj_topic_is_r2}")
print(f"mean oos_r2: {wsj_topic_oos_r2}")

pct is_r2 > 0: 100.0
pct oos_r2 > 0: 78.89
mean is_r2: 75.79
mean oos_r2: 36.11


#### WSJ (Monthly Index)

In [97]:
# Tune 'oos_l2_ols' for WSJ on the monthly frame: X = columns whose name starts with 'wsj',
# Y = every EPU category column followed by every biodiversity index column
wsj_beta_col = wsj_merge_m.filter(regex='^wsj').columns
y_col = [*epu_cat.columns.tolist(), *bio_index.columns.tolist()]

wsj_model = Model(
    data=wsj_merge_m,
    d_to_m=False,  # data is already monthly; presumably disables daily-to-monthly handling — TODO confirm
    beta_col=wsj_beta_col,
    y_col=y_col,
    split=0.5,     # presumably the in-sample / out-of-sample split fraction — TODO confirm
    alpha_tune=grid,
)
wsj_col_result = wsj_model.tune_multiple_y('oos_l2_ols')

2023-12-10 19:42:57,420	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=32588)[0m 
[36m(process_column pid=32588)[0m   dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
[36m(pid=24016)[0m [32m [repeated 12x across cluster][0m
[36m(process_column pid=24016)[0m   dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)[32m [repeated 12x across cluster][0m


In [98]:
# Turn the monthly WSJ tuning output into a results table
wsj_table_m = wsj_model.table_multiple_y(wsj_col_result)

In [99]:
# Persist the monthly WSJ results (index not written); read back by the next cell
wsj_table_m.to_csv(get_reports() / 'results' / 'wsj_emb_m_results.csv', index=False)

In [100]:
# Reload the monthly WSJ results from disk, replacing the in-memory table
wsj_table_m = pd.read_csv(get_reports() / 'results' / 'wsj_emb_m_results.csv')

#### Analyze Result and Statistics

In [101]:
# Rows whose 'Y Variable' is one of the biodiversity index columns; displayed as the cell output
wsj_is_bio_row = wsj_table_m['Y Variable'].isin(bio_index.columns.tolist())
wsj_bio_index_table = wsj_table_m.loc[wsj_is_bio_row]
wsj_bio_index_table

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
12,bio,0.001,0.957615,0.0


In [102]:
# Rows whose 'Y Variable' is one of the EPU category columns
wsj_is_epu_cat_row = wsj_table_m['Y Variable'].isin(epu_cat.columns.tolist())
wsj_epu_cat_table = wsj_table_m.loc[wsj_is_epu_cat_row]

In [103]:
# Percentage of EPU category rows with a strictly positive in-sample / out-of-sample R^2
wsj_epu_cat_n = len(wsj_epu_cat_table)
wsj_epu_cat_is_pos = int((wsj_epu_cat_table['is_r2'] > 0).sum())
wsj_epu_cat_oos_pos = int((wsj_epu_cat_table['oos_r2'] > 0).sum())
wsj_epu_cat_is_pct = round(wsj_epu_cat_is_pos / wsj_epu_cat_n * 100, 2)
wsj_epu_cat_oos_pct = round(wsj_epu_cat_oos_pos / wsj_epu_cat_n * 100, 2)
print(f"pct is_r2 > 0: {wsj_epu_cat_is_pct}")
print(f"pct oos_r2 > 0: {wsj_epu_cat_oos_pct}")

# Mean R^2 over the EPU category rows, scaled to percent
wsj_epu_cat_is_r2 = round(wsj_epu_cat_table['is_r2'].mean() * 100, 2)
wsj_epu_cat_oos_r2 = round(wsj_epu_cat_table['oos_r2'].mean() * 100, 2)
print(f"mean is_r2: {wsj_epu_cat_is_r2}")
print(f"mean oos_r2: {wsj_epu_cat_oos_r2}")

pct is_r2 > 0: 100.0
pct oos_r2 > 0: 58.33
mean is_r2: 51.53
mean oos_r2: 7.68


## Parallelized Linear Regression Experiments (X-Variable = Embedding, Y-Variable = Index.diff())

#### NYT (Daily Index, Differenced and Averaged by Month)

In [116]:
# Tune 'oos_l2_ols' for NYT on first differences: X = 'nyt*' columns, Y = US News columns
nyt_beta_col = nyt_merge_d.filter(regex='^nyt').columns.tolist()
y_col = us_news.columns.tolist()

# Row-to-row differences of X and Y, first (NaN) row dropped, then averaged per month
nyt_merge_diff_d = (
    nyt_merge_d[nyt_beta_col + y_col]
    .diff()
    .dropna()
    .resample('M')
    .mean()
)

# NOTE(review): the frame is already monthly here, yet d_to_m=True is passed (the monthly
# cells above pass d_to_m=False) — confirm this is intended.
nyt_model = Model(
    data=nyt_merge_diff_d,
    d_to_m=True,
    beta_col=nyt_beta_col,
    y_col=y_col,
    split=0.5,
    alpha_tune=grid,
)
nyt_col_result = nyt_model.tune_multiple_y('oos_l2_ols')

2023-12-10 19:57:06,434	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=1924)[0m 


In [117]:
# Turn the differenced NYT tuning output into a results table
nyt_table_diff_d = nyt_model.table_multiple_y(nyt_col_result)

In [118]:
# Display the differenced NYT results
nyt_table_diff_d

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
0,daily_pol,0.1,0.07351,0.001542


#### WSJ (Daily Index, Differenced and Averaged by Month)

In [125]:
# Tune 'oos_l2_ols' for WSJ on first differences: X = 'wsj*' columns, Y = US News columns
wsj_beta_col = wsj_merge_d.filter(regex='^wsj').columns.tolist()
y_col = us_news.columns.tolist()

# Row-to-row differences of X and Y, first (NaN) row dropped, then averaged per month
wsj_merge_diff_d = (
    wsj_merge_d[wsj_beta_col + y_col]
    .diff()
    .dropna()
    .resample('M')
    .mean()
)

# NOTE(review): the frame is already monthly here, yet d_to_m=True is passed (the monthly
# cells above pass d_to_m=False) — confirm this is intended.
wsj_model = Model(
    data=wsj_merge_diff_d,
    d_to_m=True,
    beta_col=wsj_beta_col,
    y_col=y_col,
    split=0.5,
    alpha_tune=grid,
)
wsj_col_result = wsj_model.tune_multiple_y('oos_l2_ols')

2023-12-10 19:58:38,091	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=19156)[0m 
[36m(process_column pid=19156)[0m   dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


In [126]:
# Turn the differenced WSJ tuning output into a results table
wsj_table_diff_d = wsj_model.table_multiple_y(wsj_col_result)

In [127]:
# Display the differenced WSJ results
wsj_table_diff_d

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
0,daily_pol,0.01,0.133132,0.01584


## Parallelized Linear Regression Experiments (X-Variable = Embedding, Y-Variable = Index.shift())

#### WSJ (Daily Index)

##### Shift Index One Day Forward

In [129]:
# Tune 'oos_l2_ols' for WSJ with the embeddings lagged by one row:
# X = 'wsj*' columns (shifted), Y = US News columns
wsj_beta_col = wsj_merge_d.filter(regex='^wsj').columns.tolist()
y_col = us_news.columns.tolist()

# Explicit copy: the column assignment below then writes to an independent frame instead of
# a selection taken from wsj_merge_d (the pattern behind pandas' SettingWithCopyWarning,
# which this notebook silences globally).
wsj_merge_shiftf_d = wsj_merge_d[wsj_beta_col + y_col].copy()
# shift(1) moves every embedding row down one position, so each row pairs the previous row's
# embeddings with the current row's index value; the first row becomes NaN and is dropped.
wsj_merge_shiftf_d[wsj_beta_col] = wsj_merge_shiftf_d[wsj_beta_col].shift(1)
wsj_merge_shiftf_d = wsj_merge_shiftf_d.dropna()

wsj_model = Model(data=wsj_merge_shiftf_d, d_to_m=True, beta_col=wsj_beta_col, y_col=y_col, split=0.5, alpha_tune=grid)
wsj_col_result = wsj_model.tune_multiple_y('oos_l2_ols')

2023-12-10 20:00:46,793	INFO worker.py:1507 -- Calling ray.init() again after it has already been called.


In [130]:
# Turn the forward-shift WSJ tuning output into a results table
wsj_table_shiftf_d = wsj_model.table_multiple_y(wsj_col_result)

In [131]:
# Display the forward-shift WSJ results
wsj_table_shiftf_d

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
0,daily_pol,0.5,0.667285,0.308407


##### Shift Index One Day Back

In [133]:
# Tune 'oos_l2_ols' for WSJ with the embeddings led by one row:
# X = 'wsj*' columns (shifted), Y = US News columns
wsj_beta_col = wsj_merge_d.filter(regex='^wsj').columns.tolist()
y_col = us_news.columns.tolist()

# Explicit copy: the column assignment below then writes to an independent frame instead of
# a selection taken from wsj_merge_d (the pattern behind pandas' SettingWithCopyWarning,
# which this notebook silences globally).
wsj_merge_shiftb_d = wsj_merge_d[wsj_beta_col + y_col].copy()
# shift(-1) moves every embedding row up one position, so each row pairs the next row's
# embeddings with the current row's index value; the last row becomes NaN and is dropped.
wsj_merge_shiftb_d[wsj_beta_col] = wsj_merge_shiftb_d[wsj_beta_col].shift(-1)
wsj_merge_shiftb_d = wsj_merge_shiftb_d.dropna()

wsj_model = Model(data=wsj_merge_shiftb_d, d_to_m=True, beta_col=wsj_beta_col, y_col=y_col, split=0.5, alpha_tune=grid)
wsj_col_result = wsj_model.tune_multiple_y('oos_l2_ols')

2023-12-10 20:02:45,616	INFO worker.py:1673 -- Started a local Ray instance.
[36m(pid=23028)[0m 


In [134]:
# Turn the backward-shift WSJ tuning output into a results table
wsj_table_shiftb_d = wsj_model.table_multiple_y(wsj_col_result)

In [135]:
# Display the backward-shift WSJ results
wsj_table_shiftb_d

Unnamed: 0,Y Variable,alpha,is_r2,oos_r2
0,daily_pol,3.5,0.532906,0.168728
