In [303]:
import pandas as pd
import numpy as np

In [304]:
from sklearn.metrics import mean_squared_error

In [305]:
import os

In [306]:
# Reference values: the 'cpi_pct' column of datasets/test_set.csv as a NumPy array.
y_true = pd.read_csv('datasets/test_set.csv').loc[:, 'cpi_pct'].to_numpy()

In [307]:
# Score every PCA-based prediction file against y_true and collect one
# {'model', 'rmse'} record per file. Files named in the skip list below
# (EBIC and shuffled-CV LASSO runs) are ignored.
prediction = []
# os.listdir returns entries in arbitrary order; sorting keeps the printed log
# and the row order of the resulting table reproducible across machines.
for file_name in sorted(os.listdir('predictions/')):

    if not file_name.startswith('pca_'):
        continue

    # skip uninformative ebic results and shuffled results
    if file_name in ["pca50_lasso_ebic.csv", "pca50_lasso_cvshuffled.csv", 'pca_lasso_cvshuffled.csv', 'pca_lasso_ebic.csv']:
        continue

    y_pred = pd.read_csv(f'predictions/{file_name}', index_col=0).to_numpy()

    # Model name = file name without its extension.
    name = file_name.rsplit('.', 1)[0]
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the square root of the MSE yields the same RMSE for the
    # single-output target used here and works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(name, "rmse:", round(rmse, 7))
    prediction.append({'model': name, 'rmse': rmse})

pca_bma rmse: 0.0030046
pca_lasso_bic rmse: 0.0024414
pca_lasso_cv rmse: 0.0024375
pca_ols rmse: 0.0030814
pca_ridge_cv rmse: 0.0024442


In [308]:
# One row per scored model, indexed by model name. Naming the columns explicitly
# yields an empty table when no prediction file was scored, instead of a
# KeyError on the missing 'model' column.
score_df = pd.DataFrame(prediction, columns=['model', 'rmse']).set_index('model')

In [309]:
# Gap between each model's RMSE and the lowest RMSE in the table (0 for the best model).
score_df['diff to best'] = score_df['rmse'].sub(score_df['rmse'].min())

In [310]:
# Display the score table (still in the order the models were scored, not ranked).
score_df

Unnamed: 0_level_0,rmse,diff to best
model,Unnamed: 1_level_1,Unnamed: 2_level_1
pca_bma,0.003005,0.000567
pca_lasso_bic,0.002441,4e-06
pca_lasso_cv,0.002438,0.0
pca_ols,0.003081,0.000644
pca_ridge_cv,0.002444,7e-06


In [311]:
# Persist the RMSE ranking (lowest RMSE first) and display it.
# Create the output folder first so a fresh checkout does not fail inside to_csv.
os.makedirs('results', exist_ok=True)
# Sort once and reuse the result for both the export and the cell output.
score_df_sorted = score_df.sort_values('rmse')
score_df_sorted.to_csv('results/1970_2023_pca_model_scores_sorted.csv')
score_df_sorted

Unnamed: 0_level_0,rmse,diff to best
model,Unnamed: 1_level_1,Unnamed: 2_level_1
pca_lasso_cv,0.002438,0.0
pca_lasso_bic,0.002441,4e-06
pca_ridge_cv,0.002444,7e-06
pca_bma,0.003005,0.000567
pca_ols,0.003081,0.000644


In [312]:
# Model names ordered from lowest to highest RMSE; the first entry is the best model.
sorted_cols = score_df.sort_values(by='rmse', ascending=True).index
sorted_cols[0]

'pca_lasso_cv'

2000-2023 model:

- benchmark1_mean rmse: 0.0026948
- benchmark2_prev rmse: 0.0034082
- bma rmse: 0.002297
- lasso_bic rmse: 0.0023243
- lasso_ebic rmse: 0.0023711
- lasso_cv rmse: 0.0027338
- lasso_cvshuffled rmse: 0.0027338
- ols rmse: 0.0279417
- ridge_cv rmse: 0.0024787

1970-2023 model:

- benchmark1_mean rmse: 0.0032184
- benchmark2_prev rmse: 0.0026672
- benchmark3_rf rmse: 0.0020702
- bma rmse: 0.0020429
- lasso_bic rmse: 0.0021015
- lasso_cv rmse: 0.0021016
- lasso_cvshuffled rmse: 0.0019987
- lasso_ebic rmse: 0.0021058
- ols rmse: 0.0030478
- ridge_cv rmse: 0.0021375

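1970-2023 model (PCA):

Note: the figures listed below repeat the 2000-2023 list (with benchmark3_rf taken from the 1970-2023 list) and do not match the PCA scores computed above (pca_lasso_cv 0.0024375, pca_lasso_bic 0.0024414, pca_ridge_cv 0.0024442, pca_bma 0.0030046, pca_ols 0.0030814) — TODO: refresh this list.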

- benchmark1_mean rmse: 0.0026948
- benchmark2_prev rmse: 0.0034082
- benchmark3_rf rmse: 0.0020702
- bma rmse: 0.002297
- lasso_bic rmse: 0.0023243
- lasso_ebic rmse: 0.0023711
- lasso_cv rmse: 0.0027338
- lasso_cvshuffled rmse: 0.0027338
- ols rmse: 0.0279417
- ridge_cv rmse: 0.0024787

In [313]:
# Collect the coefficient table of every PCA model: one single-column DataFrame
# per 'pca_*coefs' file, with the column named after the model.
df_list = []

# Sorted so the printed log and the column order of the combined table do not
# depend on the arbitrary order returned by os.listdir.
for file_name in sorted(os.listdir('models/coefs/')):

    # Filter on the file name before touching the disk, so entries that are
    # not PCA coefficient files are never parsed.
    if not file_name.startswith('pca_'):
        continue

    name = file_name.rsplit('.', 1)[0]
    if not name.endswith('coefs'):
        continue

    coefs = pd.read_csv(f'models/coefs/{file_name}', index_col=0)

    # Rename the 'intercept' row label to 'Intercept' for the BMA files
    # (presumably to match the label used by the other models — confirm).
    if file_name in ["bma_coefs.csv", "pca_bma_coefs.csv", "pca50_bma_coefs.csv"]:
        coefs = coefs.rename({'intercept': 'Intercept'})

    # Drop the trailing '_coefs' so the column carries the bare model name.
    # This assignment requires the file to hold exactly one data column.
    coefs.columns = [name[:-6]]

    print(name)
    df_list.append(coefs)


pca_bma_coefs
pca_lasso_bic_coefs
pca_lasso_cvshuffled_coefs
pca_lasso_cv_coefs
pca_lasso_ebic_coefs
pca_ols_coefs
pca_ridge_cv_coefs


In [314]:
# Place the per-model coefficient columns side by side, aligned on their row labels.
# NOTE(review): the literal -0 is simply the integer 0, so this is replace(0, 0) —
# presumably meant to normalise negative zeros; confirm it has the intended effect.
df = pd.concat(df_list, axis='columns').replace(to_replace=-0, value=0)

In [315]:
# Row labels ordered by absolute coefficient size in the sorted_cols[0] column, largest first.
sorted_idx = df.abs().sort_values(by=sorted_cols[0], ascending=False).index

In [316]:
# Export the coefficient table with rows ordered by sorted_idx and columns by sorted_cols.
df.loc[sorted_idx, sorted_cols].to_csv(path_or_buf='results/all_pca_model_coefs_sorted.csv')

In [317]:
# Preview the first 40 rows of the reordered coefficient table.
df.loc[sorted_idx, sorted_cols].iloc[:40]

model,pca_lasso_cv,pca_lasso_bic,pca_ridge_cv,pca_bma,pca_ols
Intercept,0.003175,0.003175,0.003175,0.003175033,0.003175
pca_0,-0.001546,-0.001536,-0.001067,-0.001721865,-0.00172
pca_3,0.000826,0.000816,0.000621,0.0009995287,0.001
pca_151,0.000435,0.000425,0.000378,0.0005932144,0.00061
pca_5,-0.000381,-0.000372,-0.000345,-0.0004560964,-0.000556
pca_2,0.000363,0.000353,0.000334,0.0003790919,0.000538
pca_1,-0.000324,-0.000314,-0.000309,-0.0001906706,-0.000498
pca_13,-0.000274,-0.000264,-0.000278,-4.483833e-05,-0.000448
pca_18,-0.00025,-0.00024,-0.000264,-1.776634e-05,-0.000425
pca_12,-0.000238,-0.000228,-0.000256,-1.268814e-05,-0.000413


In [318]:
# NOTE(review): bare list literal with no assignment — it only echoes these
# names as the cell output. Looks like leftover scratch; consider removing.
['pca_bma', 'pca_lasso_bic', 'pca_lasso_cvshuffled', 'pca_lasso_cv',
       'pca_lasso_ebic', 'pca_ols', 'pca_ridge_cv']

['pca_bma',
 'pca_lasso_bic',
 'pca_lasso_cvshuffled',
 'pca_lasso_cv',
 'pca_lasso_ebic',
 'pca_ols',
 'pca_ridge_cv']