In [None]:
%matplotlib inline
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Root output directories of the two regression runs being compared.
old_root = Path('/ihme/covid-19/seir-pipeline-outputs/regression/2020_06_30.integration_test_2')
new_root = Path('/ihme/covid-19/seir-regression/latest/')

# Betas and coefficients live in identically named sub-directories of each root.
old_betas_path, old_coef_path = (old_root / sub_dir for sub_dir in ('betas', 'coefficients'))
new_betas_path, new_coef_path = (new_root / sub_dir for sub_dir in ('betas', 'coefficients'))

In [None]:
def load_loc_draw_files(root: Path) -> pd.DataFrame:
    """Load draw files stored one directory per location under ``root``.

    Every sub-directory of ``root`` is passed to ``load_draw_files`` and the
    resulting frames are concatenated (original row indices are kept).

    Args:
        root: Directory whose sub-directories each contain draw files.

    Returns:
        A single DataFrame with the rows of every sub-directory's draw files.

    Raises:
        ValueError: If ``root`` contains no sub-directories.
    """
    # iterdir() yields entries in arbitrary order, so sort for a reproducible
    # result; non-directory entries are skipped because they cannot hold draws.
    location_dirs = sorted(path for path in root.iterdir() if path.is_dir())
    if not location_dirs:
        raise ValueError(f'No location directories found in {root}')
    return pd.concat([load_draw_files(loc_dir) for loc_dir in location_dirs])

def load_draw_files(root: Path) -> pd.DataFrame:
    """Load every draw file directly under ``root`` into one DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with a ``draw`` column.
    The draw number is the text after the last underscore of the file stem
    (or the whole stem if it has no underscore) and must parse as an integer.

    Args:
        root: Directory containing the per-draw CSV files.

    Returns:
        The concatenation of all draw frames, in file-name order, with the
        original per-file row indices kept.

    Raises:
        ValueError: If ``root`` contains no files, or a file stem does not
            end in an integer draw number.
    """
    # iterdir() yields entries in arbitrary order, so sort for a reproducible
    # result; sub-directories are skipped because read_csv cannot open them.
    draw_files = sorted(path for path in root.iterdir() if path.is_file())
    if not draw_files:
        raise ValueError(f'No draw files found in {root}')

    data = []
    for draw_file in draw_files:
        df = pd.read_csv(draw_file)
        df['draw'] = int(draw_file.stem.split('_')[-1])
        data.append(df)
    return pd.concat(data)

In [None]:
# Old betas are stored one directory per location. The existing `location_id`
# column is dropped and `loc_id` / `testing_reference` are renamed, presumably
# so the column names line up with the new output.
old_betas = (
    load_loc_draw_files(old_betas_path)
    .drop(columns='location_id')
    .rename(columns={'loc_id': 'location_id', 'testing_reference': 'testing'})
    .sort_values(by=['location_id', 'draw', 'date'])
    .reset_index(drop=True)
)
# New betas are read from a single flat directory of draw files.
new_betas = (
    load_draw_files(new_betas_path)
    .sort_values(by=['location_id', 'draw', 'date'])
    .reset_index(drop=True)
)

In [None]:
# Strict whole-frame comparison of the old and new betas.
# Original note: "Regress is more complicated, this won't work" -- presumably
# meaning exact equality is not expected to hold here (TODO confirm).
old_betas.equals(new_betas)

In [None]:
# Look systematically: compare the frames one column at a time.
for column in old_betas.columns:
    old_values, new_values = old_betas[column], new_betas[column]
    if not isinstance(old_values[0], str):
        # Non-string columns are compared with a tolerance; NaNs in the same
        # position count as equal.
        matches = np.allclose(old_values, new_values, equal_nan=True)
    else:
        # String columns must match exactly.
        matches = old_values.equals(new_values)
    print(column, ': ', matches)

In [None]:
# Beta pred is a bit weird.  How bad are the errors?
# Histogram of the signed difference relative to the old value.
beta_pred_rel_diff = (old_betas['beta_pred'] - new_betas['beta_pred']) / old_betas['beta_pred']
plt.hist(beta_pred_rel_diff, bins=1000)
# Zoom in on the central +/-0.3% of relative difference.
plt.xlim(-.003, .003)
plt.show()

In [None]:
# Errors are unbiased, which is good.  Check out the tails.
# Take the absolute value of the whole ratio (not just the numerator) so the
# summary stays non-negative even where the old beta_pred is negative.
((old_betas['beta_pred'] - new_betas['beta_pred']) / old_betas['beta_pred']).abs().describe()

In [None]:
# We can live with a maximum error of 3% as long as the error is almost always closer to 0.03%.

In [None]:
# Let's see if some of this change is explicable by the coefficients.
# The old files carry an extra 'Unnamed: 0' column, which is dropped, and
# `testing_reference` is renamed to `testing`.
old_coef = (
    load_draw_files(old_coef_path)
    .drop(columns='Unnamed: 0')
    .rename(columns={'testing_reference': 'testing'})
    .sort_values(by=['group_id', 'draw'])
)
new_coef = (
    load_draw_files(new_coef_path)
    .sort_values(by=['group_id', 'draw'])
)

In [None]:
# Old covariate vs new covariate, one panel per column. Dots on the line are good.
# squeeze=False always returns a 2-D array of Axes, so the indexing below also
# works when there is only a single column to plot.
fig, ax_grid = plt.subplots(nrows=len(new_coef.columns), figsize=(5, 25), squeeze=False)
ax = ax_grid[:, 0]
for i, col in enumerate(new_coef.columns):
    # Reference line: the old values plotted against themselves trace y = x.
    ax[i].plot(old_coef[col], old_coef[col])
    ax[i].scatter(new_coef[col], old_coef[col])
    ax[i].set_title(col)
    ax[i].set_xlabel('new')
    ax[i].set_ylabel('old')
fig.tight_layout()
plt.show()

In [None]:
# The discrepancy is all in the non-time-varying covariates, which
# coincidentally are also very small. It all looks feasible to me.