In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from mrtool import MRData, LinearCovModel
from gkmodel import OverallModel, StudyModel, TwoStageModel, StagewiseModel, result_to_df

In [None]:
# Indicator being modelled; it selects the input file here and the
# `<indicator>_logit*` columns used when the data objects are built.
indicator = 'smoking'
data_path = f'../data/{indicator}.as.csv'
df = pd.read_csv(data_path)

In [None]:
# Preview the first three rows of the loaded data.
df.head(3)

In [None]:
# Stage-1 data: the `<indicator>_logit` column as observations with its
# `_se` companion as standard errors, grouped by `ls_id`, with `sdi` and
# `year_id` loaded as covariates.
stage1_columns = {
    'col_obs': f'{indicator}_logit',
    'col_obs_se': f'{indicator}_logit_se',
    'col_covs': ['sdi', 'year_id'],
    'col_study_id': 'ls_id',
}
data_stage1 = MRData()
data_stage1.load_df(df, **stage1_columns)

In [None]:
# Stage-1 covariates: an intercept plus a spline in `sdi` with five evenly
# spaced knots on [0, 1] and both the left and right linear flags set.
sdi_spline_options = dict(
    use_spline=True,
    spline_knots=np.linspace(0.0, 1.0, 5),
    spline_l_linear=True,
    spline_r_linear=True,
)
cov_models1 = [
    LinearCovModel('intercept'),
    LinearCovModel('sdi', **sdi_spline_options),
]

# Stage-2 covariates: plain intercept and `sdi` terms (no spline).
cov_models2 = [
    LinearCovModel('intercept'),
    LinearCovModel('sdi'),
]

### Using OverallModel and StudyModel separately

In [None]:
# Display the stage-1 data object.
data_stage1

In [None]:
# Stage 1: an OverallModel built from the stage-1 data and the spline covariate models.
model_stage1 = OverallModel(data_stage1, cov_models1)

In [None]:
# Fit the stage-1 model.
model_stage1.fit_model()

In [None]:
# Inspect the fitted stage-1 solution.
model_stage1.soln

In [None]:
# Sort the model's data by data id before extracting results
# (presumably so rows line up with `df` for the index merge -- TODO confirm).
model_stage1.data._sort_by_data_id()
stage1_result = result_to_df(
    model_stage1,
    model_stage1.data,
    prediction='pred_stage1',
    residual='resi_stage1',
)

# Attach the stage-1 prediction and residual columns to the input frame,
# matching rows on the index.
df_stage1 = df.merge(
    stage1_result[['pred_stage1', 'resi_stage1']],
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the input data with the stage-1 prediction and residual columns attached.
df_stage1.head()

In [None]:
# Stage-2 data: the stage-1 residuals become the observations, keeping the
# original standard-error column and `ls_id` grouping; only `sdi` is loaded
# as a covariate.
stage2_columns = {
    'col_obs': 'resi_stage1',
    'col_obs_se': f'{indicator}_logit_se',
    'col_covs': ['sdi'],
    'col_study_id': 'ls_id',
}
data_stage2 = MRData()
data_stage2.load_df(df_stage1, **stage2_columns)

In [None]:
# Stage 2: a StudyModel built from the residual data and the plain intercept + sdi covariate models.
model_stage2 = StudyModel(data_stage2, cov_models2)

In [None]:
# Fit the stage-2 model.
model_stage2.fit_model()

### Using TwoStageModel

In [None]:
# Build new covariate-model objects for the TwoStageModel fit, with the same
# specification as before: intercept + `sdi` spline for stage 1, intercept +
# linear `sdi` for stage 2.
# NOTE(review): presumably rebuilt so no objects are shared with models that
# were already fitted -- confirm whether reuse would be safe.
stage1_spline = LinearCovModel(
    'sdi',
    use_spline=True,
    spline_knots=np.linspace(0.0, 1.0, 5),
    spline_l_linear=True,
    spline_r_linear=True,
)
cov_models1 = [LinearCovModel('intercept'), stage1_spline]
cov_models2 = [
    LinearCovModel('intercept'),
    LinearCovModel('sdi'),
]

In [None]:
# Build a TwoStageModel from the stage-1 data and both covariate-model lists.
tsmodel = TwoStageModel(data_stage1, cov_models1, cov_models2)

In [None]:
# Fit the two-stage model.
tsmodel.fit_model()

In [None]:
# Tabulate the two-stage model's results against its `data1` attribute.
result_to_df(tsmodel, tsmodel.data1)

### Using StagewiseModel

In [None]:
# Build new covariate-model objects for the StagewiseModel fit, with the same
# specification as the earlier sections.
# NOTE(review): presumably rebuilt so no objects are shared with models that
# were already fitted -- confirm whether reuse would be safe.
spline_settings = {
    'use_spline': True,
    'spline_knots': np.linspace(0.0, 1.0, 5),
    'spline_l_linear': True,
    'spline_r_linear': True,
}
cov_models1 = [
    LinearCovModel('intercept'),
    LinearCovModel('sdi', **spline_settings),
]
cov_models2 = [
    LinearCovModel('intercept'),
    LinearCovModel('sdi'),
]

In [None]:
# Node models in order: the OverallModel first, then the StudyModel. Neither
# receives data here; only the StagewiseModel is given `data_stage1`.
node_models = [
    OverallModel(cov_models=cov_models1),
    StudyModel(cov_models=cov_models2),
]
swmodel = StagewiseModel(data_stage1, node_models)

In [None]:
# Fit the stagewise model.
swmodel.fit_model()

In [None]:
# Solution table for index 0 (presumably the first node model, the OverallModel -- TODO confirm).
swmodel.soln_to_df(0)

In [None]:
# Solution table for index 1 (presumably the second node model, the StudyModel -- TODO confirm).
swmodel.soln_to_df(1)

In [None]:
# Tabulate the stagewise model's results.
swmodel.result_to_df()

### Compare Fit

In [None]:
# The stage-1 solution from the standalone fit should match the first-stage
# solutions of the combined models; print each difference for inspection.
diff_vs_two_stage = model_stage1.soln - tsmodel.model1.soln
print(diff_vs_two_stage)
diff_vs_stagewise = model_stage1.soln - swmodel.node_models[0].soln
print(diff_vs_stagewise)

In [None]:
# For every study, the standalone stage-2 solution must agree with the
# second-stage solution of both combined models to a relative error
# below 1e-5.
for ls_id, soln in model_stage2.soln.items():
    rel_err_two_stage = (
        np.linalg.norm(soln - tsmodel.model2.soln[ls_id]) / np.linalg.norm(soln)
    )
    assert rel_err_two_stage < 1e-5

    rel_err_stagewise = (
        np.linalg.norm(soln - swmodel.node_models[1].soln[ls_id]) / np.linalg.norm(soln)
    )
    assert rel_err_stagewise < 1e-5

### Compare prediction

- predictions from OverallModel + StudyModel

In [None]:
# Sort the stage-2 model's data by data id before extracting results
# (presumably so rows line up with `df_stage1` for the index merge -- TODO confirm).
model_stage2.data._sort_by_data_id()
stage2_result = result_to_df(
    model_stage2,
    model_stage2.data,
    prediction='pred_stage2',
    residual='resi_stage2',
)

# Attach the stage-2 prediction and residual columns next to the stage-1
# ones, matching rows on the index.
df_stage2 = df_stage1.merge(
    stage2_result[['pred_stage2', 'resi_stage2']],
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the frame that now carries both stages' prediction and residual columns.
df_stage2.head()

In [None]:
# Combined prediction of the separately fitted models: stage-1 prediction
# plus the stage-2 prediction of the stage-1 residual.
stage1_pred = df_stage2['pred_stage1'].values
stage2_pred = df_stage2['pred_stage2'].values
prediction = stage1_pred + stage2_pred

- prediction from TwoStageModel

In [None]:
# Prediction from the two-stage model with default arguments.
prediction_ts = tsmodel.predict()

In [None]:
# Norm of the difference between the two-stage and the manually combined predictions.
np.linalg.norm(prediction_ts - prediction)

- prediction from stagewise model

In [None]:
# Prediction from the stagewise model with default arguments.
prediction_sw = swmodel.predict()

In [None]:
# Norm of the difference between the stagewise and the manually combined predictions.
np.linalg.norm(prediction_sw - prediction)

- predict with quantile

In [None]:
# Show the default two-stage prediction for comparison with the quantile-based ones below.
prediction_ts

In [None]:
# Two-stage prediction with a slope quantile of 0.15 for `sdi` and no
# reference covariate.
tsmodel.predict(
    slope_quantile={'sdi': 0.15},
    ref_cov=None,
)

In [None]:
# Same slope quantile for `sdi`, now with `year_id` = 2000 passed as the
# reference covariate.
tsmodel.predict(
    slope_quantile={'sdi': 0.15},
    ref_cov=('year_id', 2000),
)

In [None]:
# Baseline stagewise results, tagged so they can be told apart once the
# variants are stacked together.
sw_result = swmodel.result_to_df()
sw_result_base = sw_result.assign(pred_type='base')

In [None]:
# Variant: predictions with a slope quantile of 0.15 for `sdi` and no
# reference covariate, written over a copy of the baseline results.
sw_result_no_adjust = sw_result.assign(
    prediction=swmodel.predict(slope_quantile={'sdi': 0.15}, ref_cov=None),
    pred_type='no_adjust',
)

In [None]:
# Variant: same slope quantile, with `year_id` = 2000 passed as the
# reference covariate, written over a copy of the baseline results.
sw_result_adjusted = sw_result.assign(
    prediction=swmodel.predict(
        slope_quantile={'sdi': 0.15},
        ref_cov=('year_id', 2000),
    ),
    pred_type='adjusted',
)

In [None]:
# Stack the three prediction variants and keep only studies 101 and 102
# for the side-by-side comparison.
prediction_variants = [sw_result_base, sw_result_no_adjust, sw_result_adjusted]
result_compare = (
    pd.concat(prediction_variants)
    .query('study_id == [101, 102]')
)

In [None]:
# Display the stacked comparison table.
result_compare

In [None]:
import plotly.express as px

In [None]:
# One facet row per study, coloured by prediction type, drawn as connected
# markers over `year_id`.
(
    px.scatter(
        result_compare,
        x='year_id',
        y='prediction',
        color='pred_type',
        facet_row='study_id',
        width=700,
        height=1000,
    )
    .update_traces(mode='lines+markers')
)

### More plotting

In [None]:
# Observations vs. combined (stage 1 + stage 2) predictions for a single
# `ls_id` group.
ls_id = 1021
index = df_stage2.ls_id == ls_id
obs = df_stage2[f'{indicator}_logit'].values[index]
year_id = df_stage2['year_id'].values[index]

# Keep the per-group slice under its own name. Rebinding `prediction` here
# would replace the full-length array that the comparison cells rely on, so
# re-running those cells after this one would compare mismatched arrays.
prediction_ls = (
    df_stage2['pred_stage1'].values + df_stage2['pred_stage2'].values
)[index]

plt.scatter(year_id, obs, label='observation')
plt.scatter(year_id, prediction_ls, label='prediction')
plt.xlabel('year_id')
plt.ylabel(f'{indicator}_logit')
plt.legend()
plt.title(f"loc_sex_id: {ls_id}")

In [None]:
# Stack the per-study stage-2 solutions into a single array, one entry
# per row.
study_solutions = list(model_stage2.soln.values())
soln = np.vstack(study_solutions)

In [None]:
# Sorted values of the first solution column against an even 0..1 grid,
# i.e. an empirical cumulative curve across studies.
sorted_intercepts = np.sort(soln[:, 0])
cumulative_share = np.linspace(0.0, 1.0, soln.shape[0])
plt.plot(sorted_intercepts, cumulative_share)
plt.title("random intercept cumulative density")

In [None]:
# Sorted values of the second solution column against an even 0..1 grid,
# i.e. an empirical cumulative curve across studies.
sorted_slopes = np.sort(soln[:, 1])
cumulative_share = np.linspace(0.0, 1.0, soln.shape[0])
plt.plot(sorted_slopes, cumulative_share)
plt.title("random slope cumulative density")