In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as iter

from regmod.data import Data
from regmod.models import BinomialModel
from regmod.variable import Variable, SplineVariable
from regmod.utils import SplineSpecs

from pplkit.data.interface import DataInterface

In [None]:
# Environment reminder: `source activate FILEPATH`
# Print the path of the `python` executable found on the shell PATH.
!which python

In [None]:
# Root folder of the ticket this notebook works on.
ticket = Path("FILEPATH")
# Data interface built with two named locations: the ticket root and its
# "data" subfolder. NOTE(review): presumably `dataif.load_data(...)` used
# later resolves files against the "data" entry -- confirm against pplkit.
dataif = DataInterface(ticket=ticket, data=ticket / "data")

In [None]:
def pre_process_cf(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape paired VR/CR cause fractions from wide to long format.

    The input carries, per row, the cause fractions ``cf_vr``/``cf_cr`` and
    the sample sizes ``sample_vr``/``sample_cr``. The output carries one row
    per non-missing cause fraction, tagged with its source in ``cf_type``.

    Parameters
    ----------
    df
        Wide frame with the columns ``cause_id``, ``location_id``,
        ``age_group_id``, ``sex_id``, ``year_id``, ``cf_vr``, ``cf_cr``,
        ``sample_vr`` and ``sample_cr``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Long frame with the id columns, ``cf_type`` ("vr" or "cr"), ``cf``,
        ``is_cr`` (1.0 on "cr" rows, 0.0 otherwise) and ``sample_size``.
        ``year_id`` is rescaled so that 1980 maps to 0.0 and 2020 to 1.0.
    """
    id_vars = ["cause_id", "location_id", "age_group_id", "sex_id", "year_id"]

    # Rescale on a new frame instead of writing into the caller's object:
    # an in-place write would leak the scaled years back to the caller and
    # make a second call rescale already-scaled values.
    df = df.assign(year_id=(df["year_id"] - 1980) / (2020 - 1980))

    # Sample sizes in long format, one row per (ids, cf_type).
    df_ss = pd.melt(
        df.rename(columns={"sample_vr": "vr", "sample_cr": "cr"}),
        id_vars=id_vars,
        value_vars=["vr", "cr"],
        var_name="cf_type",
        value_name="sample_size",
    )

    # Cause fractions in long format, same keys as the sample sizes.
    df_cf = pd.melt(
        df.rename(columns={"cf_vr": "vr", "cf_cr": "cr"}),
        id_vars=id_vars,
        value_vars=["vr", "cr"],
        var_name="cf_type",
        value_name="cf",
    )

    # Keep only the observed cause fractions.
    df_cf = df_cf[df_cf["cf"].notna()].reset_index(drop=True)
    df_cf["is_cr"] = (df_cf["cf_type"] == "cr").astype(float)

    # Attach the matching sample size to every remaining observation.
    df_cf = df_cf.merge(
        df_ss,
        on=id_vars + ["cf_type"],
        how="left",
    )
    return df_cf

def adjust_cf_cr(
    df: pd.DataFrame,
    variables: "list[Variable]",
) -> pd.DataFrame:
    """Adjust cause fractions using the fitted ``is_cr`` coefficient.

    A ``BinomialModel`` is fitted to ``cf`` with ``sample_size`` as weights
    and ``variables`` as covariates of the parameter ``p``. The coefficient
    found at the position of ``is_cr`` is then used to rescale the odds of
    every row flagged ``is_cr == 1``; rows with ``is_cr == 0`` get a factor
    of ``exp(0) == 1`` and keep their ``cf``.

    Parameters
    ----------
    df
        Long frame with at least ``cf``, ``cf_type``, ``is_cr``,
        ``sample_size`` and one column per covariate in ``variables``.
        The frame is copied; the caller's object is not modified.
    variables
        Model variables; one of them must be named ``"is_cr"``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with ``cf_adjusted`` added. When more than one
        ``cf_type`` is present, ``cf_vr_pred`` (model prediction with
        ``is_cr`` forced to 0) is added as well. When only one ``cf_type``
        is present, no model is fitted and ``cf_adjusted`` equals ``cf``.

    Raises
    ------
    ValueError
        If no variable is named ``"is_cr"`` (raised by ``list.index``).
    """
    df = df.copy()

    # With a single source there is nothing to contrast, so skip the model.
    cf_types = df["cf_type"].unique()
    if len(cf_types) == 1:
        print(f"all data are {cf_types[0]}")
        df["cf_adjusted"] = df["cf"]
        return df

    col_covs = [v.name for v in variables]
    data = Data(
        col_obs="cf",
        col_covs=col_covs,
        col_weights="sample_size",
        df=df,
    )
    model = BinomialModel(data, param_specs={"p": {"variables": variables}})
    try:
        model.fit()
    except ValueError:
        # Best effort: the failure is reported and the code carries on with
        # whatever coefficients the model holds.
        # NOTE(review): presumably raised while computing the covariance,
        # after the coefficients were already stored -- confirm in regmod.
        print("singular vcov")

    # NOTE(review): looking the coefficient up by variable position assumes
    # one coefficient per variable -- confirm before using spline variables.
    index = col_covs.index("is_cr")
    adjustment = np.exp(df["is_cr"] * model.opt_coefs[index])
    # Multiply the odds (1 - cf) / cf by `adjustment` and map back to a
    # fraction.
    df["cf_adjusted"] = df["cf"] / ((1 - df["cf"]) * adjustment + df["cf"])

    # Prediction with the `is_cr` flag switched off for every row.
    df_pred_data = df.copy()
    df_pred_data["is_cr"] = 0.0
    df["cf_vr_pred"] = model.predict(df_pred_data)["p"]

    # The original built a column subset here and discarded the result, so
    # the full frame has always been returned; that no-op was removed.
    return df



In [None]:
# Run the CR adjustment separately for every
# (cause, location, age group, sex) combination in the overlap data.
df = dataif.load_data("overlap_one.csv")
df_group = df.groupby(["cause_id", "location_id", "age_group_id", "sex_id"], as_index=False)

results = list()
failed_keys = list()  # group keys that raised, kept for inspection afterwards
for key, df_sub in df_group:
    try:
        # Work on a copy so that any in-place edit made during
        # pre-processing stays off the groupby slice.
        df_sub = pre_process_cf(df_sub.copy())
        df_sub = adjust_cf_cr(
            df_sub,
            variables=[
                Variable("intercept"),
                Variable("is_cr"),
            ],
        )
        results.append(df_sub)
    except Exception as e:
        # Deliberate best effort: one failing group must not stop the run.
        failed_keys.append(key)
        print(f"this key failed: {key}")
        print(e)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually happened.
if not results:
    raise ValueError(
        f"no group was processed successfully ({len(failed_keys)} failed); nothing to concatenate"
    )

# NOTE(review): reset_index() without drop=True keeps each group's old row
# labels as an "index" column. Kept to preserve the output schema -- consider
# pd.concat(results, ignore_index=True) if that column is not wanted.
final_result = pd.concat(results).reset_index()

# Undo the year scaling applied in pre_process_cf; the two constants must
# stay in sync with that function.
final_result['year_id'] = (final_result['year_id'] * (2020 - 1980)) + 1980

# Round values to the nearest whole number and convert to integers
columns_to_convert = ['cause_id', 'location_id', 'age_group_id', 'sex_id', 'year_id']
final_result[columns_to_convert] = final_result[columns_to_convert].round().astype('int')

In [None]:
# Write the adjusted cause fractions out to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as an extra leading column -- confirm that is wanted.
final_result.to_csv(r"FILEPATH")