In [None]:
%%capture
import os
import pandas as pd
import numpy as np
from dj_notebook import activate
from pathlib import Path

# Runtime configuration comes from environment variables; an unset variable raises KeyError.
env_file = os.environ["INTECOMM_ENV"]
analysis_folder = Path(os.environ["INTECOMM_ANALYSIS_FOLDER"])
# NOTE(review): the reports folder is derived from the same INTECOMM_ANALYSIS_FOLDER
# value as analysis_folder, so both always point at the same directory -- confirm
# whether a dedicated reports variable was intended.
reports_folder = Path(analysis_folder)
# Hand the dotenv file to dj_notebook's activate(); the returned handle is kept as `plus`.
plus = activate(dotenv_file=env_file)


In [None]:
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.families import Binomial
from statsmodels.stats.proportion import proportion_confint
from intecomm_analytics.dataframes import get_df_main_1858


In [None]:
# Read the CSV named df_bp.csv from the analysis folder and display it.
path = analysis_folder.joinpath('df_bp.csv')
df_bp = pd.read_csv(filepath_or_buffer=path)
df_bp

In [None]:
# Read df_glu_primary.csv from the analysis folder, then show the row count per assignment.
path = analysis_folder.joinpath('df_glu_primary.csv')
df_glucose_gee = pd.read_csv(filepath_or_buffer=path)
df_glucose_gee.groupby(by=["assignment"]).size()


In [None]:
# Outer join keeps every (subject, assignment, time) row found in either frame;
# rows present in only one frame get NaN in the other frame's columns.
merge_keys = ["subject_identifier", "assignment", "time"]
df_gee = df_bp.merge(df_glucose_gee, how="outer", on=merge_keys)
df_gee.groupby(by=["assignment"]).size()

In [None]:
def is_controlled(s):
    """Combine the BP and glucose control flags of one row into a single flag.

    Parameters
    ----------
    s : mapping-like (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys ``"bp_controlled"`` and ``"glucose_controlled"``.
        Either value may be missing (NaN / None).

    Returns
    -------
    bool or float
        * both flags present  -> ``True`` only when both are truthy, else ``False``
        * exactly one present -> that flag, as a plain ``bool``
        * both missing        -> ``np.nan``

    Notes
    -----
    Flags are evaluated by truthiness rather than identity (``is True``), so
    ``np.True_`` or ``1.0`` are treated the same as the Python literal ``True``.
    """
    bp = s["bp_controlled"]
    glucose = s["glucose_controlled"]
    has_bp = pd.notna(bp)
    has_glucose = pd.notna(glucose)

    if has_bp and has_glucose:
        # Controlled overall only when both conditions are controlled.
        return bool(bp) and bool(glucose)
    if has_bp:
        return bool(bp)
    if has_glucose:
        return bool(glucose)
    # Neither measurement is available: the outcome is unknown.
    return np.nan


# Derive the combined outcome row by row, then remove the two source flags.
# The frame is modified in place, so this cell cannot be re-run without first
# rebuilding df_gee (the source columns are gone after the first run).
controlled_flags = df_gee.apply(is_controlled, axis=1)
df_gee["controlled"] = controlled_flags
df_gee.drop(columns=["bp_controlled", "glucose_controlled"], inplace=True)
df_gee

In [None]:
# GEE (binomial family) of the combined outcome on all remaining columns,
# with repeated rows grouped by participant.
dependent_var = 'controlled'

# Rows whose outcome could not be determined carry NaN. GEE is called below
# without any missing-data handling, so drop them explicitly and make the
# outcome numeric (it may be an object column of True/False/NaN).
df = df_gee.dropna(subset=[dependent_var]).copy()
df[dependent_var] = df[dependent_var].astype(float)

# One-hot encode the categorical predictors, dropping the first level of each
# as the reference category. dtype=float because recent pandas versions emit
# boolean dummies, which are not a numeric design matrix.
df = pd.get_dummies(df, columns=['assignment', 'time'], drop_first=True, dtype=float)

# Every remaining column except the outcome and the grouping key is a predictor.
independent_vars = [col for col in df.columns if col not in ['controlled', 'subject_identifier']]

# NOTE(review): no constant column is added to the design matrix, so this model
# is fit without an intercept -- confirm that this is intended.
model = GEE(df[dependent_var], df[independent_vars], groups=df['subject_identifier'], family=Binomial())

# Fit the model
result = model.fit()
print(result.summary())

In [None]:

from statistics import NormalDist

# Crude (unadjusted) difference in the proportion controlled: arm "b" minus arm "a".
# Rows with an undetermined outcome are excluded first so that every proportion,
# event count and denominator below refers to the same set of observations.
df = df_gee.dropna(subset=['controlled']).copy()
df['controlled'] = df['controlled'].astype(float)

outcome_a = df.loc[df['assignment'] == 'a', 'controlled']
outcome_b = df.loc[df['assignment'] == 'b', 'controlled']

# Calculate the crude risk difference
risk_a = outcome_a.mean()
risk_b = outcome_b.mean()
crude_risk_difference = risk_b - risk_a

# Per-arm Wald confidence intervals (kept for reference; they are NOT combined
# to form the interval of the difference).
ci_alpha = 0.05
n_a = outcome_a.shape[0]
n_b = outcome_b.shape[0]
ci_low_a, ci_upp_a = proportion_confint(count=outcome_a.sum(), nobs=n_a, alpha=ci_alpha, method='normal')
ci_low_b, ci_upp_b = proportion_confint(count=outcome_b.sum(), nobs=n_b, alpha=ci_alpha, method='normal')

# Wald interval for the difference of two independent proportions:
#   diff +/- z * sqrt(p_a(1-p_a)/n_a + p_b(1-p_b)/n_b)
# NOTE(review): this treats all rows as independent; df_gee is merged on
# subject and time, so a participant may contribute several rows -- confirm
# that an interval ignoring that correlation is acceptable here.
z_crit = NormalDist().inv_cdf(1 - ci_alpha / 2)
se_diff = np.sqrt(risk_a * (1 - risk_a) / n_a + risk_b * (1 - risk_b) / n_b)
ci_low_diff = crude_risk_difference - z_crit * se_diff
ci_upp_diff = crude_risk_difference + z_crit * se_diff

print(f"Crude Risk Difference: {crude_risk_difference}")
print(f"95% Confidence Interval for Crude Risk Difference: ({ci_low_diff}, {ci_upp_diff})")

In [None]:
# Number of rows per value of the `time` column.
df_gee["time"].value_counts()

In [None]:
####

In [None]:
# Display the merged frame for visual inspection.
df_gee

In [None]:
# Attach participant-level covariates from df_main to each df_gee row.
# A left join keeps every df_gee row; subjects absent from df_main get NaN covariates.
# NOTE(review): assumes df_main has one row per subject_identifier, otherwise
# df_gee rows would be duplicated by the join -- confirm.
df_main = get_df_main_1858(None)
covariate_cols = ["subject_identifier", "group_identifier", "age_in_years", "gender"]
df_smf = pd.merge(df_gee, df_main[covariate_cols], how="left", on="subject_identifier")

In [None]:
# Rows per assignment where hiv == 1 and both dm == 0 and htn == 0.
hiv_without_dm_htn = (df_main.hiv == 1) & ((df_main.dm == 0) & (df_main.htn == 0))
df_main[hiv_without_dm_htn].groupby(by=["assignment"]).size()


In [None]:
# Rows per assignment where hiv == 0 and at least one of dm == 1 or htn == 1.
dm_or_htn_without_hiv = (df_main.hiv == 0) & ((df_main.dm == 1) | (df_main.htn == 1))
df_main[dm_or_htn_without_hiv].groupby(by=["assignment"]).size()

In [None]:
# NOTE(review): hard-coded scratch arithmetic; the two operands are presumably
# counts read off an earlier cell's output -- confirm, and derive the total from
# the data instead of typing the numbers in.
690+679

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
df_smf = df_smf.reset_index(drop=True)

In [None]:
# Keep only the rows whose `time` is "endline", renumber the index, and display.
is_endline = df_smf["time"] == "endline"
df_smf = df_smf.loc[is_endline].reset_index(drop=True)
df_smf


In [None]:
# cluster: first seven characters of group_identifier, converted to int
# (raises if any group_identifier is missing or non-numeric in that prefix).
df_smf["cluster"] = df_smf["group_identifier"].str.slice(0, 7).astype(int)

# group: 1 for assignment "a", 0 for anything else (including missing).
df_smf["group"] = df_smf["assignment"].map(lambda arm: int(arm == "a"))

# outcome: 1 only when `controlled` is exactly True, otherwise 0.
# NOTE(review): a missing `controlled` value (NaN) is therefore coded 0, i.e.
# counted as not controlled rather than excluded -- confirm this is intended.
df_smf["outcome"] = df_smf["controlled"].map(lambda flag: int(flag is True))
df_smf.dtypes

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf


# Define the correlation structure
# Working correlation structure: exchangeable within each cluster.
ind = sm.cov_struct.Exchangeable()

# Binomial family with a logit link.
# (An identity link, sm.families.links.Identity(), was tried previously and is
# not used here.)
binomial_logit = sm.families.Binomial(link=sm.families.links.Logit())

# GEE of outcome on group, age and gender, with observations grouped by `cluster`.
model = smf.gee(
    formula="outcome ~ group + age_in_years + gender",
    groups="cluster",
    data=df_smf,
    cov_struct=ind,
    family=binomial_logit,
)
result = model.fit()

# Print the summary
print(result.summary())


In [None]:
# Fit the Logit model with regularization
# Plain logistic regression on the same formula, fitted through fit_regularized()
# with its default arguments. Note that `model` ends up holding the fitted
# result, not the model specification.
# NOTE(review): no penalty weight is passed, so the amount of regularization is
# whatever the statsmodels default gives -- confirm it is what was intended.
logit_spec = smf.logit("outcome ~ group + age_in_years + gender", data=df_smf)
model = logit_spec.fit_regularized()
print(model.summary())