In [None]:
%%capture
import os
from pathlib import Path
import pandas as pd

from dj_notebook import activate

# Required environment variables; a missing one raises KeyError right here.
env_file = os.environ["INTECOMM_ENV"]
# NOTE(review): reports_folder is not used by any other cell visible in this notebook.
reports_folder = Path(os.environ["INTECOMM_REPORTS_FOLDER"])
# Output directory for every CSV/TXT file written further down.
analysis_folder = Path(os.environ["INTECOMM_ANALYSIS_FOLDER"])
# NOTE(review): presumably this boots the Django project described by the dotenv
# file so that the project imports in later cells work -- confirm against dj_notebook.
plus = activate(dotenv_file=env_file)


In [None]:
"""
Assumptions
1. fasting must be >=8hrs
2. take latest measurement if more than one at endline at same timepoint
3. two records need units to be changed

"""

In [None]:
from intecomm_analytics.dataframes import get_df_main_1858
from tabulate import tabulate
from edc_constants.constants import NO, YES
from intecomm_analytics.dataframes import treatment_arm_labels as treatment_arm
from intecomm_rando.constants import FACILITY_ARM, COMMUNITY_ARM



In [None]:
# Sentences describing the cohort; appended to by the cells below and printed as a block.
narrative = []

# boundaries for first measurement
# NOTE(review): presumably days relative to baseline; not referenced by any other visible cell.
baseline_lower_bound = -180
baseline_upper_bound = 31

# boundaries for last measurement
# NOTE(review): not referenced by any other visible cell; the endline filters below hard-code 182.
endline_lower_bound = 182

# boundaries for diagnosis (days before baseline; interpolated into the narrative sentences)
days_since_dx = 180

# boundaries for fasting (hours; see assumption 1 above)
# NOTE(review): not referenced by any other visible cell.
fasting_hours = 8



In [None]:
# get 1858
# NOTE(review): "1858" presumably refers to the number of subjects in the main
# dataset, and the meaning of the None argument is defined by get_df_main_1858 -- confirm.
df_main_original = get_df_main_1858(None)
# Work on a copy so the frame as loaded stays available unchanged.
df_main = df_main_original.copy()


In [None]:
# Number of rows per distinct value of `country` (display only).
df_main.country.value_counts()

In [None]:
# Rows flagged dm_scr == 1 out of all rows in df_main.
dm_scr_count = len(df_main.loc[df_main.dm_scr == 1])
total_count = len(df_main)
msg = f"There are {dm_scr_count}/{total_count} subjects with DM reported at screening"
print(msg)
narrative.append(msg)


In [None]:
# Rows with dm == 1 out of all rows in df_main.
dm_count = len(df_main.loc[df_main.dm == 1])
total_count = len(df_main)
msg = f"There are {dm_count}/{total_count} subjects with DM confirmed at baseline and diagnosed at least {days_since_dx} days before baseline"
print(msg)
narrative.append(msg)


In [None]:
# HIV-negative rows (hiv == 0) among the rows with dm == 1.
has_dm = df_main.dm == 1
dm_hiv_neg_count = len(df_main.loc[has_dm & (df_main.hiv == 0)])
dm_count = len(df_main.loc[has_dm])
# NOTE(review): the wording "with All confirmed" may have been meant as "with DM confirmed" -- confirm before publishing.
msg = f"There are {dm_hiv_neg_count}/{dm_count} HIV(-) subjects with All confirmed at baseline and diagnosed at least {days_since_dx} days before baseline"
print(msg)
narrative.append(msg)



In [None]:
# Rows with htn == 0 among the HIV-negative rows with dm == 1.
dm_hiv_neg = (df_main.dm == 1) & (df_main.hiv == 0)
dm_only_count = len(df_main.loc[dm_hiv_neg & (df_main.htn == 0)])
dm_hiv_neg_count = len(df_main.loc[dm_hiv_neg])
msg = f"There are {dm_only_count}/{dm_hiv_neg_count} HIV(-) subjects with DM ONLY confirmed at baseline and diagnosed at least {days_since_dx} days before baseline"
print(msg)
narrative.append(msg)


In [None]:
# Show every narrative sentence collected so far, one per line.
print("\n".join(narrative))

In [None]:
# Number of rows with hiv == 0 and dm == 1; the same filter selects df_dm_htn further down.
len(df_main[(df_main.hiv==0) & (df_main.dm==1)])

In [None]:
def get_cells_for_continuous_var(df)->list[str]:
    """Format summary statistics as three table cells.

    ``df`` is indexed by the labels produced by ``describe()`` on a numeric
    column ('count', 'mean', 'std', '50%', 'min', 'max').

    Returns ``[n, "mean(sd)", "median(min–max)"]`` with two decimals for
    everything except ``n``.
    """
    n = int(df['count'])
    mean, sd = df['mean'], df['std']
    median, lowest, highest = df['50%'], df['min'], df['max']
    return [
        str(n),
        f"{mean:.2f}({sd:.2f})",
        f"{median:.2f}({lowest:.2f}–{highest:.2f})",
    ]

def _yes_no_counts(df:pd.DataFrame, col:str, arm:str|None=None):
    """Return ``(n, counts, percentages)`` for the non-null values of ``col``.

    When ``arm`` is truthy only rows whose ``assignment`` equals ``arm`` are
    considered. ``percentages`` is the normalised value count times 100.
    """
    keep = df[col].notna()
    if arm:
        keep = (df['assignment'] == arm) & keep
    # Select once; the same values feed n, the counts and the percentages.
    values = df.loc[keep, col]
    return len(values), values.value_counts(), values.value_counts(normalize=True) * 100


def get_cells_for_yes_no(df:pd.DataFrame, col:str, arm:str|None=None)->list[int|str]:
    """Return the table cells ``[n, Yes, No]`` for a YES/NO column.

    Args:
        df: frame holding ``col`` (and ``assignment`` when ``arm`` is given).
        col: column whose values are compared against ``YES`` / ``NO``.
        arm: optional ``assignment`` value to restrict the rows to.

    Returns:
        ``n`` as an int (non-null rows), followed by "count (pct%)" strings
        for YES and NO; a value that does not occur is reported as "0 (0.0%)".
    """
    n, counts, percentages = _yes_no_counts(df, col, arm)
    return [
        n,
        f"{counts.get(YES, 0)} ({percentages.get(YES, 0):.1f}%)",
        f"{counts.get(NO, 0)} ({percentages.get(NO, 0):.1f}%)"]


def get_cells_for_yes_no_missing(df:pd.DataFrame, col:str, arm:str|None=None)->list[int|str]:
    """Return the table cells ``[n, Yes, No, Missing]`` for a YES/NO column.

    Same as ``get_cells_for_yes_no`` plus a fourth cell for rows whose value
    is the literal string 'Missing'. Null values are excluded from ``n`` and
    from the percentages, so they are not what the 'Missing' cell reports.
    """
    n, counts, percentages = _yes_no_counts(df, col, arm)
    return [
        n,
        f"{counts.get(YES, 0)} ({percentages.get(YES, 0):.1f}%)",
        f"{counts.get(NO, 0)} ({percentages.get(NO, 0):.1f}%)",
        f"{counts.get('Missing', 0)} ({percentages.get('Missing', 0):.1f}%)"]


def get_formatted_rows_glucose(df, col_baseline:str|None=None, col_endline:str|None=None, min_onstudy_days:int=182):
    """Build the baseline/endline summary columns for one continuous measure.

    Baseline statistics use every row of ``df``; endline statistics use only
    rows with ``onstudy_days >= min_onstudy_days``.

    Args:
        df: frame with ``assignment``, ``onstudy_days`` and the two columns
            named below.
        col_baseline: column summarised at baseline. Despite the ``None``
            default it must be supplied (``df[None]`` raises ``KeyError``).
        col_endline: column summarised at endline; must also be supplied.
        min_onstudy_days: minimum ``onstudy_days`` for a row to count towards
            endline. Defaults to 182, the value that used to be hard-coded.

    Returns:
        dict with 5 columns ('Timepoint', 'Statistics', the two arm labels
        and 'All'), six cells each: n / Mean(sd) / Median(min-max) for
        baseline, then the same three for endline.
    """
    def summarise(frame, col):
        # Cells for assignment 'a', assignment 'b' and all rows, in that order.
        return [
            get_cells_for_continuous_var(subset[col].describe())
            for subset in (
                frame[frame['assignment'] == 'a'],
                frame[frame['assignment'] == 'b'],
                frame,
            )
        ]

    baseline_a, baseline_b, baseline_all = summarise(df, col_baseline)
    # Nothing here mutates the frames, so the boolean selection needs no extra copy.
    df_end = df[df["onstudy_days"] >= min_onstudy_days]
    endline_a, endline_b, endline_all = summarise(df_end, col_endline)

    # NOTE(review): assignment 'a' is reported under the COMMUNITY_ARM label and
    # 'b' under FACILITY_ARM -- confirm this mapping against the rando constants.
    return  {
        'Timepoint': ['Baseline', '', '', 'Endline', '', ''],
        'Statistics': ['n', 'Mean(sd)', 'Median(min-max)','n', 'Mean(sd)', 'Median(min-max)'],
        treatment_arm[COMMUNITY_ARM]: [*baseline_a, *endline_a],
        treatment_arm[FACILITY_ARM]: [*baseline_b, *endline_b],
        'All': [*baseline_all, *endline_all],
    }

def get_formatted_rows_yes_no(df_base:pd.DataFrame,df_end:pd.DataFrame, baseline_col:str, endline_col:str, missing:bool|None=None):
    """Build the baseline/endline columns for a YES/NO measure.

    ``df_base`` / ``baseline_col`` feed the baseline cells and ``df_end`` /
    ``endline_col`` the endline cells. With ``missing`` truthy each timepoint
    gets a fourth 'Missing' row. Returns 5 columns: 'Timepoint',
    'Statistics', the two arm labels and 'All'.
    """
    if missing:
        func = get_cells_for_yes_no_missing
        stat_labels = ['n', 'Yes', 'No', "Missing"]
    else:
        func = get_cells_for_yes_no
        stat_labels = ['n', 'Yes', 'No']
    # Only the first row of each timepoint carries the timepoint label.
    blanks = [''] * (len(stat_labels) - 1)

    def cells(arm=None):
        # Baseline cells followed by endline cells for one arm (None = all rows).
        return [
            *func(df_base, baseline_col, arm=arm),
            *func(df_end, endline_col, arm=arm),
        ]

    return {
        'Timepoint': ['Baseline', *blanks, 'Endline', *blanks],
        'Statistics': stat_labels + stat_labels,
        treatment_arm[COMMUNITY_ARM]: cells("a"),
        treatment_arm[FACILITY_ARM]: cells("b"),
        'All': cells(),
    }

In [None]:
# create df_main filtered by condition
# HIV-negative rows with dm == 1, whatever their htn value. The previous
# predicate `(dm==1) | ((dm==1) & (htn==1))` reduces to `dm==1`, so it is
# written that way here; the selected rows are the same.
df_dm_htn = df_main[(df_main.hiv == 0) & (df_main.dm == 1)].reset_index(drop=True)

In [None]:
# Counts of subjects with a first (baseline) result, a last (endline) result, and both.
# The lookups are done outside the f-strings: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
has_first = df_dm_htn.glucose_value_baseline.notna()
has_last = df_dm_htn.glucose_value_endline.notna()
subject_ids = df_dm_htn["subject_identifier"]
print(f"{subject_ids[has_first].count()} first results")
print(f"{subject_ids[has_last].count()} last results")
print(f"{subject_ids[has_first & has_last].count()} first and last results")

In [None]:
# Export the filtered cohort (note the file is named df_htn_dm.csv, not df_dm_htn.csv).
path = analysis_folder / "df_htn_dm.csv"
df_dm_htn.to_csv(path, index=False)


In [None]:
# Split the cohort by hypertension flag: htn == 1 versus htn == 0.
# NOTE(review): a row whose htn is neither 0 nor 1 (e.g. missing) lands in neither subset.
df_dm_and_htn = df_dm_htn[(df_dm_htn.htn==1) & (df_dm_htn.dm==1)].copy()
df_dm_only = df_dm_htn[(df_dm_htn.dm==1) & (df_dm_htn.htn==0)].copy()


In [None]:
# All: glucose level summary over every row of df_dm_htn
glucose_table = {
    'Condition': ['All'] + [''] * 5,
    'Parameter': ['Glucose level (mmol/L)'] + [''] * 5,
    **get_formatted_rows_glucose(
        df_dm_htn,
        col_baseline="glucose_value_baseline",
        col_endline="glucose_value_endline",
    ),
}
table_dm_htn_df = pd.DataFrame(glucose_table)
table_dm_htn_df


In [None]:
# DM and HTN: glucose level summary over df_dm_and_htn
glucose_table = {
    'Condition': ['DM and HTN'] + [''] * 5,
    'Parameter': ['Glucose level (mmol/L)'] + [''] * 5,
    **get_formatted_rows_glucose(
        df_dm_and_htn,
        col_baseline="glucose_value_baseline",
        col_endline="glucose_value_endline",
    ),
}
table_dm_and_htn_df = pd.DataFrame(glucose_table)
table_dm_and_htn_df


In [None]:
# DM only: glucose level summary over df_dm_only
glucose_table = {
    'Condition': ['DM only'] + [''] * 5,
    'Parameter': ['Glucose level (mmol/L)'] + [''] * 5,
    **get_formatted_rows_glucose(
        df_dm_only,
        col_baseline="glucose_value_baseline",
        col_endline="glucose_value_endline",
    ),
}
table_dm_only_df = pd.DataFrame(glucose_table)
table_dm_only_df


In [None]:
# Glucose resulted -- All
# Yes/No table for the glucose_resulted_* columns; the endline frame keeps
# only rows with onstudy_days >= 182.
glucose_table = {
    'Condition': ['All'] + [''] * 5,
    'Parameter': ['Glucose measured'] + [''] * 5,
    **get_formatted_rows_yes_no(
        df_dm_htn,
        df_dm_htn[df_dm_htn["onstudy_days"] >= 182],
        baseline_col="glucose_resulted_baseline",
        endline_col="glucose_resulted_endline",
        missing=False,
    ),
}
table_dm_htn_available_df = pd.DataFrame(glucose_table)
table_dm_htn_available_df


In [None]:
# Glucose resulted -- DM and HTN
# Endline frame keeps only rows with onstudy_days >= 182.
glucose_table = {
    'Condition': ['DM and HTN'] + [''] * 5,
    'Parameter': ['Glucose measured'] + [''] * 5,
    **get_formatted_rows_yes_no(
        df_dm_and_htn,
        df_dm_and_htn[df_dm_and_htn["onstudy_days"] >= 182],
        baseline_col="glucose_resulted_baseline",
        endline_col="glucose_resulted_endline",
        missing=False,
    ),
}
table_dm_and_htn_available_df = pd.DataFrame(glucose_table)
table_dm_and_htn_available_df


In [None]:
# Glucose resulted -- DM only
# Endline frame keeps only rows with onstudy_days >= 182.
glucose_table = {
    'Condition': ['DM only'] + [''] * 5,
    'Parameter': ['Glucose measured'] + [''] * 5,
    **get_formatted_rows_yes_no(
        df_dm_only,
        df_dm_only[df_dm_only["onstudy_days"] >= 182],
        baseline_col="glucose_resulted_baseline",
        endline_col="glucose_resulted_endline",
        missing=False,
    ),
}
table_dm_only_available_df = pd.DataFrame(glucose_table)
table_dm_only_available_df


In [None]:
# Glucose controlled -- All
# Baseline: rows with a baseline glucose value. Endline: rows with an endline
# glucose value and onstudy_days >= 182.
glucose_table = {
    'Condition': ['All'] + [''] * 5,
    'Parameter': ['Glucose < 7 mmol/L'] + [''] * 5,
    **get_formatted_rows_yes_no(
        df_dm_htn[df_dm_htn.glucose_value_baseline.notna()],
        df_dm_htn[df_dm_htn.glucose_value_endline.notna() & (df_dm_htn["onstudy_days"] >= 182)],
        baseline_col="glucose_controlled_baseline",
        endline_col="glucose_controlled_endline",
        missing=False,
    ),
}
table_dm_htn_controlled_df = pd.DataFrame(glucose_table)
table_dm_htn_controlled_df


In [None]:
# Glucose controlled -- DM and HTN
# Baseline: rows with a baseline glucose value. Endline: rows with an endline
# glucose value and onstudy_days >= 182.
glucose_table = {
    'Condition': ['DM and HTN'] + [''] * 5,
    'Parameter': ['Glucose < 7 mmol/L'] + [''] * 5,
    **get_formatted_rows_yes_no(
        df_dm_and_htn[df_dm_and_htn.glucose_value_baseline.notna()],
        df_dm_and_htn[df_dm_and_htn.glucose_value_endline.notna() & (df_dm_and_htn["onstudy_days"] >= 182)],
        baseline_col="glucose_controlled_baseline",
        endline_col="glucose_controlled_endline",
        missing=False,
    ),
}
table_dm_and_htn_controlled_df = pd.DataFrame(glucose_table)
table_dm_and_htn_controlled_df


In [None]:


# Glucose controlled -- DM only
# Baseline: rows with a baseline glucose value. Endline: rows with an endline
# glucose value and onstudy_days >= 182.
glucose_table = {
    'Condition': ['DM only'] + [''] * 5,
    'Parameter': ['Glucose < 7 mmol/L'] + [''] * 5,
    **get_formatted_rows_yes_no(
        df_dm_only[df_dm_only.glucose_value_baseline.notna()],
        df_dm_only[df_dm_only.glucose_value_endline.notna() & (df_dm_only["onstudy_days"] >= 182)],
        baseline_col="glucose_controlled_baseline",
        endline_col="glucose_controlled_endline",
        missing=False,
    ),
}
table_dm_only_controlled_df = pd.DataFrame(glucose_table)
table_dm_only_controlled_df


In [None]:
# FASTING HRS -- ALL
# Summary of the fasting-duration columns over every row of df_dm_htn.
glucose_table = {
    'Condition': ['ALL'] + [''] * 5,
    'Parameter': ['Fasted (hrs)'] + [''] * 5,
    **get_formatted_rows_glucose(
        df_dm_htn,
        col_baseline='glucose_fasting_duration_hours_baseline',
        col_endline='glucose_fasting_duration_hours_endline',
    ),
}
table_fast_all_df = pd.DataFrame(glucose_table)
# table_fast_all_df

In [None]:
# ENDLINE DAYS FROM BASELINE -- ALL
# Summary of the glucose_measured_days_* columns over every row of df_dm_htn.
glucose_table = {
    'Condition': ['ALL'] + [''] * 5,
    'Parameter': ['Mean days measured from baseline'] + [''] * 5,
    **get_formatted_rows_glucose(
        df_dm_htn,
        col_baseline='glucose_measured_days_baseline',
        col_endline='glucose_measured_days_endline',
    ),
}
table_days_measured_from_baseline_df = pd.DataFrame(glucose_table)


In [None]:
# ALL: days between first and last glucose measurement, per arm and overall.
# The baseline_* names are kept as they were, although the values describe
# glucose_first_to_last_days rather than a baseline measure.
first_to_last_days = df_dm_htn["glucose_first_to_last_days"]
baseline_a = first_to_last_days[df_dm_htn['assignment'] == 'a'].describe()
baseline_b = first_to_last_days[df_dm_htn['assignment'] == 'b'].describe()
baseline_all = first_to_last_days.describe()

# NOTE(review): 'measurments' in the Parameter label is misspelled; it is left
# untouched because the label is written to the exported files.
glucose_table = {
    'Condition': ['ALL'] + [''] * 2,
    'Parameter': ['Mean days between measurments'] + [''] * 2,
    'Timepoint': [''] * 3,
    'Statistics': ['n', 'Mean(sd)', 'Median(min-max)'],
    treatment_arm[COMMUNITY_ARM]: get_cells_for_continuous_var(baseline_a),
    treatment_arm[FACILITY_ARM]: get_cells_for_continuous_var(baseline_b),
    'All': get_cells_for_continuous_var(baseline_all),
}

table_mean_days_between_measurements_df = pd.DataFrame(glucose_table)


In [None]:
# Stack the individual tables in report order: for each condition the
# measured / level / controlled tables, then the three ALL-only tables.
table_parts = [
    table_dm_htn_available_df,
    table_dm_htn_df,
    table_dm_htn_controlled_df,
    table_dm_and_htn_available_df,
    table_dm_and_htn_df,
    table_dm_and_htn_controlled_df,
    table_dm_only_available_df,
    table_dm_only_df,
    table_dm_only_controlled_df,
    table_fast_all_df,
    table_days_measured_from_baseline_df,
    table_mean_days_between_measurements_df,
]
table_df = pd.concat(table_parts, ignore_index=True)
# Plain-text grid rendering of the same table, written to glucose.txt below.
table = tabulate(table_df, headers='keys', tablefmt='grid')


In [None]:
# Write the combined table as CSV, without the row index.
path = analysis_folder / 'glucose.csv'
table_df.to_csv(path_or_buf=path, index=False)


In [None]:
# Write the grid-formatted table as text.
# The encoding is explicit because the median cells contain an en dash
# ("min–max"); with the platform default this write can fail or produce a
# file in a locale-specific encoding.
path = analysis_folder / 'glucose.txt'
with open(path, 'w', encoding='utf-8') as file:
    file.write(table)

In [None]:
# Print the narrative sentences again, next to the exported tables.
print("\n".join(narrative))

In [None]:
# Long format: one row per subject and timepoint with a boolean
# glucose_controlled column and a "time" label.
# NOTE(review): `== 1.0` is False for missing values, so subjects without a
# controlled flag are recorded as not controlled -- confirm this is intended.
id_cols = ["subject_identifier", "assignment"]

df_baseline = (
    df_dm_htn[[*id_cols, "glucose_controlled_baseline"]]
    .rename(columns={"glucose_controlled_baseline": "glucose_controlled"})
    .assign(
        glucose_controlled=lambda frame: frame["glucose_controlled"] == 1.0,
        time="baseline",
    )
)
df_endline = (
    df_dm_htn[[*id_cols, "glucose_controlled_endline"]]
    .rename(columns={"glucose_controlled_endline": "glucose_controlled"})
    .assign(
        glucose_controlled=lambda frame: frame["glucose_controlled"] == 1.0,
        time="endline",
    )
)

df_glu_gee = pd.concat([df_baseline, df_endline], ignore_index=True)


In [None]:
# Export the long-format baseline/endline frame, without the row index.
path = analysis_folder / 'df_glu_primary.csv'
df_glu_gee.to_csv(path, index=False)


In [None]:
# Display the combined table.
table_df