In [None]:
%%capture
import os
from pathlib import Path
import pandas as pd

from dj_notebook import activate

# Environment-driven configuration: both variables must be set, otherwise
# os.environ[...] raises KeyError here.
env_file = os.environ["INTECOMM_ENV"]
documents_folder = Path(os.environ["INTECOMM_DOCUMENTS_FOLDER"])
# NOTE(review): presumably bootstraps the Django project from the dotenv file so
# the model-backed helpers used below can query the database — confirm against dj_notebook.
plus = activate(dotenv_file=env_file)

# Same location as documents_folder (which is already a Path); not referenced
# again in the cells visible here.
report_folder = Path(documents_folder)

In [None]:
"""
Assumptions
1. fasting must be >=8hrs
2. take latest measurement if more than one at endline at same timepoint
3. two records need units to be changed

"""

In [None]:
from intecomm_analytics.dataframes import get_df_main_1858
from tabulate import tabulate
from edc_constants.constants import NO, YES, NOT_APPLICABLE
from edc_pdutils.dataframes import get_crf
from datetime import timedelta
from edc_reportable.units import MILLIGRAMS_PER_DECILITER


In [None]:
# Main analysis dataframe. Later cells merge onto it by subject_identifier and
# read assignment, baseline_datetime, dm, htn and hiv from the merged result.
# NOTE(review): the meaning of the None argument is not visible here — confirm
# against get_df_main_1858.
df_main = get_df_main_1858(None)


In [None]:
# Columns kept from every glucose source so the frames can be concatenated later.
columns = [
    "subject_identifier",
    "visit_code",
    "visit_datetime",
    "glucose_fasting",
    "glucose_fasting_duration_str",
    "glucose_fasting_duration_delta",
    "glucose_date",
    "glucose_value",
    "glucose_quantifier",
    "glucose_units",
    "source",
]

In [None]:
# Baseline glucose from the DM initial review CRF: visit 1000 rows where a
# glucose measurement was performed.
df_dminitialreview = get_crf(model="intecomm_subject.dminitialreview", subject_visit_model="intecomm_subject.subjectvisit")
df_dminitialreview["source"] = "intecomm_subject.dminitialreview"
# .copy() makes the filtered frame independent, so the column assignment below
# does not write to a slice (which raises SettingWithCopyWarning).
df_dminitialreview = df_dminitialreview[
    (df_dminitialreview.visit_code == 1000.0)
    & (df_dminitialreview.glucose_performed == YES)
].copy()

# Diagnosis date: use dx_date when present, otherwise fall back to dx_calculated_date.
df_dminitialreview["dm_dx_date"] = df_dminitialreview["dx_date"].fillna(
    df_dminitialreview["dx_calculated_date"]
)

# Per-subject diagnosis date, merged back onto the glucose data in later cells.
df_dx_date = df_dminitialreview[["subject_identifier", "dm_dx_date"]].reset_index(drop=True)

# Reduce to the shared column set used by all glucose sources.
df_dminitialreview = df_dminitialreview[columns].reset_index(drop=True)
df_dminitialreview

# df_dminitialreview[columns]

In [None]:
# Glucose from the DM review CRF: rows with visit_code >= 1090 for subjects
# that have a baseline row in df_dminitialreview.
df_dmreview = get_crf(model="intecomm_subject.dmreview", subject_visit_model="intecomm_subject.subjectvisit")
df_dmreview["source"] = "intecomm_subject.dmreview"
df_dmreview = df_dmreview.loc[
    (df_dmreview.visit_code >= 1090.0)
    & (df_dmreview.subject_identifier.isin(df_dminitialreview.subject_identifier)),
    columns,
].reset_index(drop=True)
df_dmreview


In [None]:

# Glucose from the dedicated glucose CRF: rows with visit_code >= 1090 for
# subjects that have a baseline row in df_dminitialreview.
df_glucose_crf = get_crf(model="intecomm_subject.glucose", subject_visit_model="intecomm_subject.subjectvisit")
df_glucose_crf["source"] = "intecomm_subject.glucose"
df_glucose_crf = df_glucose_crf.loc[
    (df_glucose_crf.visit_code >= 1090.0)
    & (df_glucose_crf.subject_identifier.isin(df_dminitialreview.subject_identifier)),
    columns,
].reset_index(drop=True)
df_glucose_crf


# df_glucose[columns]

In [None]:
# Blood results (glucose) CRF, loaded and displayed for inspection only: it is not
# concatenated into df_glucose and is not referenced again in the cells visible here.
df_blood_result_glu = get_crf(model="intecomm_subject.bloodresultsglu", subject_visit_model="intecomm_subject.subjectvisit")
df_blood_result_glu
# df_blood_result_glu[["subject_identifier", "glucose_performed", "glucose_fasting", "glucose_fasting_duration_str", "glucose_date", "glucose_value", "glucose_quantifier", "glucose_units"]]

In [None]:
# Stack all glucose sources, keep only fasting measurements with a fast of at
# least 8 hours, then attach each subject's DM diagnosis date.
df_glucose = (
    pd.concat([df_dminitialreview, df_dmreview, df_glucose_crf])
    .loc[
        lambda d: (d.glucose_fasting == YES)
        & (d.glucose_fasting_duration_delta >= timedelta(hours=8))
    ]
    .reset_index(drop=True)
    .merge(df_dx_date, on="subject_identifier", how="left")
    .reset_index(drop=True)
)
df_glucose


In [None]:
# Identify records with incorrect units (reported in mg/dL).
# These need to be corrected on the EDC.
df_glucose.loc[df_glucose.glucose_units == MILLIGRAMS_PER_DECILITER, "glucose_value"]


In [None]:
# is visit_datetime an OK proxy for glucose_date?
# df_glucose[df_glucose.visit_datetime != df_glucose.glucose_date][["visit_datetime", "glucose_date"]]

In [None]:
# df_glucose["dx_date"] = df_glucose[df_glucose.source=="intecomm_subject.dminitialreview"].apply(lambda row: row.dx_date if pd.notna(row.dx_date) else row.dx_calculated_date, axis=1)


In [None]:

def get_measured_timedelta(s):
    """Return the time between a subject's first and last glucose dates.

    ``s`` is a row (or any mapping) with ``glucose_date_first`` and
    ``glucose_date_last``. Returns ``last - first``, or ``pd.NaT`` when
    either date is missing.
    """
    if pd.isna(s["glucose_date_first"]) or pd.isna(s["glucose_date_last"]):
        return pd.NaT
    return s["glucose_date_last"] - s["glucose_date_first"]


# First (baseline) and last (endline) glucose record per subject.
# Sorting by date within subject makes "first"/"last" mean earliest/latest.
df_glucose = df_glucose.sort_values(by=["subject_identifier", "glucose_date"])

# NOTE(review): GroupBy "first"/"last" are understood to skip nulls column by
# column, so value/date/units could come from different rows when a record has
# gaps — confirm this is acceptable for the endline definition.
df_first_last = (
    df_glucose[
        ["subject_identifier", "glucose_value", "glucose_date", "glucose_units", "glucose_fasting_duration_delta"]
    ]
    .groupby(by=["subject_identifier"])
    .agg(["first", "last"])
    .reset_index()
)

# Flatten the (column, aggregation) MultiIndex to "<column>_<aggregation>";
# subject_identifier has an empty second level and keeps its plain name.
df_first_last.columns = [
    name if not suffix else f"{name}_{suffix}".strip()
    for name, suffix in df_first_last.columns.values
]

# Time elapsed between each subject's first and last measurement.
df_first_last["glucose_measured_delta"] = df_first_last.apply(get_measured_timedelta, axis=1)
# df_glu_first_last[df_glu_first_last.glu_measured_delta>timedelta(days=0)]

In [None]:

# Attach the first/last glucose summary and the DM diagnosis date to the main
# dataframe, derive numeric duration columns, and keep only subjects whose
# diagnosis date is more than 180 days before baseline (rows with no diagnosis
# date fail the comparison and are dropped).
df_final = (
    pd.merge(df_main, df_first_last, on="subject_identifier", how="left")
    .merge(df_dx_date, on="subject_identifier", how="left")
    .assign(
        # NOTE: despite "_days_" in the names, these two columns hold HOURS
        # (total seconds / 3600); a later table reports them as 'Fasted (hrs)'.
        glucose_fasting_duration_days_first=lambda d: d["glucose_fasting_duration_delta_first"].dt.total_seconds() / 3600,
        glucose_fasting_duration_days_last=lambda d: d["glucose_fasting_duration_delta_last"].dt.total_seconds() / 3600,
        # whole days between first and last measurement; the first is the origin (0)
        glucose_measured_days_last=lambda d: d["glucose_measured_delta"].dt.days,
        glucose_measured_days_first=0,
    )
    .loc[lambda d: d.dm_dx_date < (d.baseline_datetime - timedelta(days=180))]
    .reset_index(drop=True)
)


In [None]:
# df_final[df_final.dm==1]

In [None]:
# Inspect the per-subject first/last glucose columns alongside the diagnosis
# date and the assignment column. Display only; nothing is assigned.
# Note: the glucose_fasting_duration_days_* columns are computed upstream as
# total seconds / 3600, i.e. they hold hours.
df_final[[
    "subject_identifier",
    "dm_dx_date",
    "glucose_value_first",
    "glucose_date_first",
    "glucose_units_first",
    "glucose_fasting_duration_days_first",
    "glucose_fasting_duration_delta_first",
    "glucose_value_last",
    "glucose_date_last",
    "glucose_units_last",
    "glucose_fasting_duration_delta_last",
    "glucose_fasting_duration_days_last",
    "glucose_measured_delta",
    "glucose_measured_days_first",
    "glucose_measured_days_last",
    "assignment",
]]

In [None]:
#


In [None]:
# def get_measured_from_baseline_to_last_timedelta(s):
#     if pd.notna(s["glucose_date_last"]):
#         return s["glucose_date_last"] - s["baseline_datetime"]
#     return pd.NaT
#
# def get_measured_from_baseline_to_first_timedelta(s):
#     if pd.notna(s["glucose_date_first"]):
#         return s["glucose_date_first"] - s["baseline_datetime"]
#     return pd.NaT
#
# df_glucose["glucose_from_baseline_delta_last"] = df_glucose.apply(get_measured_from_baseline_to_last_timedelta, axis=1)
# df_glucose["glucose_from_baseline_delta_first"] = df_glucose.apply(get_measured_from_baseline_to_first_timedelta, axis=1)


In [None]:
def get_cells_for_continuous_var(df)->list[str]:
    """Format the output of ``describe()`` as three table cells.

    ``df`` is indexed by the ``describe()`` labels ``count``, ``mean``,
    ``std``, ``50%``, ``min`` and ``max``. The returned cells are:

        +======================+
        | 930                  |   n
        +----------------------+
        | 127.69(16.84)        |   mean(sd)
        +----------------------+
        | 127.00(82.00–183.00) |   median(min–max)
        +----------------------+
    """
    n_cell = str(int(df["count"]))
    mean_sd_cell = "{:.2f}({:.2f})".format(df["mean"], df["std"])
    median_range_cell = "{:.2f}({:.2f}–{:.2f})".format(df["50%"], df["min"], df["max"])
    return [n_cell, mean_sd_cell, median_range_cell]

def get_cells_for_yes_no_na(df:pd.DataFrame, col:str, arm:str|None=None)->list[str]:
    """Return the [n, No, Yes] cells for a YES/NO column.

    Only rows where ``col`` is not null are considered; when ``arm`` is
    given (truthy), rows are further restricted to ``assignment == arm``.

    The first element is the integer count of rows considered; the other
    two are strings of the form ``"<count> (<percent>%)"`` for NO and YES.
    Percentages are relative to all non-null responses, so any other value
    present in the column reduces the NO/YES shares. No NOT_APPLICABLE cell
    is emitted.
    """
    if arm:
        keep = (df['assignment'] == arm) & (df[col].notna())
    else:
        keep = df[col].notna()

    # filter once and derive n, counts and percentages from the same rows
    responses = df[keep][col]
    n = len(responses)
    counts = responses.value_counts()
    percentages = responses.value_counts(normalize=True) * 100

    no_cell = f"{counts.get(NO, 0)} ({percentages.get(NO, 0):.1f}%)"
    yes_cell = f"{counts.get(YES, 0)} ({percentages.get(YES, 0):.1f}%)"
    return [n, no_cell, yes_cell]


def get_formatted_rows_glucose(df, col_baseline:str|None=None, col_endline:str|None=None):
    """Summarise a continuous variable at baseline and endline, per arm and overall.

    ``col_baseline`` and ``col_endline`` name the columns of ``df`` to
    describe; both are indexed directly, so the ``None`` defaults are not
    usable in practice. Rows with a null value in a column are excluded from
    that column's statistics. Arms are taken from ``df['assignment']``
    ('a' and 'b').

    Returns a dict of 5 table columns (Timepoint, Statistics, Treatment A,
    Treatment B, All), each a list of six cells: n, mean(sd) and
    median(min-max) for baseline followed by the same for endline.
    """

    def describe_by_arm(col):
        # statistics for arm a, arm b and everyone, over rows that have a value
        observed = df[df[col].notna()].copy()
        return (
            observed[observed['assignment'] == 'a'][col].describe(),
            observed[observed['assignment'] == 'b'][col].describe(),
            observed[col].describe(),
        )

    baseline_stats = describe_by_arm(col_baseline)
    endline_stats = describe_by_arm(col_endline)

    # one six-cell column per group: baseline cells followed by endline cells
    arm_a_cells, arm_b_cells, all_cells = (
        [*get_cells_for_continuous_var(base), *get_cells_for_continuous_var(end)]
        for base, end in zip(baseline_stats, endline_stats)
    )

    return {
        'Timepoint': ['Baseline', '', '', 'Endline', '', ''],
        'Statistics': ['n', 'Mean(sd)', 'Median(min-max)','n', 'Mean(sd)', 'Median(min-max)'],
        'Treatment A': arm_a_cells,
        'Treatment B': arm_b_cells,
        'All': all_cells,
    }

def get_formatted_rows_controllled(df:pd.DataFrame, baseline_col:str, endline_col:str):
    """Tabulate a YES/NO column at baseline and endline, per arm and overall.

    Returns a dict of 5 table columns (Timepoint, Statistics, Treatment A,
    Treatment B, All), each a list of six cells: n, No and Yes for
    ``baseline_col`` followed by the same for ``endline_col``.
    """

    def cells_for(arm=None):
        # baseline cells followed by endline cells for one arm (None = everyone)
        return [
            *get_cells_for_yes_no_na(df, baseline_col, arm=arm),
            *get_cells_for_yes_no_na(df, endline_col, arm=arm),
        ]

    return {
        'Timepoint': ['Baseline', '', '', 'Endline', '', ''],
        'Statistics': ['n', 'No', 'Yes', 'n', 'No', 'Yes'],
        'Treatment A': cells_for("a"),
        'Treatment B': cells_for("b"),
        'All': cells_for(),
    }

In [None]:
# df_all = df_glucose[[
#     "subject_identifier",
#     "dm_dx_date",
#     "glucose_value_first",
#     "glucose_date_first",
#     "glucose_units_first",
#     "glucose_fasting_duration_delta_first",
#     "glucose_value_last",
#     "glucose_date_last",
#     "glucose_units_last",
#     "glucose_fasting_duration_delta_last",
#     "glucose_from_baseline_delta_first",
#     "glucose_from_baseline_delta_last",
#     "assignment"
# ]].copy()
# df_all

In [None]:
# Analysis subsets. All of them require hiv == 0 and at least 270 days between
# the first and last glucose measurement.
followed_up_no_hiv = (df_final.hiv==0) & (df_final.glucose_measured_delta>=timedelta(days=270))

df_all = df_final[followed_up_no_hiv].copy().reset_index(drop=True)

# "HTN/DM": the original condition (dm==1) | ((dm==1) & (htn==1)) reduces
# logically to dm==1, i.e. DM with or without HTN.
# NOTE(review): confirm that including DM-only subjects here is intended.
df_htn_dm = df_final[(df_final.dm==1) & followed_up_no_hiv].copy().reset_index(drop=True)
path = documents_folder / "df_htn_dm.csv"
df_htn_dm.to_csv(path, index=False)

# DM without HTN.
df_dm_only = df_final[(df_final.dm==1) & (df_final.htn==0) & followed_up_no_hiv].copy().reset_index(drop=True)

In [None]:
# ALL
# Glucose level at baseline/endline for the df_all subset.
glucose_table = {
    'Condition': ['All', '', '', '', '', ''],
    'Parameter': ['Glucose level (mmol/L)', '', '', '', '', ''],
    **get_formatted_rows_glucose(df_all, "glucose_value_first", "glucose_value_last"),
}
table_all_df = pd.DataFrame(glucose_table)
# table_all_df

In [None]:
# HTN/DM
# Glucose level at baseline/endline for the df_htn_dm subset.
glucose_table = {
    'Condition': ['HTN/DM', '', '', '', '', ''],
    'Parameter': ['Glucose level (mmol/L)', '', '', '', '', ''],
    **get_formatted_rows_glucose(df_htn_dm, "glucose_value_first", "glucose_value_last"),
}
table_htn_dm_df = pd.DataFrame(glucose_table)
# table_htn_dm_df


In [None]:
# DM
# Glucose level at baseline/endline for the df_dm_only subset.
glucose_table = {
    'Condition': ['DM only', '', '', '', '', ''],
    'Parameter': ['Glucose level (mmol/L)', '', '', '', '', ''],
    **get_formatted_rows_glucose(df_dm_only, "glucose_value_first", "glucose_value_last"),
}
table_dm_only_df = pd.DataFrame(glucose_table)
# table_dm_only_df


In [None]:
# Glucose controlled
def controlled(value):
    """Classify a glucose value as controlled (below 7.00) or not.

    Returns NOT_APPLICABLE when ``value`` is missing, YES when
    ``value < 7.00`` and NO when ``value >= 7.00``. The table built from
    this labels the threshold as 7 mmol/L.
    """
    # pandas passes missing numeric values to ``apply`` as NaN, not None, so an
    # ``is None`` check never matched them: NaN failed both comparisons and fell
    # through to ''. pd.isna covers None, NaN, NaT and pd.NA.
    if pd.isna(value):
        return NOT_APPLICABLE
    if value < 7.00:
        return YES
    return NO

# Flag each subject's first and last glucose value as controlled or not,
# then tabulate the flags per arm for the df_htn_dm subset.
df_htn_dm["glucose_controlled_first"] = df_htn_dm["glucose_value_first"].apply(controlled)
df_htn_dm["glucose_controlled_last"] = df_htn_dm["glucose_value_last"].apply(controlled)

glucose_table = {
    'Condition': ['HTN/DM', '', '', '', '', ''],
    'Parameter': ['Glucose < 7 mmol/L', '', '', '', '', ''],
    **get_formatted_rows_controllled(df_htn_dm, "glucose_controlled_first", "glucose_controlled_last"),
}
table_dm_controlled_df = pd.DataFrame(glucose_table)
table_dm_controlled_df


In [None]:
# FASTING HRS

# ALL
# Fasting duration at the first and last measurement for the df_all subset.
# The "*_days_*" source columns are computed as hours, matching the label below.
glucose_table = {
    'Condition': ['ALL', '', '', '', '', ''],
    'Parameter': ['Fasted (hrs)', '', '', '', '', ''],
    **get_formatted_rows_glucose(
        df_all,
        col_baseline='glucose_fasting_duration_days_first',
        col_endline='glucose_fasting_duration_days_last',
    ),
}
table_fast_all_df = pd.DataFrame(glucose_table)
table_fast_all_df

In [None]:
# ENDLINE DAYS FROM BASELINE

# ALL
# Days between first and last measurement for the df_all subset.
# The baseline column is the constant 0 (the first measurement is the origin).
glucose_table = {
    'Condition': ['ALL', '', '', '', '', ''],
    'Parameter': ['Endline: Days from baseline', '', '', '', '', ''],
    **get_formatted_rows_glucose(
        df_all,
        col_baseline='glucose_measured_days_first',
        col_endline='glucose_measured_days_last',
    ),
}
table_days_all_df = pd.DataFrame(glucose_table)
table_days_all_df

In [None]:
# Stack the individual tables (in this order) into one report table and
# render it as a text grid.
table_df = pd.concat(
    [
        table_all_df,
        table_htn_dm_df,
        table_dm_controlled_df,
        table_fast_all_df,
        table_days_all_df,
        table_dm_only_df,
    ],
    ignore_index=True,
)
table = tabulate(table_df, headers='keys', tablefmt='grid')


In [None]:
# Save the combined report table as CSV in the documents folder.
path = documents_folder / 'glucose.csv'
table_df.to_csv(path, index=False)


In [None]:
# Save the grid-formatted table as plain text in the documents folder.
documents_folder = Path(os.environ["INTECOMM_DOCUMENTS_FOLDER"])
path = documents_folder / 'glucose.txt'
# The table cells contain a non-ASCII en dash (median(min–max)), so the encoding
# is given explicitly instead of relying on the platform default.
with open(path, 'w', encoding='utf-8') as file:
    file.write(table)

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
#
# data = df_dminitialreview.glucose_value
# data = data.sort_values()
# # Create a figure with two subplots: histogram and boxplot
# fig, ax = plt.subplots(1, 2, figsize=(12, 6))
#
# # Histogram to inspect the distribution of glucose measurements
# sns.histplot(data, kde=True, ax=ax[0])
# ax[0].set_title('Baseline Glucose')
# ax[0].set_xlabel('Glucose (mmol/L)')
# ax[0].set_ylabel('Frequency')
#
# # Boxplot to inspect the central tendency and spread of glucose measurements
# sns.boxplot(y=data, ax=ax[1])
# ax[1].set_title('Baseline Glucose')
# ax[1].set_ylabel('Glucose Measurement (mmol/L)')
#
# # Show the plots
# plt.tight_layout()
# plt.show()