# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# Everything in the current working directory that precedes KEY is used as the
# workspace root; the kernel is then moved there so the relative paths below resolve.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Workspace-relative folders, keyed by the names used throughout the notebook.
SPACE = {
    'DATA_RAW': './_Data/0-Data_Raw',
    'DATA_RFT': './_Data/1-Data_RFT',
    'DATA_CASE': './_Data/2-Data_CASE',
    'DATA_AIDATA': './_Data/3-Data_AIDATA',
    'DATA_EXTERNAL': './code/external',
    'CODE_FN': './code/pipeline',
}
# Fail early when the notebook is started outside the expected workspace.
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

# Make the pipeline code importable for the cells below.
print(SPACE['CODE_FN'])
sys.path.append(SPACE['CODE_FN'])

# Step 1: OneCohort_Args

In [None]:
from config.config_record.Cohort import CohortName_to_OneCohortArgs # marked. 

# All cohort names registered in the project configuration.
CohortNames = list(CohortName_to_OneCohortArgs.keys())
print(CohortNames)

In [None]:
# # %%%%%%%%%%%%%%%%%%%%% user
# User setting: the cohort to work on; must be one of the keys printed above.
CohortName = '20241013_InferencePttSampleV0'
# # %%%%%%%%%%%%%%%%%%%%% 
# Look up the argument dictionary registered for that cohort and display it.
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
# # %%%%%%%%%%%%%%%%%%%%% user
# Disabled alternative: define OneCohort_Args inline instead of reading it from
# CohortName_to_OneCohortArgs. Kept only as an example of the expected keys.
# OneCohort_Args = {
#     'CohortLabel': 1, 
#     'CohortName': 'WellDoc2022CGM', 
#     'FolderPath': '$DATA_RAW$/WellDoc2022CGM/', 
#     'SourcePath': 'Source',
#     'Source2CohortName': 'WellDocV240629',
# }
# %%%%%%%%%%%%%%%%%%%%%

# Show the raw-data root defined in SPACE.
print(SPACE['DATA_RAW'])

In [None]:
from recfldtkn.record_base.cohort import Cohort   # Cohort is a class

# Placeholder for the cohort function: it does not exist yet, this notebook builds it.
cohort_fn = None
# Create a Cohort object from the cohort arguments, the workspace paths and the (empty) cohort function.
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)  # cohort is an instance of the Cohort class
# Update the cohort arguments with update_cohort_args.
# Its inputs are the two dictionaries OneCohort_Args and SPACE; the returned value replaces OneCohort_Args.
OneCohort_Args = cohort.update_cohort_args(OneCohort_Args, SPACE) # update cohort args
# Pretty-print the updated cohort arguments in their original key order
pprint(OneCohort_Args, sort_dicts=False)

In [None]:
# cohort.RawName_to_dfRaw

# Step 2: Get Source Files

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
# File suffixes to look for: this cohort's source files are JSON inference forms
SourceFile_SuffixList = ['json'] 
# %%%%%%%%%%%%%%%%%%%%% user

# Source folder taken from the (updated) OneCohort_Args dictionary
Folder = OneCohort_Args['SourcePath'] 

# Ask the cohort object for the source files in that folder with the given suffixes
SourceFile_List = cohort.get_SourceFile_List(Folder, SourceFile_SuffixList)

# Display the list of source files
SourceFile_List

# Step 3: Get RawName from SourceFile

In [None]:
import inspect
import os

# %%%%%%%%%%%%%%%%%%%%% user
def get_RawName_from_SourceFile(file_path, OneCohort_Args):
    """Derive the RawName of a source file from its path.

    The RawName is the file name up to its first '.', with every occurrence of
    'inference_form_' removed.

    Args:
        file_path: path of the source file.
        OneCohort_Args: cohort arguments; accepted for interface compatibility,
            not used here.

    Returns:
        The RawName as a string.
    """
    file_name = os.path.basename(file_path)
    # Keep only what precedes the first dot (drops '.json' and any further suffix).
    stem, _, _ = file_name.partition('.')
    return stem.replace('inference_form_', '')

# Attach the function's own source text so it can be exported in Step 5.
get_RawName_from_SourceFile.fn_string = inspect.getsource(get_RawName_from_SourceFile)
# %%%%%%%%%%%%%%%%%%%%% 

# Quick check on the first source file: show its type, its path and the derived RawName.
file_path = SourceFile_List[0]
print(type(file_path))
RawName = get_RawName_from_SourceFile(file_path, OneCohort_Args)
print(file_path)
print(RawName)

In [None]:
# Map each RawName to its source file.
# NOTE: despite the name, the values here are file paths, not DataFrames.
RawName_to_dfRaw = {}
for file_path in SourceFile_List:
    RawName = get_RawName_from_SourceFile(file_path, OneCohort_Args)
    RawName_to_dfRaw[RawName] = file_path

RawName_to_dfRaw

In [None]:
# First RawName containing 'sample'; raises IndexError when there is none.
RawName = [name for name in RawName_to_dfRaw if 'sample' in name][0]
RawName

In [None]:
import json 

# Load the JSON inference form behind the selected RawName and display it.
inference_form_path = RawName_to_dfRaw[RawName]
with open(inference_form_path, 'r') as f:
    inference_form = json.load(f)

inference_form

# Step 4: Process Source to Raw

## 1. sample and template

In [None]:
# Rebuild the RawName -> source file path mapping (same construction as in Step 3).
# NOTE: the values are file paths at this point, not DataFrames.
RawName_to_dfRaw = {}
for file_path in SourceFile_List:
    RawName = get_RawName_from_SourceFile(file_path, OneCohort_Args)
    RawName_to_dfRaw[RawName] = file_path
RawName_to_dfRaw

In [None]:
import json 

# The template form is the source file whose RawName is exactly 'template'.
RawName = 'template'

inference_form_path = RawName_to_dfRaw[RawName]
with open(inference_form_path, 'r') as f:
    template_form = json.load(f)

# Show the template in its original key order.
pprint(template_form, sort_dicts=False)

In [None]:
import json 

# Take the first RawName containing 'sample' (IndexError when there is none).
RawName = [i for i in RawName_to_dfRaw if 'sample' in i][0] 

# Load that sample inference form from its JSON file.
inference_form_path = RawName_to_dfRaw[RawName]
with open(inference_form_path, 'r') as f:
    inference_form = json.load(f)


print(RawName)
# pprint(inference_form, sort_dicts=False)

In [None]:
from recfldtkn.base import fill_missing_keys, replace_none_with_list

# Attach each helper's source text so both can be exported in Step 5.
fill_missing_keys.fn_string = inspect.getsource(fill_missing_keys)  
replace_none_with_list.fn_string = inspect.getsource(replace_none_with_list)    

# Normalize the sample form against the template.
# NOTE(review): presumably fill_missing_keys adds the template keys the form lacks and
# replace_none_with_list turns None values into lists -- confirm in recfldtkn.base.
# inference_form = {}
inference_form = fill_missing_keys(inference_form, template_form)
# pprint(inference_form)
inference_form = replace_none_with_list(inference_form)
# pprint(inference_form)

In [None]:
def get_InferenceEntry(OneCohort_Args, 
                       SourceFile_List,
                       get_RawName_from_SourceFile):
    """Load the template form and every sample inference form from JSON files.

    Args:
        OneCohort_Args: cohort arguments, forwarded to get_RawName_from_SourceFile.
        SourceFile_List: paths of the JSON source files.
        get_RawName_from_SourceFile: callable (file_path, OneCohort_Args) -> RawName.

    Returns:
        A dict with the key 'template_form' (content of the file whose RawName is
        'template') followed by one 'inference_form_<RawName>' key for each file
        whose RawName contains 'sample'. The forms are returned exactly as loaded;
        no key filling or None replacement is applied here.

    Raises:
        KeyError: when no source file has the RawName 'template'.
    """
    def _read_json(path):
        # Parse one JSON file and return its content.
        with open(path, 'r') as handle:
            return json.load(handle)

    # RawName -> file path; a later file with the same RawName replaces the earlier one.
    path_by_rawname = {
        get_RawName_from_SourceFile(path, OneCohort_Args): path
        for path in SourceFile_List
    }

    Inference_Entry = {'template_form': _read_json(path_by_rawname['template'])}
    for raw_name, path in path_by_rawname.items():
        if 'sample' in raw_name:
            Inference_Entry[f'inference_form_{raw_name}'] = _read_json(path)

    return Inference_Entry


# Attach the function's own source text so it can be exported in Step 5.
get_InferenceEntry.fn_string = inspect.getsource(get_InferenceEntry)

In [None]:
# Load the template and sample forms for the discovered source files.
Inference_Entry = get_InferenceEntry(
    OneCohort_Args,
    SourceFile_List,
    get_RawName_from_SourceFile,
)

# Show which entries were loaded.
print(list(Inference_Entry))


In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
def process_Source_to_Raw(OneCohort_Args, 
                          SourceFileList_or_InferenceEntry, 
                          get_RawName_from_SourceFile,
                          SPACE):
    """Convert inference forms into one raw DataFrame per table.

    Every inference form is normalized against the template form, each table
    listed in the template becomes a DataFrame, and the frames of all forms are
    concatenated per table. Four derived tables are then added: 'Ptt', 'Diet',
    'MedAdmin' and 'Exercise'.

    NOTE: the source of this function is exported through its fn_string
    attribute, so the body only relies on pd and on functions exported with it.

    Args:
        OneCohort_Args: cohort arguments; only forwarded to get_InferenceEntry.
        SourceFileList_or_InferenceEntry: either a list of JSON source file
            paths, or an already loaded entry dict holding 'template_form' and
            one or more 'inference_form...' keys.
        get_RawName_from_SourceFile: callable (file_path, OneCohort_Args) -> RawName;
            only used when a list of paths is given.
        SPACE: workspace paths; accepted for interface compatibility, not used.

    Returns:
        dict mapping RawName -> DataFrame: one entry per template table plus
        'Ptt', 'Diet', 'MedAdmin' and 'Exercise'.

    Raises:
        AssertionError: when the entry has no 'template_form', or when the
            value lists of one table have different non-zero lengths.
        KeyError: when 'Patient', 'ELogFoodItem', 'ELogCarbsEntry' or
            'ELogExerciseEntry' is missing, or when an expected food column is absent.
    """
    # ------------------------------------------------------------------
    # 1. Resolve the input into an Inference_Entry dict.
    # ------------------------------------------------------------------
    if isinstance(SourceFileList_or_InferenceEntry, list):
        Inference_Entry = get_InferenceEntry(OneCohort_Args,
                                             SourceFileList_or_InferenceEntry,
                                             get_RawName_from_SourceFile)
    else:
        Inference_Entry = SourceFileList_or_InferenceEntry

    assert 'template_form' in Inference_Entry, 'Inference_Entry has no template_form'
    template_form = Inference_Entry['template_form']

    # ------------------------------------------------------------------
    # 2. Build one DataFrame per (inference form, template table).
    # ------------------------------------------------------------------
    inference_form_name_list = [i for i in Inference_Entry if 'inference_form' in i]
    RawName_to_dfRawList = {}
    for inference_form_name in inference_form_name_list:
        inference_form = Inference_Entry[inference_form_name]
        # NOTE(review): presumably these add the template keys the form lacks and
        # turn None values into lists -- confirm in recfldtkn.base.
        inference_form = fill_missing_keys(inference_form, template_form)
        inference_form = replace_none_with_list(inference_form)

        # The template keys, not the form keys, decide which tables are produced.
        for RawName in template_form:
            data = inference_form[RawName]  # {column_name: value_list or []}

            # Empty columns are padded with None up to the longest column so
            # that all columns have the same length.
            max_num = max([0] + [len(v) for v in data.values()])
            data_new = {}
            for k, v in data.items():
                data_new[k] = [None] * max_num if len(v) == 0 else v
            for k, v in data_new.items():
                assert len(v) == max_num, f'{RawName}.{k}: {len(v)} values, expected {max_num}'

            RawName_to_dfRawList.setdefault(RawName, []).append(pd.DataFrame(data_new))

    # Stack the frames of all forms, table by table.
    RawName_to_dfRaw = {}
    for RawName, df_list in RawName_to_dfRawList.items():
        RawName_to_dfRaw[RawName] = pd.concat(df_list)

    print([i for i in RawName_to_dfRaw])

    # ------------------------------------------------------------------
    # 3. Ptt: Patient, outer-joined with UserDetail when that is possible.
    # ------------------------------------------------------------------
    df_Patient = RawName_to_dfRaw['Patient']
    df_UserDetail = RawName_to_dfRaw.get('UserDetail')
    df_Ptt = df_Patient
    if df_UserDetail is not None:
        try:
            df_Ptt = pd.merge(df_Patient, df_UserDetail, on='PatientID', how='outer')
        except Exception as err:
            # Best effort, as before: keep the Patient table alone, but say why
            # instead of hiding the failure.
            print(f'UserDetail merge failed, Ptt falls back to Patient only: {err!r}')
    RawName_to_dfRaw['Ptt'] = df_Ptt

    # ------------------------------------------------------------------
    # 4. Diet: several food items belong to one carb entry, so the food items
    #    are aggregated per (PatientID, CarbEntryID) and joined to the entries.
    # ------------------------------------------------------------------
    ID_to_Type = {
        1: 'BeforeBreakfast',
        2: 'AfterBreakfast',
        3: 'BeforeLunch',
        4: 'AfterLunch',
        5: 'BeforeDinner',
        6: 'AfterDinner',
        7: 'Bedtime',
        8: 'BeforeExercise',
        9: 'AfterExercise',
        12: 'Snack',
        14: 'Fasting',
    }

    df_food = RawName_to_dfRaw['ELogFoodItem']
    # In-place: the 'ELogFoodItem' frame that is returned also gets this column.
    df_food['ActivityType'] = df_food['ActivityTypeID'].map(ID_to_Type)
    print(df_food.shape, '<-- df_food.shape')

    # Left out on purpose: ELogFoodItemID, ActivityTypeID, RowVersionID,
    # ExternalSourceID, ExternalEntryID.
    food_columns = [
        'PatientID',
        'CarbEntryID',
        'FoodName',
        'EntrySourceID',
        'ObservationDateTime',
        'ObservationEntryDateTime',
        'CreatedDateTime', 'ModifiedDateTime',
        'ObservationStatus', 'ObservationCreatedBy',
        'TimezoneOffset', 'Timezone',
        'FoodID', 'ServingSize', 'ServingType', 'Carbs', 'Fiber', 'Fat', 'Calories',
        'Protein', 'Sodium', 'ServingsConsumed',
        'FoodImageID', 'SaturatedFat', 'PolyUnSaturatedFat',
        'MonoUnSaturatedFat', 'TransFat', 'Cholesterol', 'Potassium', 'Sugar',
        'AddedSugars', 'ActivityType',
    ]
    df_food = df_food[food_columns]
    print(df_food['CarbEntryID'].nunique(), '<-- df_food carbentryid')

    df_carbs = RawName_to_dfRaw['ELogCarbsEntry']
    print(df_carbs.shape, '<-- df_carbs.shape')

    # Left out on purpose: CarbsEntryID, EntrySourceID, ExternalSourceID,
    # ObservationCreatedBy, ObservationStatus, RowVersionID, SourceReferenceID,
    # ExternalEntryID.
    carbs_columns = [
        'PatientID', 'EntryID',
        'ActivityTypeID',
        'ObservationDateTime', 'ObservationEntryDateTime',
        'EntryCreatedDateTime', 'ModifiedDateTime',
        'TimezoneOffset', 'Timezone',
        'CarbsValue',
    ]
    # reindex (unlike [] selection) creates missing columns instead of failing.
    df_carbs = df_carbs.reindex(columns=carbs_columns)
    print(df_carbs['ActivityTypeID'].value_counts())
    df_carbs['ActivityType'] = df_carbs['ActivityTypeID'].map(ID_to_Type)
    del df_carbs['ActivityTypeID']
    df_carbs = df_carbs.rename(columns={'EntryID': 'CarbEntryID'})
    print(df_carbs.shape, '<-- df_carbs.shape')
    print(df_carbs['CarbEntryID'].nunique(), '<-- df_carbs carbentryid')

    # Food names are joined with '; ', nutrient columns are summed.
    nutrient_columns = [
        'Carbs', 'Fiber', 'Fat', 'Calories', 'Protein', 'Sodium',
        'SaturatedFat', 'PolyUnSaturatedFat', 'MonoUnSaturatedFat', 'TransFat',
        'Cholesterol', 'Potassium', 'Sugar', 'AddedSugars',
    ]
    agg_spec = {'FoodName': lambda x: "; ".join(x)}
    for nutrient in nutrient_columns:
        agg_spec[nutrient] = 'sum'
    df_food_by_meal = df_food.groupby(['PatientID', 'CarbEntryID']).agg(agg_spec)

    # Reset the index exactly once: a second reset would add a stray 'index'
    # column that then leaks into the Diet table.
    df_food_by_meal = df_food_by_meal.reset_index()
    print(df_food_by_meal.shape, '<-- df_food_by_meal.shape')

    df_diet = pd.merge(df_carbs, df_food_by_meal, on=['PatientID', 'CarbEntryID'])
    print(df_diet.shape, '<-- df_diet.shape', 'the total number of diet records')
    RawName_to_dfRaw['Diet'] = df_diet

    # ------------------------------------------------------------------
    # 5. MedAdmin: optional table, reduced to a fixed set of columns.
    # ------------------------------------------------------------------
    df_med = RawName_to_dfRaw.get('MedAdmin', pd.DataFrame())
    # Left out on purpose: StatusID, CreatedBy, RowVersionID, MedPrescriptionID,
    # PrescriptionGUID, MedPrescriptionTime, AdminSlot, ScheduledSlot, BGValue,
    # CarbsValue, InsulinCalculatorUsageStatus, IOBValue, FoodInsulinDose,
    # ExternalEntryID.
    med_columns = [
        'PatientID', 'MedAdministrationID', 'AdministrationID', 'ELogEntryID',
        # time
        'AdministrationDate',
        'UserAdministrationDate',
        'EntryDateTime', 'CreatedDate', 'ModifiedDateTime',
        'AdministrationTimeZoneOffset', 'AdministrationTimeZone',
        # medication
        'MedicationID', 'Dose',
        # context
        'MedSourceID',
        'AdministrationTimeLabelID',
        'ActivityTypeID',
    ]
    # A missing MedAdmin table therefore yields an empty frame with these columns.
    RawName_to_dfRaw['MedAdmin'] = df_med.reindex(columns=med_columns)

    # ------------------------------------------------------------------
    # 6. Exercise: fixed columns with the numeric codes decoded.
    # ------------------------------------------------------------------
    df_exercise = RawName_to_dfRaw['ELogExerciseEntry']
    # In-place: the 'ELogExerciseEntry' frame that is returned also gets this
    # column. The Exercise table below recomputes ActivityType with its own mapping.
    df_exercise['ActivityType'] = df_exercise['ActivityTypeID'].map(ID_to_Type)
    print(df_exercise.shape)

    # Left out on purpose: EntryID, ExerciseEntryID, EntryCreatedDateTime,
    # ModifiedDateTime, ObservationCreatedBy, ObservationStatus, RowVersionID,
    # SourceReferenceID, ExternalEntryID, ExternalSourceID,
    # HealthConnectMetaDataId, EntrySourceID.
    exercise_columns = [
        'PatientID',
        'ObservationDateTime', 'ObservationEntryDateTime',
        'TimezoneOffset', 'Timezone',
        'ExerciseType', 'ExerciseIntensity', 'TimeSinceExercise',
        'ActivityTypeID',
        'ExerciseDuration',
        'CaloriesBurned',
        'DistanceInMeters',
    ]
    df_exercise = df_exercise.reindex(columns=exercise_columns)

    # Intensity codes outside this mapping become NaN.
    id_to_intensity = {
        0: None,
        1: 'High',
        2: 'Moderate',
        3: 'Low',
    }
    df_exercise['ExerciseIntensity'] = df_exercise['ExerciseIntensity'].map(id_to_intensity)

    id_to_exercise_type = {
        100: 'Walking',
        101: 'Running',
        102: 'Hiking',
        103: 'Bicycling',
        104: 'Swimming',
        105: 'Strength_training',
        106: 'Home_activities',
        107: 'Gardening__Lawn',
        108: 'Dancing__Aerobics',
        109: 'Skiing__Skating',
        110: 'Yoga_Pilates',
        111: 'Other',
        1: 'Cardiovascular',
        2: 'StrengthTraining',
        3: 'Sports',
        4: 'FitnessClass',
        5: 'YogaPilates',
    }
    # Unknown exercise codes are kept as they are.
    df_exercise['ExerciseType'] = df_exercise['ExerciseType'].apply(
        lambda x: id_to_exercise_type.get(x, x))

    # NOTE(review): these labels differ from ID_to_Type in spelling
    # ('BeforeBreakFast' vs 'BeforeBreakfast') and unknown codes are kept here
    # instead of becoming NaN -- confirm that this difference is intended.
    id_to_activity_type = {
        1: 'BeforeBreakFast',
        2: 'AfterBreakFast',
        3: 'BeforeLunch',
        4: 'AfterLunch',
        5: 'BeforeDinner',
        6: 'AfterDinner',
        7: 'Bedtime',
        8: 'BeforeExercise',
        9: 'AfterExercise',
        12: 'Snack',
        14: 'Fasting',
        31: 'JustChecking',
    }
    df_exercise['ActivityType'] = df_exercise['ActivityTypeID'].apply(
        lambda x: id_to_activity_type.get(x, x))
    print(df_exercise.shape, '<-- df_exercise.shape')

    RawName_to_dfRaw['Exercise'] = df_exercise

    return RawName_to_dfRaw

# Attach the function's own source text so it can be exported in Step 5.
process_Source_to_Raw.fn_string = inspect.getsource(process_Source_to_Raw)
# %%%%%%%%%%%%%%%%%%%%% user

# Run the conversion on the source files; from here on RawName_to_dfRaw maps RawName -> DataFrame.
RawName_to_dfRaw = process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile,SPACE)    

In [None]:
# Preview every raw table: its name, its shape and its first rows.
for RawName, dfRaw in RawName_to_dfRaw.items():
    print(RawName, dfRaw.shape)
    display(dfRaw.head())

# Step 5: Save Cohort Fn

In [None]:
# Get the python file path from the cohort object 
pypath = cohort.pypath
pypath

In [None]:
# Import lines handed to the code generator as `prefix`: the exported functions
# rely on these names (os, json, pd) being available in the generated file.
prefix = [
    'import os',
    'import json', 
    'import pandas as pd', 
    'import numpy as np'
    ]
prefix

In [None]:
# List of variables to be included in the generated script
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
iterative_variables

In [None]:
from recfldtkn.base import Base

# Functions to export. Each one had its source text attached as `fn_string`
# in the cells above.
fn_variables = [
    fill_missing_keys, 
    replace_none_with_list,
    get_RawName_from_SourceFile, 
    get_InferenceEntry,
    process_Source_to_Raw,
    ]

# NOTE(review): presumably this renders the prefix, the variables and the function
# sources into one Python source string -- confirm in recfldtkn.base.
pycode = Base.convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                            fn_variables = fn_variables, 
                                            prefix = prefix)
# print(pycode)

In [None]:
# Write the generated code to the cohort's Python file, replacing any previous content.
with open(pypath, 'w') as file: file.write(pycode)

# Step 6: Test 

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort
from config.config_record.Cohort import CohortName_to_OneCohortArgs
CohortNames = [i for i in CohortName_to_OneCohortArgs.keys()]

# # %%%%%%%%%%%%%%%%%%%%% user
# CohortName = 'WellDoc2023CVSDeRx'
# # %%%%%%%%%%%%%%%%%%%%% 
# The override above is disabled, so CohortName is still the value chosen in Step 1.
# Reading the arguments from the config again discards the update made by update_cohort_args.
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
# Build the cohort function object for this cohort's Source2CohortName.
# NOTE(review): presumably this loads the file written in Step 5 -- confirm in recfldtkn.
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort_fn

In [None]:
# Recreate the Cohort object, this time with a cohort function instead of None.
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort

In [None]:
# Register the cohort function on the cohort object.
cohort.setup_fn(cohort_fn)

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort

# Same construction as the three cells above, gathered in one place ...
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort.setup_fn(cohort_fn)

# ... followed by the initialization; the cells below read cohort.RawName_to_dfRaw
# and cohort.SourceFile_List from the initialized object.
cohort.initialize_cohort()

In [None]:
# RawNames available on the initialized cohort.
[i for i in cohort.RawName_to_dfRaw]

In [None]:
# Source files held by the initialized cohort.
cohort.SourceFile_List

In [None]:
# Check the shape of every raw table held by the cohort (row preview disabled).
for RawName, dfRaw in cohort.RawName_to_dfRaw.items():
    print(RawName, dfRaw.shape)
    # display(dfRaw.head())