# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# Marker folder name: everything in the current working directory path that
# precedes its first occurrence is used as the workspace root. If the marker
# is absent, split() returns the whole path and the directory is unchanged.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().split(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Workspace-relative folders, passed to Cohort / CohortFn in later cells.
SPACE = {
    'DATA_RAW': './_Data/0-Data_Raw',
    'DATA_RFT': './_Data/1-Data_RFT',
    'DATA_CASE': './_Data/2-Data_CASE',
    'DATA_AIDATA': './_Data/3-Data_AIDATA',
    'DATA_EXTERNAL': './code/external',
    'CODE_FN': './code/pipeline',
}
assert os.path.exists(SPACE['CODE_FN']), f'{SPACE["CODE_FN"]} not found'

print(SPACE['CODE_FN'])
# Re-running this cell must not keep appending the same entry to sys.path.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])

# Step 1: OneCohort_Args

In [None]:
# Cohort registry: one argument dictionary per cohort name
from config.config_record.Cohort import CohortName_to_OneCohortArgs

# Show every cohort name found in the registry
cohort_names = list(CohortName_to_OneCohortArgs)
print("Available Cohorts:", cohort_names)

# Pick one cohort and look up its arguments
# selected_cohort = 'WellDoc2023CVSDeRx'
selected_cohort = 'WellDoc2022CGM'
cohort_args = CohortName_to_OneCohortArgs[selected_cohort]
print("Selected Cohort Arguments:", cohort_args)

In [None]:
from config.config_record.Cohort import CohortName_to_OneCohortArgs # marked. 

CohortNames = [i for i in CohortName_to_OneCohortArgs.keys()]
print(CohortNames)

In [None]:
# # %%%%%%%%%%%%%%%%%%%%% user
# CohortName = 'WellDoc2022CGM'
# CohortName = 'WellDoc2023CVSDeRx'
# CohortName = 'WellDoc2023CVSTDC'

# CohortName = 'WellDoc2025CVS'
CohortName = 'WellDoc2025ALS'
# CohortName = 'WellDoc2025LLY'
# # %%%%%%%%%%%%%%%%%%%%% 
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
# # # %%%%%%%%%%%%%%%%%%%%% user
# OneCohort_Args = {
#     'CohortLabel': 6,
#     'CohortName': 'WellDoc2025LLY',
#     'FolderPath': '$DATA_RAW$/WellDoc2025LLY/',
#     'SourcePath': '',
#     'Source2CohortName': 'WellDocV240629',
# }
# # %%%%%%%%%%%%%%%%%%%%%

print(SPACE['DATA_RAW'])

In [None]:
from recfldtkn.record_base.cohort import Cohort   # Cohort is a class

# No cohort function exists yet at this point, so a None placeholder is passed
cohort_fn = None
# Build the Cohort object from the selected cohort arguments and the SPACE folders
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)  # instance of Cohort, reused by the following cells
# update_cohort_args takes the two dictionaries (OneCohort_Args and SPACE) and
# returns the updated cohort arguments, which replace the originals here
OneCohort_Args = cohort.update_cohort_args(OneCohort_Args, SPACE) # update cohort args
# Pretty-print the updated cohort arguments, keeping their insertion order
pprint(OneCohort_Args, sort_dicts=False)

In [None]:
# cohort.RawName_to_dfRaw

# Step 2: Get Source Files
This step retrieves every file with one of the requested suffixes (here, `.csv`) from the cohort's source folder and lists their paths. The suffix list is set by the user in the cell below; the folder path is read from `OneCohort_Args['SourcePath']`.

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
# File suffixes to collect; only .csv files are requested here
SourceFile_SuffixList = ['csv'] 
# %%%%%%%%%%%%%%%%%%%%% user

# Folder to scan, read from the 'SourcePath' entry of OneCohort_Args
Folder = OneCohort_Args['SourcePath'] 

# Ask the cohort object for the source files in Folder matching the suffix list
SourceFile_List = cohort.get_SourceFile_List(Folder, SourceFile_SuffixList)

# Last expression of the cell: the notebook displays the list
SourceFile_List

# Step 3: Get RawName from SourceFile

In [None]:
import inspect

# %%%%%%%%%%%%%%%%%%%%% user
def get_RawName_from_SourceFile(file_path, OneCohort_Args):
    """
    Extract the RawName from a source file path.

    The RawName is the part of the *file name* that follows its last
    underscore, up to the first dot after it, e.g.
    ``.../20240101_Patient.csv`` -> ``Patient``. Only the file name is
    inspected, so underscores or dots in directory names (such as
    ``./_Data/0-Data_Raw``) cannot leak into the result when the file name
    itself has no underscore.

    Relies on ``os`` being imported in the module that holds this function.

    Args:
        file_path (str): Path of the source file.
        OneCohort_Args: Currently unused. Reserved for future functionality.

    Returns:
        str: The extracted RawName.
    """
    file_name = os.path.basename(file_path)
    RawName = file_name.split('_')[-1].split('.')[0]
    return RawName

get_RawName_from_SourceFile.fn_string = inspect.getsource(get_RawName_from_SourceFile)
# %%%%%%%%%%%%%%%%%%%%% 

file_path = SourceFile_List[0]
print(type(file_path))
RawName = get_RawName_from_SourceFile(file_path, OneCohort_Args)
print(file_path)
print(RawName)

In [None]:
SourceFile_List

# Step 4: Process Source to Raw

In [None]:
### %%%%%%%%%%%%%%%%%%%%% user
# First source file whose RawName is 'UserDetail' ([0] raises IndexError if there is none)
UserDetail_file = [i for i in SourceFile_List 
                   if get_RawName_from_SourceFile(i, OneCohort_Args) == 'UserDetail'][0]
print(UserDetail_file)
# low_memory=False reads the file in one pass instead of chunk-wise type inference
df_UserDetail = pd.read_csv(UserDetail_file, low_memory=False)    


# Same lookup for the source file whose RawName is 'Patient'
Patient_file = [i for i in SourceFile_List 
                if get_RawName_from_SourceFile(i, OneCohort_Args) == 'Patient'][0]
print(Patient_file)
df_Patient = pd.read_csv(Patient_file, low_memory=False)
### %%%%%%%%%%%%%%%%%%%%% user
### %%%%%%%%%%%%%%%%%%%%% user

In [None]:
# SourceFile_List

## Diet

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

# Part 1:
# The food-item table holds several rows per CarbEntry, so the items are
# aggregated per (PatientID, CarbEntryID) before being joined to the carb entries.
ID_to_Type = {
    1: 'BeforeBreakfast',
    2: 'AfterBreakfast',
    3: 'BeforeLunch',
    4: 'AfterLunch',
    5: 'BeforeDinner',
    6: 'AfterDinner',
    7: 'Bedtime',
    8: 'BeforeExercise',
    9: 'AfterExercise',
    12: 'Snack',
    14: 'Fasting',
}

# ---- food items -----------------------------------------------------------
df_food_path = [i for i in SourceFile_List if 'ELogFoodItem' in i][0]
df_food = pd.read_csv(df_food_path, low_memory=False)
df_food['ActivityType'] = df_food['ActivityTypeID'].map(ID_to_Type)
print(df_food.shape, '<-- df_food.shape')

# Columns currently commented out of the selection: ELogFoodItemID,
# ActivityTypeID, RowVersionID, ExternalSourceID, ExternalEntryID
columns = [
    'PatientID', 'CarbEntryID', 'FoodName', 'EntrySourceID',
    'ObservationDateTime', 'ObservationEntryDateTime',
    'CreatedDateTime', 'ModifiedDateTime',
    'ObservationStatus', 'ObservationCreatedBy',
    'TimezoneOffset', 'Timezone',
    'FoodID', 'ServingSize', 'ServingType', 'Carbs', 'Fiber', 'Fat', 'Calories',
    'Protein', 'Sodium', 'ServingsConsumed',
    'FoodImageID', 'SaturatedFat', 'PolyUnSaturatedFat',
    'MonoUnSaturatedFat', 'TransFat', 'Cholesterol', 'Potassium', 'Sugar',
    'AddedSugars', 'ActivityType',
]
df_food = df_food[columns]
print(df_food['CarbEntryID'].nunique(), '<-- df_food carbentryid')

# ---- carb entries ---------------------------------------------------------
df_carbs_path = [i for i in SourceFile_List if 'ELogCarbsEntry' in i][0]
df_carbs = pd.read_csv(df_carbs_path, low_memory=False)
print(df_carbs.shape, '<-- df_carbs.shape')

# Columns currently commented out of the selection: CarbsEntryID, EntrySourceID,
# ExternalSourceID, ObservationCreatedBy, ObservationStatus, RowVersionID,
# SourceReferenceID, ExternalEntryID
columns = [
    'PatientID', 'EntryID',
    'ActivityTypeID',
    'ObservationDateTime', 'ObservationEntryDateTime',
    'EntryCreatedDateTime', 'ModifiedDateTime',
    'TimezoneOffset', 'Timezone',
    'CarbsValue',
]
# .copy() so the column edits below act on an independent frame rather than
# on a slice of the raw table (avoids pandas' SettingWithCopyWarning).
df_carbs = df_carbs[columns].copy()
print(df_carbs['ActivityTypeID'].value_counts())
df_carbs['ActivityType'] = df_carbs['ActivityTypeID'].map(ID_to_Type)
del df_carbs['ActivityTypeID']
# EntryID in the carb table is the key the food items call CarbEntryID
df_carbs = df_carbs.rename(columns={'EntryID': 'CarbEntryID'})
print(df_carbs.shape, '<-- df_carbs.shape')
print(df_carbs['CarbEntryID'].nunique(), '<-- df_carbs carbentryid')

# ---- one row per meal -----------------------------------------------------
nutrient_columns = [
    'Carbs', 'Fiber', 'Fat', 'Calories', 'Protein', 'Sodium',
    'SaturatedFat', 'PolyUnSaturatedFat', 'MonoUnSaturatedFat', 'TransFat',
    'Cholesterol', 'Potassium', 'Sugar', 'AddedSugars',
]
df_food_by_meal = df_food.groupby(['PatientID', 'CarbEntryID']).agg({
    # Missing food names are skipped instead of breaking str.join
    'FoodName': lambda names: "; ".join(names.dropna().astype(str)),
    **{col: 'sum' for col in nutrient_columns},
})

df_food_by_meal = df_food_by_meal.reset_index()
print(df_food_by_meal.shape, '<-- df_food_by_meal.shape')
# Inner join: only carb entries that have at least one food item are kept
df_diet = pd.merge(df_carbs, df_food_by_meal, on=['PatientID', 'CarbEntryID'])
print(df_diet.shape, '<-- df_diet.shape', 'the total number of diet records')
df_diet.head()


## Medication

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

# First source file whose path mentions 'MedAdmin'
medadmin_path = [path for path in SourceFile_List if 'MedAdmin' in path][0]
print(medadmin_path)
df_med = pd.read_csv(medadmin_path, low_memory=False)

# Columns currently commented out of the selection: StatusID, CreatedBy,
# RowVersionID, MedPrescriptionID, PrescriptionGUID, MedPrescriptionTime,
# AdminSlot, ScheduledSlot, BGValue, CarbsValue, InsulinCalculatorUsageStatus,
# IOBValue, FoodInsulinDose, ExternalEntryID
columns = [
    # identifiers
    'PatientID', 'MedAdministrationID', 'AdministrationID', 'ELogEntryID',
    # time
    'AdministrationDate',
    'UserAdministrationDate',
    'EntryDateTime', 'CreatedDate', 'ModifiedDateTime',
    'AdministrationTimeZoneOffset', 'AdministrationTimeZone',
    # medication and dose
    'MedicationID', 'Dose',
    # context
    'MedSourceID',
    'AdministrationTimeLabelID',
    'ActivityTypeID',
]

df_med = df_med[columns]
print(df_med.shape)
df_med.head()

In [None]:
# Print the value distribution, ordered by value, for each coded medication column.
# MedPrescriptionID and MedAdministrationID are currently not inspected.
inspected_columns = (
    'MedicationID',
    'Dose',
    'MedSourceID',
    'ActivityTypeID',
    'AdministrationTimeLabelID',
)
for col in inspected_columns:
    print('\n=====', col, '=====')
    counts_by_value = df_med[col].value_counts().sort_index()
    print(counts_by_value)

# df_exercise['ExerciseType'].value_counts().sort_index()

## Exercise

In [None]:
# First source file whose path mentions 'ELogExercise'
exercise_path = [i for i in SourceFile_List if 'ELogExercise' in i][0]
print(exercise_path)
df_exercise = pd.read_csv(exercise_path, low_memory=False)
print(df_exercise.shape)

# Columns currently commented out of the selection: EntryID, ExerciseEntryID,
# EntryCreatedDateTime, ModifiedDateTime, ObservationCreatedBy,
# ObservationStatus, RowVersionID, SourceReferenceID, ExternalEntryID,
# ExternalSourceID, HealthConnectMetaDataId, EntrySourceID
columns = [
    'PatientID',
    'ObservationDateTime', 'ObservationEntryDateTime',
    'TimezoneOffset', 'Timezone',
    'ExerciseType', 'ExerciseIntensity', 'TimeSinceExercise',
    'ActivityTypeID',
    'ExerciseDuration',
    'CaloriesBurned',
    'DistanceInMeters',
]

# .copy() so the decoded columns are written to an independent frame rather
# than to a slice of the raw table (avoids pandas' SettingWithCopyWarning).
# The ActivityType column is derived once, below, from id_to_activity_type;
# an earlier map through ID_to_Type was dropped by this selection anyway.
df_exercise = df_exercise[columns].copy()

# Codes without an entry (and code 0) become missing values
id_to_intensity = {
    0: None,
    1: 'High',
    2: 'Moderate',
    3: 'Low',
}
df_exercise['ExerciseIntensity'] = df_exercise['ExerciseIntensity'].map(id_to_intensity)

id_to_exercise_type = {
    100: 'Walking',
    101: 'Running',
    102: 'Hiking',
    103: 'Bicycling',
    104: 'Swimming',
    105: 'Strength_training',
    106: 'Home_activities',
    107: 'Gardening__Lawn',
    108: 'Dancing__Aerobics',
    109: 'Skiing__Skating',
    110: 'Yoga_Pilates',
    111: 'Other',
    1: 'Cardiovascular',
    2: 'StrengthTraining',
    3: 'Sports',
    4: 'FitnessClass',
    5: 'YogaPilates',
}
# Unknown codes are kept as they are instead of being turned into NaN
df_exercise['ExerciseType'] = df_exercise['ExerciseType'].apply(
    lambda code: id_to_exercise_type.get(code, code)
)

# NOTE(review): these labels spell 'BreakFast' with a capital F, unlike the
# 'Breakfast' labels used for diet records - confirm which spelling is wanted.
id_to_activity_type = {
    # 0: None,
    1: 'BeforeBreakFast',
    2: 'AfterBreakFast',
    3: 'BeforeLunch',
    4: 'AfterLunch',
    5: 'BeforeDinner',
    6: 'AfterDinner',
    7: 'Bedtime',
    8: 'BeforeExercise',
    9: 'AfterExercise',
    12: 'Snack',
    14: 'Fasting',
    31: 'JustChecking',
}
df_exercise['ActivityType'] = df_exercise['ActivityTypeID'].apply(
    lambda code: id_to_activity_type.get(code, code)
)
print(df_exercise.shape, '<-- df_exercise.shape')

## process_Source_to_Raw

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
def process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile, SPACE):
    """
    Turn a cohort's source CSV files into raw tables and report where each one lives.

    Steps:
      1. Map every source file to its RawName.
      2. For each file whose header has a ``PatientId`` column, write a copy with
         that column renamed to ``PatientID`` to
         ``<FolderPath>/processed_RawFile_<RawName>.csv`` and point the RawName at
         the copy. The source file itself is never modified.
      3. Outer-merge ``Patient`` with ``UserDetail`` (``UserID`` -> ``PatientID``)
         into ``Ptt``; the two original entries are removed from the mapping.
      4. Aggregate the food items per (PatientID, CarbEntryID) and inner-join
         them onto the carb entries -> ``Diet``.
      5. Keep the selected medication-administration columns -> ``MedAdmin``.
      6. Keep the selected exercise columns and decode their codes -> ``Exercise``.

    Args:
        OneCohort_Args (dict): Cohort arguments; ``'FolderPath'`` is where the
            processed files are written.
        SourceFile_List (list): Paths of the source CSV files.
        get_RawName_from_SourceFile (function): ``(file_path, OneCohort_Args) -> RawName``.
        SPACE (dict): Currently unused; kept for the shared call signature.

    Returns:
        dict: RawName -> path of the CSV file holding that raw table.

    Raises:
        IndexError: If a required source file (UserDetail, Patient, ELogFoodItem,
            ELogCarbsEntry, MedAdmin, ELogExercise) is not in ``SourceFile_List``.
        KeyError: If a selected column is missing from one of those files.
    """
    folder = OneCohort_Args['FolderPath']

    # Step 1: RawName -> source path (a later file with the same RawName wins)
    RawName_to_dfRaw = {
        get_RawName_from_SourceFile(file_path, OneCohort_Args): file_path
        for file_path in SourceFile_List
    }

    # The helpers are nested on purpose: this function's source is captured on
    # its own with inspect.getsource and written to a generated module, so it
    # may only rely on `os` and `pd` from the outside.
    def _processed_path(raw_name):
        """Path of the processed CSV for `raw_name` inside the cohort folder."""
        return os.path.join(folder, f'processed_RawFile_{raw_name}.csv')

    def _first_source(keyword):
        """First source path containing `keyword` (IndexError if there is none)."""
        return [path for path in SourceFile_List if keyword in path][0]

    def _first_unprocessed(raw_name):
        """First source path with this RawName that is not a processed file."""
        return [
            path for path in SourceFile_List
            if get_RawName_from_SourceFile(path, OneCohort_Args) == raw_name
            and 'processed' not in path
        ][0]

    def _read_source(path, **read_csv_kwargs):
        """Read a CSV and normalise a `PatientId` column name to `PatientID`."""
        return pd.read_csv(path, **read_csv_kwargs).rename(columns={'PatientId': 'PatientID'})

    def _save(raw_name, df):
        """Write `df` as the processed file of `raw_name` and register its path."""
        path = _processed_path(raw_name)
        df.to_csv(path, index=False)
        RawName_to_dfRaw[raw_name] = path

    # ---------- Step 2: "PatientId" -> "PatientID"
    # list(...) snapshots the items because the mapping is updated in the loop.
    for RawName, file_path in list(RawName_to_dfRaw.items()):
        # Skip empty files: they have no header to inspect
        if os.path.getsize(file_path) == 0:
            continue
        # Only the header is needed to check the column names
        header = pd.read_csv(file_path, nrows=0)
        if 'PatientId' not in header.columns:
            continue
        file_path_new = _processed_path(RawName)
        if not os.path.exists(file_path_new):
            # Write the renamed copy to the NEW path. Writing it back to
            # file_path would overwrite the source data and leave the mapping
            # pointing at a file that was never created.
            _read_source(file_path).to_csv(file_path_new, index=False)
        RawName_to_dfRaw[RawName] = file_path_new

    # ---------- Step 3: merge UserDetail and Patient into Ptt
    RawName_to_dfRaw.pop('UserDetail', None)
    RawName_to_dfRaw.pop('Patient', None)

    UserDetail_file = _first_unprocessed('UserDetail')
    print(UserDetail_file)
    df_UserDetail = _read_source(UserDetail_file).rename(columns={'UserID': 'PatientID'})
    print(df_UserDetail.columns)

    Patient_file = _first_unprocessed('Patient')
    print(Patient_file)
    df_Patient = _read_source(Patient_file)
    print(df_Patient.columns)

    # Outer join keeps patients that appear in only one of the two tables
    df_Ptt = pd.merge(df_Patient, df_UserDetail, on='PatientID', how='outer')
    _save('Ptt', df_Ptt)

    ############################## Diet Information #################################
    # The food-item table holds several rows per CarbEntry, so the items are
    # aggregated per (PatientID, CarbEntryID) before joining the carb entries.
    ID_to_Type = {
        1: 'BeforeBreakfast',
        2: 'AfterBreakfast',
        3: 'BeforeLunch',
        4: 'AfterLunch',
        5: 'BeforeDinner',
        6: 'AfterDinner',
        7: 'Bedtime',
        8: 'BeforeExercise',
        9: 'AfterExercise',
        12: 'Snack',
        14: 'Fasting',
    }
    nutrient_columns = [
        'Carbs', 'Fiber', 'Fat', 'Calories', 'Protein', 'Sodium',
        'SaturatedFat', 'PolyUnSaturatedFat', 'MonoUnSaturatedFat', 'TransFat',
        'Cholesterol', 'Potassium', 'Sugar', 'AddedSugars',
    ]

    df_food = _read_source(_first_source('ELogFoodItem'), low_memory=False)
    print(df_food.shape, '<-- df_food.shape')
    # Only the grouping keys, the food name and the nutrients reach the output
    df_food = df_food[['PatientID', 'CarbEntryID', 'FoodName'] + nutrient_columns]
    print(df_food['CarbEntryID'].nunique(), '<-- df_food carbentryid')

    df_carbs = _read_source(_first_source('ELogCarbsEntry'), low_memory=False)
    print(df_carbs.shape, '<-- df_carbs.shape')
    carbs_columns = [
        'PatientID', 'EntryID',
        'ActivityTypeID',
        'ObservationDateTime', 'ObservationEntryDateTime',
        'EntryCreatedDateTime', 'ModifiedDateTime',
        'TimezoneOffset', 'Timezone',
        'CarbsValue',
    ]
    # .copy(): the edits below must not target a slice of the raw table
    df_carbs = df_carbs[carbs_columns].copy()
    print(df_carbs['ActivityTypeID'].value_counts())
    df_carbs['ActivityType'] = df_carbs['ActivityTypeID'].map(ID_to_Type)
    del df_carbs['ActivityTypeID']
    # EntryID in the carb table is the key the food items call CarbEntryID
    df_carbs = df_carbs.rename(columns={'EntryID': 'CarbEntryID'})
    print(df_carbs.shape, '<-- df_carbs.shape')
    print(df_carbs['CarbEntryID'].nunique(), '<-- df_carbs carbentryid')

    df_food_by_meal = df_food.groupby(['PatientID', 'CarbEntryID']).agg({
        # Missing food names are skipped instead of breaking str.join
        'FoodName': lambda names: "; ".join(names.dropna().astype(str)),
        **{col: 'sum' for col in nutrient_columns},
    })
    # Reset the index exactly once: a second reset would add a meaningless
    # 'index' column that ends up in the Diet file.
    df_food_by_meal = df_food_by_meal.reset_index()
    print(df_food_by_meal.shape, '<-- df_food_by_meal.shape')

    # Inner join: only carb entries with at least one food item are kept
    df_diet = pd.merge(df_carbs, df_food_by_meal, on=['PatientID', 'CarbEntryID'])
    print(df_diet.shape, '<-- df_diet.shape', 'the total number of diet records')
    _save('Diet', df_diet)

    ############################ add the medication ##############################
    medadmin_path = _first_source('MedAdmin')
    print(medadmin_path)
    df_med = _read_source(medadmin_path, low_memory=False)
    med_columns = [
        'PatientID', 'MedAdministrationID', 'AdministrationID', 'ELogEntryID',
        ###### time
        'AdministrationDate',
        'UserAdministrationDate',
        'EntryDateTime', 'CreatedDate', 'ModifiedDateTime',
        'AdministrationTimeZoneOffset', 'AdministrationTimeZone',
        ###### medication
        'MedicationID', 'Dose',
        ###### context
        'MedSourceID',
        'AdministrationTimeLabelID',
        'ActivityTypeID',
    ]
    # reset_index() stores the row position as an 'index' column in the output.
    # NOTE(review): kept as in the existing output format - confirm it is wanted.
    df_med = df_med[med_columns].reset_index()
    _save('MedAdmin', df_med)

    ############################# add the exercise #############################
    exercise_path = _first_source('ELogExercise')
    print(exercise_path)
    df_exercise = _read_source(exercise_path, low_memory=False)
    print(df_exercise.shape)
    exercise_columns = [
        'PatientID',
        'ObservationDateTime', 'ObservationEntryDateTime',
        'TimezoneOffset', 'Timezone',
        'ExerciseType', 'ExerciseIntensity', 'TimeSinceExercise',
        'ActivityTypeID',
        'ExerciseDuration',
        'CaloriesBurned',
        'DistanceInMeters',
    ]
    df_exercise = df_exercise[exercise_columns].copy()

    # Codes without an entry (and code 0) become missing values
    id_to_intensity = {
        0: None,
        1: 'High',
        2: 'Moderate',
        3: 'Low',
    }
    df_exercise['ExerciseIntensity'] = df_exercise['ExerciseIntensity'].map(id_to_intensity)

    id_to_exercise_type = {
        100: 'Walking',
        101: 'Running',
        102: 'Hiking',
        103: 'Bicycling',
        104: 'Swimming',
        105: 'Strength_training',
        106: 'Home_activities',
        107: 'Gardening__Lawn',
        108: 'Dancing__Aerobics',
        109: 'Skiing__Skating',
        110: 'Yoga_Pilates',
        111: 'Other',
        1: 'Cardiovascular',
        2: 'StrengthTraining',
        3: 'Sports',
        4: 'FitnessClass',
        5: 'YogaPilates',
    }
    # Unknown codes are kept as they are instead of being turned into NaN
    df_exercise['ExerciseType'] = df_exercise['ExerciseType'].apply(
        lambda code: id_to_exercise_type.get(code, code)
    )

    # NOTE(review): these labels spell 'BreakFast' with a capital F, unlike the
    # 'Breakfast' labels used for the diet records - confirm which is wanted.
    id_to_activity_type = {
        # 0: None,
        1: 'BeforeBreakFast',
        2: 'AfterBreakFast',
        3: 'BeforeLunch',
        4: 'AfterLunch',
        5: 'BeforeDinner',
        6: 'AfterDinner',
        7: 'Bedtime',
        8: 'BeforeExercise',
        9: 'AfterExercise',
        12: 'Snack',
        14: 'Fasting',
        31: 'JustChecking',
    }
    df_exercise['ActivityType'] = df_exercise['ActivityTypeID'].apply(
        lambda code: id_to_activity_type.get(code, code)
    )
    print(df_exercise.shape, '<-- df_exercise.shape')
    _save('Exercise', df_exercise)

    return RawName_to_dfRaw

process_Source_to_Raw.fn_string = inspect.getsource(process_Source_to_Raw)
# %%%%%%%%%%%%%%%%%%%%% user

RawName_to_dfRaw = process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile,SPACE)    
RawName_to_dfRaw

# Step 5: Save Cohort Fn

In [None]:
# Get the python file path from the cohort object 
pypath = cohort.pypath
pypath

In [None]:
prefix = [
    'import os',
    'import pandas as pd', 
    'import numpy as np'
    ]
prefix

In [None]:
# List of variables to be included in the generated script
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
iterative_variables

In [None]:
from recfldtkn.base import Base
fn_variables = [get_RawName_from_SourceFile, process_Source_to_Raw]
pycode= Base.convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                           fn_variables = fn_variables, 
                                           prefix = prefix)
# print(pycode)

In [None]:
# Make sure the target folder exists before writing the generated module;
# a pypath without a directory part needs no folder to be created.
pydir = os.path.dirname(pypath)
if pydir:
    os.makedirs(pydir, exist_ok=True)
with open(pypath, 'w') as file:
    file.write(pycode)

In [None]:
from recfldtkn.base import Base
# Path of the Python file to generate, taken from the cohort object
pypath = cohort.pypath

# Import statements placed at the beginning of the generated file
prefix = [
    'import os',
    'import pandas as pd',
    'import numpy as np'
    ]
# Variables to be included in the generated script
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
# Functions to be included in the generated script
fn_variables = [get_RawName_from_SourceFile, process_Source_to_Raw]
pycode = Base.convert_variables_to_pystirng(iterative_variables = iterative_variables,
                                           fn_variables = fn_variables,
                                           prefix = prefix)

# Create the folder of the Python file if needed. exist_ok avoids failing when
# it already exists, and a pypath without a directory part is left alone
# (os.makedirs('') would raise).
pydir = os.path.dirname(pypath)
if pydir:
    os.makedirs(pydir, exist_ok=True)

# print(pypath)
with open(pypath, 'w') as file:
    file.write(pycode)

# Show a link to the generated file in the notebook output
full_path = os.path.join(WORKSPACE_PATH, pypath)

display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Step 6: Test 

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort
from config.config_record.Cohort import CohortName_to_OneCohortArgs
CohortNames = [i for i in CohortName_to_OneCohortArgs.keys()]

# # %%%%%%%%%%%%%%%%%%%%% user
# CohortName = 'WellDoc2022CGM'
# CohortName = 'WellDoc2023CVSDeRx'
# # %%%%%%%%%%%%%%%%%%%%% 
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort_fn

In [None]:
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort

In [None]:
cohort.setup_fn(cohort_fn)

In [None]:
# End-to-end check: rebuild the cohort through CohortFn and initialise it
from recfldtkn.record_base.cohort import CohortFn, Cohort


# Name under which the cohort function is looked up, taken from the cohort arguments
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort.setup_fn(cohort_fn)

# load_data=False is passed here; the next cells inspect RawName_to_dfRaw and SourceFile_List
cohort.initialize_cohort(load_data=False)

In [None]:
cohort.RawName_to_dfRaw


In [None]:
cohort.SourceFile_List

In [None]:
cohort.RawName_to_dfRaw