# Space

In [None]:
import sys
import os 
import logging
import pandas as pd
from pprint import pprint 
from IPython.display import display, HTML

# The workspace root is the part of the current working directory that precedes KEY.
KEY = '1-WORKSPACE'
WORKSPACE_PATH = os.getcwd().partition(KEY)[0]
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s',
)

# Folder layout used throughout the notebook; all paths are relative to the workspace root.
SPACE = dict(
    DATA_RAW='./_Data/0-Data_Raw',
    DATA_RFT='./_Data/1-Data_RFT',
    DATA_CASE='./_Data/2-Data_CASE',
    DATA_AIDATA='./_Data/3-Data_AIDATA',
    DATA_EXTERNAL='./code/external',
    CODE_FN='./code/pipeline',
)
assert os.path.exists(SPACE['CODE_FN']), SPACE['CODE_FN'] + ' not found'

# Make the pipeline code importable from this notebook.
print(SPACE['CODE_FN'])
sys.path.append(SPACE['CODE_FN'])

# Step 1: OneCohort_Args



This step is foundational for the rest of the notebook, as it ensures that the correct settings and parameters are in place for processing the cohort's data.

In [None]:

# Registry that maps each cohort name to its argument dictionary
from config.config_record.Cohort import CohortName_to_OneCohortArgs

# Show which cohorts are registered
cohort_names = [name for name in CohortName_to_OneCohortArgs.keys()]
print("Available Cohorts:", cohort_names)

# Pick the cohort to work on (alternative: 'WellDoc2023CVSDeRx') and look up its arguments
selected_cohort = 'OhioT1DM'
cohort_args = CohortName_to_OneCohortArgs[selected_cohort]
print("Selected Cohort Arguments:", cohort_args)

In [None]:
from recfldtkn.record_base.cohort import Cohort   # Cohort is a class

# Placeholder: no cohort function is available yet at this stage of the notebook
cohort_fn = None
# Build the Cohort object from the arguments selected in the previous cell.
# `cohort_args` is used here because `OneCohort_Args` has not been assigned yet
# when the notebook is run top to bottom on a fresh kernel.
cohort = Cohort(cohort_args, SPACE, cohort_fn)
# update_cohort_args receives the argument dictionary and SPACE; its return value
# becomes `OneCohort_Args`, the name used by every later cell.
OneCohort_Args = cohort.update_cohort_args(cohort_args, SPACE)
# Pretty-print the updated cohort arguments, keeping the dictionary's own key order
pprint(OneCohort_Args, sort_dicts=False)

In [None]:
# cohort.RawName_to_dfRaw

# Step 2: Get Source Files
This code segment retrieves all files with a given suffix (in this case, `.xml`) from a specified folder and lists their paths. The folder path comes from the cohort arguments, and the suffix list is set by the user.

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
# File suffixes to search for; this cohort's source files are looked up by the 'xml' suffix
SourceFile_SuffixList = ['xml'] 
# %%%%%%%%%%%%%%%%%%%%% user

# Folder that holds the source files, read from the 'SourcePath' entry of OneCohort_Args
Folder = OneCohort_Args['SourcePath'] 

# Ask the cohort object for the source files in Folder that match the suffix list
SourceFile_List = cohort.get_SourceFile_List(Folder, SourceFile_SuffixList)

# Display the list of source files (last expression of the cell)
SourceFile_List

# Step 3: Get RawName from SourceFile

In [None]:
import inspect

# %%%%%%%%%%%%%%%%%%%%% user
def get_RawName_from_SourceFile(file_path, OneCohort_Args):
    """
    Derive a RawName from a source file path.

    The result is the text that follows the last underscore in ``file_path``
    (the whole path when there is no underscore), cut at the first dot found
    in that text. ``OneCohort_Args`` is accepted but not read.

    The original author marked this function as not useful for this cohort.
    """
    after_last_underscore = file_path.rsplit('_', 1)[-1]
    RawName = after_last_underscore.partition('.')[0]
    return RawName

# Store the function's own source text on the function as `fn_string`
get_RawName_from_SourceFile.fn_string = inspect.getsource(get_RawName_from_SourceFile)
# %%%%%%%%%%%%%%%%%%%%% 

# Try the function on the first source file and show the input path and the resulting RawName
file_path = SourceFile_List[0]
print(type(file_path))
RawName = get_RawName_from_SourceFile(file_path, OneCohort_Args)
print(file_path)
print(RawName)

In [None]:
# Display the list of source files again for reference
SourceFile_List

# Step 4: Process Source to Raw

## 0. get tables

In [None]:
import xmltodict
import json
from collections import defaultdict

# Use the first source file as the sample to inspect
xml_file = SourceFile_List[0]


# Lesson learned: display the XML file content so that Cursor can understand the data structure
# Read the XML file and parse it with xmltodict
with open(xml_file, 'r') as file:
    xml_content = file.read()
    data = xmltodict.parse(xml_content)

# Display the parsed content (last expression of the cell)
data

In [None]:
# Analyze the structure of the XML data
def analyze_xml_structure(data):
    """
    Summarise the layout of the parsed XML data and print that summary.

    Args:
        data (dict): The parsed XML data (nested dictionaries). Only the
            top-level 'patient' entry is inspected.

    Returns:
        dict: A summary of the data structure. When 'patient' holds a dict it
        contains:
            - 'patient_attributes': keys of the patient entry starting with '@'
            - 'patient_sections': all other keys of the patient entry
            - '<section>_event_attributes' / '<section>_event_count' for every
              section that is a dict with an 'event' entry
        Otherwise the summary is empty.
    """
    structure = {}

    # Only a dict-valued 'patient' entry can be analysed; anything else
    # (missing key, None, plain text) yields an empty summary instead of a crash.
    patient_data = data['patient'] if 'patient' in data else None
    if isinstance(patient_data, dict):
        structure['patient_attributes'] = [k for k in patient_data if k.startswith('@')]
        structure['patient_sections'] = [k for k in patient_data if not k.startswith('@')]

        # Analyze each section in patient data
        for section in structure['patient_sections']:
            section_data = patient_data[section]
            if not (isinstance(section_data, dict) and 'event' in section_data):
                continue

            events = section_data['event']
            if not isinstance(events, list):
                events = [events]  # a single event is not wrapped in a list

            if events:
                # The first event serves as the sample for the attribute names.
                # A non-dict sample (e.g. None) has no attributes to list.
                sample_event = events[0]
                structure[f'{section}_event_attributes'] = (
                    list(sample_event.keys()) if isinstance(sample_event, dict) else []
                )
                structure[f'{section}_event_count'] = len(events)

    # Print summary of the data structure
    print("XML Data Structure Analysis:")
    print("-" * 50)

    if 'patient_attributes' in structure:
        print(f"Patient Attributes: {', '.join(structure['patient_attributes'])}")

    if 'patient_sections' in structure:
        print(f"\nPatient Sections: {', '.join(structure['patient_sections'])}")

        for section in structure['patient_sections']:
            if f'{section}_event_count' in structure:
                print(f"\n  {section.capitalize()} Section:")
                print(f"    - Events: {structure[f'{section}_event_count']}")
                print(f"    - Event Attributes: {', '.join(structure[f'{section}_event_attributes'])}")

    return structure

# Execute the analysis
# structure_summary = analyze_xml_structure(data)

In [None]:
import inspect 

def convert_ohio_xml_to_dataframes(data, xml_path=None):
    """
    Convert OhioT1DM XML data into pandas DataFrames for each section.

    The 'patient' entry of ``data`` is split into:
      * 'patient_info': one row holding the patient's '@' attributes (with the
        '@' removed) plus a 'patient_id' column. When ``xml_path`` is given,
        'year', 'dataset_type' and 'file_path' are added as well.
      * one DataFrame per section that is a dict with an 'event' entry: one row
        per event, '@' removed from the attribute names, plus 'patient_id'.
        The columns 'ts', 'tbegin' and 'tend' are parsed as datetimes with the
        format '%d-%m-%Y %H:%M:%S'; values that do not match become NaT.

    Args:
        data (dict): The parsed XML data from OhioT1DM dataset
        xml_path (str, optional): Path to the XML file. The year is taken from
            a 4-digit folder name in the path, and the dataset type from
            '/test/' or '-testing' (test) or '/train/' or '-training' (train).
            For test/train files the patient id gets the suffix '_test' / '_train'.

    Returns:
        dict: Dictionary mapping section names to pandas DataFrames. Empty when
        ``data`` has no usable 'patient' entry.
    """
    # Imported locally so the function does not rely on module-level imports
    # beyond `pd` when its source text (see `fn_string`) is reused elsewhere.
    import os
    import re

    dataframes = {}

    if 'patient' not in data or not isinstance(data['patient'], dict):
        print("No patient data found in the XML")
        return dataframes

    patient = data['patient']
    patient_id = patient.get('@id', 'unknown')

    # Extract patient attributes
    patient_attrs = {k.replace('@', ''): v for k, v in patient.items() if k.startswith('@')}

    if xml_path:
        # Match against a '/'-separated copy so the patterns below also work
        # where os.sep is not '/'. The stored file_path keeps the original text.
        normalized_path = xml_path.replace(os.sep, '/')

        year_match = re.search(r'/(\d{4})/', normalized_path)
        patient_attrs['year'] = year_match.group(1) if year_match else 'unknown'

        # Extract dataset type (test or train)
        if '/test/' in normalized_path or '-testing' in normalized_path:
            dataset_type = 'test'
        elif '/train/' in normalized_path or '-training' in normalized_path:
            dataset_type = 'train'
        else:
            dataset_type = 'unknown'
        patient_attrs['dataset_type'] = dataset_type

        # The same patient can appear in a test and a train file; the suffix keeps them apart.
        if dataset_type != 'unknown':
            patient_id = f'{patient_id}_{dataset_type}'

        patient_attrs['patient_id'] = patient_id
        # Add the full file path for reference
        patient_attrs['file_path'] = xml_path
    else:
        # Without a path there is nothing to derive, but patient_info still gets
        # the same 'patient_id' key column that every event table carries.
        patient_attrs['patient_id'] = patient_id

    dataframes['patient_info'] = pd.DataFrame([patient_attrs])

    # Process each section
    datetime_columns = ('ts', 'tbegin', 'tend')
    for section_name, section_data in patient.items():
        if section_name.startswith('@') or not isinstance(section_data, dict):
            continue
        if 'event' not in section_data:
            continue

        events = section_data['event']
        if not isinstance(events, list):
            events = [events]  # Convert single event to list

        # Extract all events into a list of dictionaries
        events_list = []
        for event in events:
            if not isinstance(event, dict):
                # Nothing to extract (presumably an attribute-less <event/>, parsed as None)
                continue
            event_dict = {k.replace('@', ''): v for k, v in event.items()}
            event_dict['patient_id'] = patient_id  # Add patient ID to each event
            events_list.append(event_dict)

        if not events_list:
            continue

        section_df = pd.DataFrame(events_list)

        # Convert datetime-like columns to datetime
        for col in datetime_columns:
            if col in section_df.columns:
                section_df[col] = pd.to_datetime(section_df[col], format='%d-%m-%Y %H:%M:%S', errors='coerce')

        dataframes[section_name] = section_df

    return dataframes

# Store the function's own source text on the function as `fn_string`
convert_ohio_xml_to_dataframes.fn_string = inspect.getsource(convert_ohio_xml_to_dataframes)


# Example usage
# if 'patient' in data:
#     print("\nConverting XML data to DataFrames...")
#     dfs = convert_ohio_xml_to_dataframes(data, xml_path=xml_file)
    
#     # Display summary of created DataFrames
#     print(f"\nCreated {len(dfs)} DataFrames:")
#     for section_name, df in dfs.items():
#         print(f"  - {section_name}: {df.shape[0]} rows, {df.shape[1]} columns")
#         if not df.empty:
#             print(f"    Columns: {', '.join(df.columns.tolist())}")
#             print(f"    Sample data (first 5 row):")
#             display(df.head(5))


In [None]:
# SourceFile_List

In [None]:
# Process all XML files and combine the data into consolidated DataFrames
import os
import pandas as pd
from collections import defaultdict

# Per section name, the list of DataFrames collected from all files
combined_dfs = defaultdict(list)

for xml_file in SourceFile_List:
    try:
        # Read the raw XML text
        with open(xml_file, 'r') as f:
            xml_content = f.read()

        # Parse it into nested dictionaries
        import xmltodict
        data = xmltodict.parse(xml_content)

        # Files without a patient entry contribute nothing
        if 'patient' not in data:
            continue

        patient_dfs = convert_ohio_xml_to_dataframes(data, xml_path=xml_file)

        # Keep every non-empty table under its section name
        for section_name, df in patient_dfs.items():
            if df.empty:
                continue
            combined_dfs[section_name].append(df)
    except Exception as e:
        print(f"Error processing {xml_file}: {str(e)}")

# Stack the per-file tables of each section into one table and report its size
final_dfs = {}
for section_name, df_list in combined_dfs.items():
    if not df_list:
        continue
    table = pd.concat(df_list, ignore_index=True)
    final_dfs[section_name] = table
    n_rows, n_cols = table.shape
    print(f"tableL: {section_name}")
    print(f"{n_rows} rows, \t{n_cols} columns, \t patient unique number: {table['patient_id'].nunique()}")


In [None]:
# [i for i in final_dfs]

## 1. patient_info --> Patient

In [None]:
# df = final_dfs['patient_info']

######### build the Patient table from the patient_info section
df = final_dfs['patient_info']

RawName = 'Patient'
raw_columns = [
    'PatientID', 'MRSegmentID', 'MRSegmentModifiedDateTime', 'DiseaseType',
    'Gender', 'ActivationDate', 'UserTimeZoneOffset', 'UserTimeZone',
    'Description', 'YearOfBirth',
]

# Rename the id column, lay the columns out as raw_columns, then fill the fixed values
df = (
    df
    .rename(columns={'patient_id': 'PatientID'})
    .reindex(columns=raw_columns)
    .assign(DiseaseType=1, UserTimeZoneOffset=0)
)
df.head()

## 2. exercise --> ELogExerciseEntry

In [None]:
# df = final_dfs['patient_info']

######### build the ELogExerciseEntry table from the exercise section
df = final_dfs['exercise']
display(df.head())
print(df.columns)

# print(df['type'].value_counts())

RawName = 'ELogExerciseEntry'

raw_columns = [
    'ExerciseEntryID', 'PatientID', 'EntryID',
    'ExerciseDuration', 'ExerciseType', 'ExerciseIntensity',
    'TimeSinceExercise', 'EntrySourceID', 'ActivityTypeID',
    'ObservationDateTime', 'ObservationEntryDateTime',
    'TimezoneOffset', 'Timezone', 'EntryCreatedDateTime',
    'ObservationCreatedBy', 'ObservationStatus',
    'SourceReferenceID', 'ModifiedDateTime', 'CaloriesBurned',
    'DistanceInMeters', 'ExternalEntryID', 'ExternalSourceID',
]

column_renames = {
    'patient_id': 'PatientID',
    'carbs': 'CarbsValue',
    'ts': 'ObservationDateTime',
    'intensity': 'ExerciseIntensity',
    'duration': 'ExerciseDuration',
}

# Rename, lay the columns out as raw_columns, then fill the derived and fixed columns
df = (
    df
    .rename(columns=column_renames)
    .reindex(columns=raw_columns)
    .assign(
        ExerciseEntryID=lambda t: t.index,
        ObservationEntryDateTime=lambda t: pd.to_datetime(t['ObservationDateTime']),
        TimezoneOffset=0,
    )
)
df.head()

## 3. meal --> ELogCarbsEntry

In [None]:
# df = final_dfs['patient_info']

######### build the ELogCarbsEntry table from the meal section
df = final_dfs['meal']
print(df.columns)
display(df.head())

print(df['type'].value_counts())


# TODO: check with Abhi for the Type2ActivityID, where is Breakfast, Lunch, Dinner, etc.
Type2ActivityID_string = '''
BeforeBreakFast = 1,
AfterBreakFast = 2,
BeforeLunch = 3,
AfterLunch = 4,
BeforeDinner = 5,
AfterDinner = 6,
Bedtime = 7,
BeforeExercise = 8,
AfterExercise = 9,
Snack = 12,
Fasting = 14,
JustChecking = 31,
'''
# Parse the "Name = number" entries of the text above into a name -> number dictionary
Type2ActivityID = {
    label.strip(): int(code)
    for label, code in (
        entry.split('=')[:2]
        for entry in Type2ActivityID_string.split(',\n')
        if '=' in entry
    )
}
# Type2ActivityID


RawName = 'ELogCarbsEntry'
raw_columns = [
    'PatientID', 'CarbsEntryID', 'EntryID', 'CarbsValue',
    'EntrySourceID', 'ActivityTypeID', 'ObservationDateTime',
    'ObservationEntryDateTime', 'TimezoneOffset', 'Timezone',
    'EntryCreatedDateTime', 'ObservationCreatedBy',
    'ObservationStatus', 'SourceReferenceID', 'ModifiedDateTime',
    'ExternalSourceID', 'ExternalEntryID', 'TotalCalories',
]

column_renames = {
    'patient_id': 'PatientID',
    'carbs': 'CarbsValue',
    'ts': 'ObservationDateTime',
}

# Rename, lay the columns out as raw_columns, then fill the derived and fixed columns
df = (
    df
    .rename(columns=column_renames)
    .reindex(columns=raw_columns)
    .assign(
        CarbsEntryID=lambda t: t.index,
        ObservationDateTime=lambda t: pd.to_datetime(t['ObservationDateTime']),
    )
    .assign(
        EntryCreatedDateTime=lambda t: pd.to_datetime(t['ObservationDateTime']),
        TimezoneOffset=0,
    )
)
df.head()

## 4. glucose_level --> ElogBGEntry

In [None]:
######### build the ElogBGEntry table from the glucose_level section
df = final_dfs['glucose_level']

RawName = 'ElogBGEntry'
raw_columns = [
    'BGEntryID', 'PatientID', 'ObservationDateTime', 'BGValue',
    'IsNormalIndicator', 'ObservationEntryDateTime', 'TimezoneOffset',
    'Timezone', 'EntryCreatedDateTime', 'ActualBGValue',
    'ExternalSourceID', 'UserObservationDateTime',
]

column_renames = {'patient_id': 'PatientID', 'value': 'BGValue', 'ts': 'ObservationDateTime'}

# Rename, lay the columns out as raw_columns, then fill the derived and fixed columns
df = (
    df
    .rename(columns=column_renames)
    .reindex(columns=raw_columns)
    .assign(
        BGEntryID=lambda t: t.index,
        TimezoneOffset=0,
        ExternalSourceID=18,
        EntryCreatedDateTime=lambda t: pd.to_datetime(t['ObservationDateTime']),
    )
)
df.head()

## process_Source_to_Raw

In [None]:
# %%%%%%%%%%%%%%%%%%%%% user
def process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile,SPACE):
    """
    Convert the XML source files into four raw tables and save each one as a CSV file.

    Every file in ``SourceFile_List`` is parsed with xmltodict and split into
    per-section DataFrames by ``convert_ohio_xml_to_dataframes``. The sections of
    all files are concatenated, and four of them are reshaped to a fixed column
    layout (columns with no source data stay empty):

        patient_info  -> Patient
        exercise      -> ELogExerciseEntry
        meal          -> ELogCarbsEntry
        glucose_level -> ElogBGEntry

    Each table is written to
    ``<OneCohort_Args['FolderPath']>/processed_RawFile_<RawName>.csv``.

    A file that cannot be read, parsed or converted is reported with ``print``
    and skipped. An event section that is absent from every file produces an
    empty table with the expected columns.

    Args:
        OneCohort_Args (dict): Dictionary containing processing arguments; only
            'FolderPath' (the output folder) is read.
        SourceFile_List (list): List of source file paths.
        get_RawName_from_SourceFile (function): Not used by this implementation;
            kept so the signature stays the same.
        SPACE (dict): Not used by this implementation; kept so the signature
            stays the same.

    Returns:
        dict: Mapping of raw names to the paths of the written CSV files.

    Raises:
        KeyError: If 'FolderPath' is missing from ``OneCohort_Args``, or if no
            source file could be converted (no 'patient_info' section at all).
    """
    # Imported inside the function: only its own source text (see `fn_string`)
    # travels with it. Importing xmltodict up front also makes a missing
    # dependency fail once and loudly instead of once per file inside the try.
    from collections import defaultdict
    import xmltodict

    # Read early so a missing key is reported before any file is parsed
    output_folder = OneCohort_Args['FolderPath']

    # One entry per output table, in output order.
    #   section:       section name produced by convert_ohio_xml_to_dataframes
    #   rename:        source column -> raw column
    #   id_column:     raw column filled with the row index (None = no such column)
    #   constants:     raw columns filled with a fixed value
    #   datetime_copy: (target, source) -> target = pd.to_datetime(source column)
    # TODO: check with Abhi how the meal types (Breakfast, Lunch, Dinner, ...) map to
    #       ActivityTypeID; ActivityTypeID is not filled in here.
    table_specs = [
        {
            'section': 'patient_info',
            'RawName': 'Patient',
            'raw_columns': ['PatientID', 'MRSegmentID', 'MRSegmentModifiedDateTime', 'DiseaseType',
                            'Gender', 'ActivationDate', 'UserTimeZoneOffset', 'UserTimeZone',
                            'Description', 'YearOfBirth'],
            'rename': {'patient_id': 'PatientID'},
            'id_column': None,
            'constants': {'DiseaseType': 1, 'UserTimeZoneOffset': 0},
            'datetime_copy': None,
        },
        {
            'section': 'exercise',
            'RawName': 'ELogExerciseEntry',
            'raw_columns': ['ExerciseEntryID', 'PatientID', 'EntryID',
                            'ExerciseDuration', 'ExerciseType', 'ExerciseIntensity',
                            'TimeSinceExercise', 'EntrySourceID', 'ActivityTypeID',
                            'ObservationDateTime', 'ObservationEntryDateTime',
                            'TimezoneOffset', 'Timezone', 'EntryCreatedDateTime',
                            'ObservationCreatedBy', 'ObservationStatus',
                            'SourceReferenceID', 'ModifiedDateTime', 'CaloriesBurned',
                            'DistanceInMeters', 'ExternalEntryID', 'ExternalSourceID'],
            'rename': {
                'patient_id': 'PatientID',
                'ts': 'ObservationDateTime',
                'intensity': 'ExerciseIntensity',
                'duration': 'ExerciseDuration',
            },
            'id_column': 'ExerciseEntryID',
            'constants': {'TimezoneOffset': 0},
            'datetime_copy': ('ObservationEntryDateTime', 'ObservationDateTime'),
        },
        {
            'section': 'meal',
            'RawName': 'ELogCarbsEntry',
            'raw_columns': ['PatientID', 'CarbsEntryID', 'EntryID', 'CarbsValue',
                            'EntrySourceID', 'ActivityTypeID', 'ObservationDateTime',
                            'ObservationEntryDateTime', 'TimezoneOffset', 'Timezone',
                            'EntryCreatedDateTime', 'ObservationCreatedBy',
                            'ObservationStatus', 'SourceReferenceID', 'ModifiedDateTime',
                            'ExternalSourceID', 'ExternalEntryID', 'TotalCalories'],
            'rename': {
                'patient_id': 'PatientID',
                'carbs': 'CarbsValue',
                'ts': 'ObservationDateTime',
            },
            'id_column': 'CarbsEntryID',
            'constants': {'TimezoneOffset': 0},
            'datetime_copy': ('EntryCreatedDateTime', 'ObservationDateTime'),
        },
        {
            'section': 'glucose_level',
            'RawName': 'ElogBGEntry',
            'raw_columns': ['BGEntryID', 'PatientID', 'ObservationDateTime', 'BGValue',
                            'IsNormalIndicator', 'ObservationEntryDateTime', 'TimezoneOffset',
                            'Timezone', 'EntryCreatedDateTime', 'ActualBGValue',
                            'ExternalSourceID', 'UserObservationDateTime'],
            'rename': {'patient_id': 'PatientID', 'value': 'BGValue', 'ts': 'ObservationDateTime'},
            'id_column': 'BGEntryID',
            'constants': {'TimezoneOffset': 0, 'ExternalSourceID': 18},
            'datetime_copy': ('EntryCreatedDateTime', 'ObservationDateTime'),
        },
    ]

    def _build_raw_table(section_df, spec):
        """Rename, reorder and fill one combined section according to its table spec."""
        # reindex returns a new frame holding exactly the raw columns, in order;
        # columns without a source counterpart are created empty.
        table = section_df.rename(columns=spec['rename']).reindex(columns=spec['raw_columns'])
        if spec['id_column'] is not None:
            table[spec['id_column']] = table.index
        for column, value in spec['constants'].items():
            table[column] = value
        if spec['datetime_copy'] is not None:
            target, source = spec['datetime_copy']
            table[target] = pd.to_datetime(table[source])
        return table

    # ---------- parse every source file and collect its tables per section
    combined_dfs = defaultdict(list)
    for xml_file in SourceFile_List:
        try:
            with open(xml_file, 'r') as f:
                data = xmltodict.parse(f.read())

            if 'patient' not in data:
                continue

            patient_dfs = convert_ohio_xml_to_dataframes(data, xml_path=xml_file)
            for section_name, df in patient_dfs.items():
                if not df.empty:
                    combined_dfs[section_name].append(df)
        except Exception as e:
            # Best effort: one bad file must not stop the whole conversion
            print(f"Error processing {xml_file}: {str(e)}")

    final_dfs = {
        section_name: pd.concat(df_list, ignore_index=True)
        for section_name, df_list in combined_dfs.items()
        if df_list
    }

    # Every successfully converted file contributes a patient_info row, so its
    # absence means that nothing was converted at all.
    if 'patient_info' not in final_dfs:
        raise KeyError('patient_info: none of the source files could be converted')

    # ---------- reshape the four sections into their raw tables
    RawName_to_dfRaw = {}
    for spec in table_specs:
        section_df = final_dfs.get(spec['section'])
        if section_df is None:
            print(f"Section {spec['section']} not found in any source file; {spec['RawName']} will be empty")
            section_df = pd.DataFrame()
        RawName_to_dfRaw[spec['RawName']] = _build_raw_table(section_df, spec)

    # ---------- write one CSV per raw table and return the paths
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    RawName_to_path = {}
    for RawName, df in RawName_to_dfRaw.items():
        print(RawName, df.shape)
        print(df.columns)

        path = os.path.join(output_folder, f'processed_RawFile_{RawName}.csv')
        df.to_csv(path, index=False)
        RawName_to_path[RawName] = path

    return RawName_to_path

# Store the function's own source text on the function as `fn_string`
process_Source_to_Raw.fn_string = inspect.getsource(process_Source_to_Raw)
# %%%%%%%%%%%%%%%%%%%%% user

# Run the conversion and display what the function returned for each RawName
RawName_to_dfRaw = process_Source_to_Raw(OneCohort_Args, SourceFile_List, get_RawName_from_SourceFile,SPACE)    
RawName_to_dfRaw

# Step 5: Save Cohort Fn

In [None]:
# Read the Python file path from the cohort object's `pypath` attribute and display it
pypath = cohort.pypath
pypath

In [None]:
# Import statements passed as `prefix` when the code string is generated below
prefix = [
    'import os',
    'import pandas as pd', 
    'import numpy as np', 
    'import json'
    ]
prefix

In [None]:
# Variables to be included in the generated script: the cohort arguments and the suffix list
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
iterative_variables

In [None]:
from recfldtkn.base import Base
# Functions whose source goes into the generated script. process_Source_to_Raw calls
# convert_ohio_xml_to_dataframes, so that function has to be exported together with it.
fn_variables = [
    convert_ohio_xml_to_dataframes,
    get_RawName_from_SourceFile,
    process_Source_to_Raw,
]
pycode = Base.convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                            fn_variables = fn_variables, 
                                            prefix = prefix)
# print(pycode)


In [None]:
# Create the target folder first: open(..., 'w') does not create missing directories.
# `or '.'` covers a pypath that has no directory part.
os.makedirs(os.path.dirname(pypath) or '.', exist_ok=True)
with open(pypath, 'w') as file:
    file.write(pycode)

In [None]:
from recfldtkn.base import Base 
# Path of the Python file to generate, taken from the cohort object
pypath = cohort.pypath

# Import statements written at the beginning of the generated script
prefix = ['import os', 'import pandas as pd', 'import numpy as np']
# Variables to be included in the generated script
iterative_variables = [OneCohort_Args, SourceFile_SuffixList]
# Functions to be included in the generated script
fn_variables = [convert_ohio_xml_to_dataframes, get_RawName_from_SourceFile, process_Source_to_Raw]

pycode = Base.convert_variables_to_pystirng(
    iterative_variables=iterative_variables,
    fn_variables=fn_variables,
    prefix=prefix,
)

# Create the directory for the Python file if it doesn't exist
pypath_dir = os.path.dirname(pypath)
if not os.path.exists(pypath_dir):
    os.makedirs(pypath_dir)

# Write the generated code
with open(pypath, 'w') as file:
    file.write(pycode)

# Show the path together with a link that opens the file
full_path = os.path.join(WORKSPACE_PATH, pypath)
link_html = f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link_html))

# Step 6: Test 

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort
from config.config_record.Cohort import CohortName_to_OneCohortArgs
# Names of all cohorts in the configuration dictionary
CohortNames = list(CohortName_to_OneCohortArgs.keys())
CohortNames

In [None]:
# # %%%%%%%%%%%%%%%%%%%%% user
# Cohort to test (user setting)
CohortName = 'OhioT1DM'
# CohortName = 'WellDoc2023CVSDeRx'
# # %%%%%%%%%%%%%%%%%%%%% 

# Look up the argument dictionary of the chosen cohort and display it
OneCohort_Args = CohortName_to_OneCohortArgs[CohortName]
OneCohort_Args

In [None]:
# Build the CohortFn object from the 'Source2CohortName' entry of the cohort arguments and SPACE
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort_fn

In [None]:
# Build the Cohort object, this time passing the CohortFn object instead of None
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort

In [None]:
# Hand the CohortFn object to the cohort via setup_fn
cohort.setup_fn(cohort_fn)

In [None]:
from recfldtkn.record_base.cohort import CohortFn, Cohort


# Same steps as the three cells above, gathered in one place:
# build the CohortFn, build the Cohort, then register the function on it
Source2CohortName = OneCohort_Args['Source2CohortName']
cohort_fn = CohortFn(Source2CohortName, SPACE)
cohort = Cohort(OneCohort_Args, SPACE, cohort_fn)
cohort.setup_fn(cohort_fn)

# Initialize the cohort with load_data=False
cohort.initialize_cohort(load_data=False)

In [None]:
# Display the cohort's RawName_to_dfRaw attribute after initialization
cohort.RawName_to_dfRaw

In [None]:
# Display the cohort's SourceFile_List attribute
cohort.SourceFile_List

In [None]:
# Display the cohort's RawName_to_dfRaw attribute once more
cohort.RawName_to_dfRaw