# Space

In [1]:
# Notebook bootstrap: locate the workspace root, make it the working directory,
# expose the project code on sys.path, and configure logging.
import os
import logging
import pandas as pd 
from IPython.display import display, HTML
# Marker substring used to find the workspace root inside the current path.
KEY = 'WorkSpace'
# Keep everything up to and including the first occurrence of KEY in the cwd.
# NOTE(review): if KEY does not occur in the cwd, split() returns the whole
# path and this becomes cwd + 'WorkSpace' -- confirm the notebook is always
# started from inside the workspace tree.
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
print(WORKSPACE_PATH)
# Relative paths used later in the notebook are resolved from the workspace root.
os.chdir(WORKSPACE_PATH)

import sys
# Make the workspace root importable so that `proj_space` can be found
# (presumably it lives at the workspace root -- TODO confirm).
sys.path.append(WORKSPACE_PATH)
from proj_space import PROJECT, TaskName, SPACE
# Record the resolved root in the shared SPACE mapping.
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
# Make the code folder named by SPACE['CODE_FN'] importable as well
# (the `recfldtkn` imports in later cells presumably rely on this).
sys.path.append(SPACE['CODE_FN'])

# Folder holding the recfldtkn configuration files (e.g. Cohort.yaml).
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')

# INFO-level logging; each record shows level, time, source file, line and logger name.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

g:\Shared drives\CDHAI-WellDoc\2024-WellDocTest-SPACE\_WellDoc-RFT-WorkSpace


# [Part 1]: develop fn_humanrec.py

## [Step 1]: Create Cohort Yaml

In [2]:
# Build the location of Cohort.yaml inside the recfldtkn config folder and
# render a clickable link to it so the file can be opened for editing.
path = os.path.join(recfldtkn_config_path, 'Cohort.yaml')
# Anchor the path at the workspace root to get the link target.
full_path = os.path.join(WORKSPACE_PATH, path)
# `path` and `full_path` are reused by the link cell in Step 2.
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

## [Step 2]: Update Cohort Yaml

**Template**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  the folders where to save the data
CohortInfo:
  COHORT_NAME_XXXXXX: # <---- change this.
    cohort_label: 1
    FolderPath: $DATA_RAW$/COHORT_NAME_XXXXXX

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  where to get the raw Human data.
RawRootID: 'XXXXXXXXX' # <--- HumanID in Raw Data. 
RootID:  'XXXXXXXXX'        # <--- HumanID in RFT Data. 
```

In [3]:
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

## [Step 3] Cohort: Pick OneCohort

In [4]:
################### in notebook ###################  
args_information = ['--cohort_label', '3'] # < ------- change here
###################################################

In [5]:
import argparse
from recfldtkn.configfn import load_cohort_args

# Load the cohort-level configuration; this notebook reads its 'CohortInfo',
# 'pypath', 'RecName' and 'SPACE' entries.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

# Parse the selection the same way a command-line script would, but from the
# `args_information` list defined in the previous cell.
my_parser = argparse.ArgumentParser(description='Process Input.')

my_parser.add_argument('--cohort_name',
                    metavar='cohort_name',
                    default = None, 
                    type=str,
                    help='the cohort_name to process')

my_parser.add_argument('--cohort_label',
                    metavar='cohort_label',
                    default = None, 
                    type=str,
                    help='the label for cohort_name to process')

args = my_parser.parse_args(args_information)

# Resolve the (cohort_name, cohort_label) pair from whichever option was given.
if args.cohort_label is not None:
    # --cohort_label takes precedence when both options are supplied,
    # which matches the previous behaviour of this cell.
    cohort_label = int(args.cohort_label)
    matches = [v for v in cohort_args['CohortInfo'].values()
               if v['cohort_label'] == cohort_label]
    if not matches:
        known_labels = [v['cohort_label'] for v in cohort_args['CohortInfo'].values()]
        raise ValueError(f'Unknown cohort_label {cohort_label}; '
                         f'labels defined in CohortInfo: {known_labels}')
    cohort_name = matches[0]['cohort_name']
elif args.cohort_name is not None:
    # CohortInfo is keyed by cohort name, so the name can be looked up directly.
    cohort_name = args.cohort_name
    if cohort_name not in cohort_args['CohortInfo']:
        known_names = list(cohort_args['CohortInfo'])
        raise ValueError(f'Unknown cohort_name {cohort_name!r}; '
                         f'names defined in CohortInfo: {known_names}')
    cohort_label = int(cohort_args['CohortInfo'][cohort_name]['cohort_label'])
else:
    raise ValueError('Provide --cohort_label or --cohort_name in args_information.')

print(f'=============== {cohort_name}: {cohort_label} ======================')
print(cohort_name, cohort_label)


RawData2023_CVSDeRxAug 3


## [Step 4] OneCohort_Args

In [6]:
OneCohort_Args = cohort_args['CohortInfo'][cohort_name]
OneCohort_Args

{'cohort_label': 3,
 'cohort_name': 'RawData2023_CVSDeRxAug',
 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}

In [7]:
os.listdir(OneCohort_Args['FolderPath'])

['08_23_2023_MedAdministration.csv',
 '08_23_2023_CustomFood.csv',
 '08_23_2023_RecentElogBGEntry.csv',
 '08_23_2023_ImportedMedication.csv',
 '08_23_2023_ELogCarbsEntry.csv',
 '08_23_2023_CurriculumLessonProgressDetails.csv',
 '08_23_2023_CurriculumTopicProgressDetails.csv',
 '08_23_2023_CurriculumQuizResponse.csv',
 '08_23_2023_SleepEntry.csv',
 '08_23_2023_ElogKetoneEntry.csv',
 '08_23_2023_ElogBPEntry.csv',
 '08_23_2023_PatientReminder.csv',
 '08_23_2023_PatientLabTestResult.csv',
 '08_23_2023_Patient.csv',
 '08_23_2023_PatientTargetSegment.csv',
 '08_23_2023_UserDetail.csv',
 '08_23_2023_PatientBloodGlucoseTargets.csv',
 '08_23_2023_CurriculumSurveyResponse.csv',
 '08_23_2023_ELogSymptomTracker.csv',
 '08_23_2023_FavoriteFood.csv',
 '08_23_2023_StepEntry.csv',
 '08_23_2023_WeightMeter.csv',
 '08_23_2023_WeightGoal.csv',
 '08_23_2023_ElogBGEntry.csv',
 '08_23_2023_ElogWeightEntry.csv',
 '08_23_2023_ELogExerciseEntry.csv',
 '08_31_2023_DSMQuestionAnswer_CVS_Pull_De_Rx.csv',
 '08_23_

## [Step 5] selected_source_file_suffix_list

In [8]:
# File-name suffixes of the raw tables to pick up from the cohort folder.
# This list is one of the variables exported to the pipeline module in Step 9.
#########################
selected_source_file_suffix_list = ['csv']
#########################


# Preview of the filtering: keep only the files in the cohort folder whose
# name ends with one of the selected suffixes (a plain str.endswith match, so
# 'csv' matches any name ending in those three letters).
# this step will be conducted within the pipeline
selected_file_list = [file for file in os.listdir(OneCohort_Args['FolderPath']) 
                      if file.endswith(tuple(selected_source_file_suffix_list))]
print(len(selected_file_list))

41


In [9]:
selected_file_list[:10]

['08_23_2023_MedAdministration.csv',
 '08_23_2023_CustomFood.csv',
 '08_23_2023_RecentElogBGEntry.csv',
 '08_23_2023_ImportedMedication.csv',
 '08_23_2023_ELogCarbsEntry.csv',
 '08_23_2023_CurriculumLessonProgressDetails.csv',
 '08_23_2023_CurriculumTopicProgressDetails.csv',
 '08_23_2023_CurriculumQuizResponse.csv',
 '08_23_2023_SleepEntry.csv',
 '08_23_2023_ElogKetoneEntry.csv']

## [Step 6] Map Raw Table file to Raw Table name


TODO: 


eg: '05_12_2022_PatientCGMDeviceDetail.csv' to 'PatientCGMDeviceDetail'



In [10]:
import inspect
#########################
def get_tablename_from_raw_filename(file_path):
    """Derive the raw table name from a raw data file name.

    The table name is the text after the last underscore, cut at the first
    dot, e.g. '08_23_2023_MedAdministration.csv' -> 'MedAdministration'.
    Note that only the final underscore-separated segment is kept, so
    '08_31_2023_DSMQuestionAnswer_CVS_Pull_De_Rx.csv' -> 'Rx'.

    Args:
        file_path: file name (or path) of a raw table file.

    Returns:
        The table name as a string.
    """
    # Alternative naming scheme kept for reference:
    # name = file_path.split('/')[-1].split('_df_')[0]
    last_segment = file_path.rsplit('_', 1)[-1]
    tablename, _, _ = last_segment.partition('.')
    return tablename

get_tablename_from_raw_filename.fn_string = inspect.getsource(get_tablename_from_raw_filename)
#########################

In [11]:
file_path = selected_file_list[0]

tablename = get_tablename_from_raw_filename(file_path)
print(file_path)
print(tablename)

08_23_2023_MedAdministration.csv
MedAdministration


In [12]:
# TODO: add loop

## [Step 7]: get_rawrootid_column


TODO:

In [13]:
##################################################
def get_rawrootid_from_raw_table_column(raw_table_columns):
    """Pick the column that holds the raw root (patient) id of a raw table.

    The known spellings are tried in priority order -- 'PatientId', then
    'UserID', then 'PatientID' -- and the first one present is returned.

    NOTE(review): a table that has both 'PatientID' and 'UserID' resolves to
    'UserID' -- confirm this precedence is intended.

    Args:
        raw_table_columns: collection of column names of one raw table.

    Returns:
        The matching column name, or None when none of the known names
        is present.
    """
    for candidate in ('PatientId', 'UserID', 'PatientID'):
        if candidate in raw_table_columns:
            return candidate
    return None

get_rawrootid_from_raw_table_column.fn_string = inspect.getsource(get_rawrootid_from_raw_table_column)
##################################################


In [14]:
# Preview, for every selected raw file, its column names and the root-id
# column that get_rawrootid_from_raw_table_column picks for it.
for file_path in selected_file_list:
    print('\n')
    full_path = os.path.join(OneCohort_Args['FolderPath'], file_path)
    if full_path.endswith('.csv'):
        print(file_path, '<--- file_path')
        try:
            # Only the header is needed, so read just a few rows.
            df = pd.read_csv(full_path, nrows=5)
        except Exception as err:
            # Best-effort preview: report the failure with its cause and move
            # on to the next file (KeyboardInterrupt is no longer swallowed).
            print(f'Error reading {full_path}: {err!r}')
            continue

        raw_table_columns = df.columns.tolist()
        print(raw_table_columns, '<--- raw_table_columns')
        rawrootid = get_rawrootid_from_raw_table_column(raw_table_columns)
        print(rawrootid, '<--- rawrootid')

    elif full_path.endswith('.p'):
        raise ValueError(f'.p is not supported: {full_path}')

    else:
        raise ValueError(f'file type not supported: {full_path}')



08_23_2023_MedAdministration.csv <--- raw_table_columns
['AdministrationID', 'ELogEntryID', 'AdministrationDate', 'ActivityTypeID', 'Dose', 'MedSourceID', 'StatusID', 'EntryDateTime', 'CreatedDate', 'CreatedBy', 'RowVersionID', 'PatientID', 'MedPrescriptionID', 'AdministrationTimeZoneOffset', 'AdministrationTimeZone', 'PrescriptionGUID', 'MedAdministrationID', 'MedPrescriptionTime', 'MedicationID', 'AdministrationTimeLabelID', 'AdminSlot', 'ScheduledSlot', 'ModifiedDateTime', 'UserAdministrationDate', 'BGValue', 'CarbsValue', 'InsulinCalculatorUsageStatus', 'IOBValue', 'FoodInsulinDose', 'ExternalEntryID'] <--- file_path
PatientID <--- rawrootid


08_23_2023_CustomFood.csv <--- raw_table_columns
['CustomFoodID', 'PatientID', 'CustomFoodGUID', 'FoodName', 'CarbsValue', 'ServingType', 'ServingSize', 'StatusID', 'EntrySourceID', 'EntryDateTime', 'CreatedDate', 'ModifiedDate', 'RowVersionID', 'Fiber', 'Fat', 'Calories', 'Protein', 'Sodium', 'LastModifiedBy'] <--- file_path
PatientID <---

## [Step 8]: Exclude Raw Table when we select the Patients.


We only keep the patients who have at least one record.

But some tables should not be counted toward that total, for example the Patient table. 

eg. 

patient A: PatientTable 1, ATable 0, BTable 0, CTable 0. 

patient B: PatientTable 1, ATable 1, BTable 0, CTable 0


We don't want A because A's total record count, excluding PatientTable, is 0. 
We want B because B's total record count, excluding PatientTable, is larger than 0.


So we put `PatientTable` into `excluded_raw_table_names`.


In [15]:
# How to build this list:

# 1. Start with an empty list (the commented-out assignment below).
# 2. With the empty list, run the notebook to the end to obtain df_Human
#    (df_Human2RawRecNum).
# 3. Inspect df_Human and decide whether any table should be excluded.
# 4. If so, add the names of those tables to excluded_raw_table_names.

#########################
# excluded_raw_table_names = [] 
excluded_raw_table_names = ['UserDetail', 'Patient', 
                           'QuestionResponse', 'PatientBloodGlucoseTargets', 
                           'Rx', 'PatientObservationSummary', 
                           'PatientTargetSegment', 'TDC']
#########################

## [Step 9]: Save the above tools into the pipeline folder

In [16]:
# Collect the settings and helper functions defined in Part 1 and write them
# out as Python source to the file named by cohort_args['pypath'].
# NOTE(review): load_module_variables is imported here but not used in this cell.
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Import lines for the generated module (presumably emitted at the top of the
# generated code -- confirm in convert_variables_to_pystirng).
prefix = ['import pandas as pd', 'import numpy as np']
# Plain variables and functions to export.
iterative_variables = [selected_source_file_suffix_list, excluded_raw_table_names]
fn_variables = [get_tablename_from_raw_filename, get_rawrootid_from_raw_table_column]
pycode = convert_variables_to_pystirng(iterative_variables = iterative_variables, 
                                       fn_variables = fn_variables, 
                                       prefix = prefix)
pypath = cohort_args['pypath']
# print(pypath)
# Mode 'w' replaces any existing file at pypath.
with open(pypath, 'w') as file: file.write(pycode)
# Show a clickable link to the generated file.
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

[INFO:2024-04-18 23:48:25,040:(config.py@58 datasets)]: PyTorch version 2.1.2+cu121 available.


# [Part 2] Process dfHumanRec with Pipeline


Given that `../pipeline/fn_humanrec/humanrec.py` is ready (it was written in Part 1, Step 9), 

we have a pipeline tool: `get_CohortLevel_df_Human2RawRecNum`.

It calls this `humanrec.py` internally to process our records. 

In [17]:
cohort_name

'RawData2023_CVSDeRxAug'

In [18]:
cohort_label

3

In [19]:
cohort_args['pypath']

'../pipeline/fn_humanrec/humanrec.py'

## [Step 1]: Call Pipeline to get df_Human

Here we use the pipeline function `get_CohortLevel_df_Human2RawRecNum` to get the df_Human.

It will load the tools from '../pipeline/fn_humanrec/humanrec.py' to do the process.

If you want to know the details of the pipeline_fn, you can go to the unpacking notebook to understand it more. 

In [20]:
from recfldtkn.pipeline_record import get_RawName_to_dfRawPath


OneCohort_Args

{'cohort_label': 3,
 'cohort_name': 'RawData2023_CVSDeRxAug',
 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}

In [21]:
rft_config = {
    'base_config': cohort_args, 
}

In [22]:
RawName_to_dfRawPath = get_RawName_to_dfRawPath(OneCohort_Args, rft_config)
RawName_to_dfRawPath

[INFO:2024-04-18 23:48:25,930:(pipeline_record.py@39 recfldtkn.pipeline_record)]: ../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/ <-- FolderPath
[INFO:2024-04-18 23:48:25,931:(pipeline_record.py@40 recfldtkn.pipeline_record)]: 41 <--- fullfile_list


{'MedAdministration': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_MedAdministration.csv',
 'CustomFood': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_CustomFood.csv',
 'RecentElogBGEntry': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_RecentElogBGEntry.csv',
 'ImportedMedication': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_ImportedMedication.csv',
 'ELogCarbsEntry': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_ELogCarbsEntry.csv',
 'CurriculumLessonProgressDetails': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_CurriculumLessonProgressDetails.csv',
 'CurriculumTopicProgressDetails': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_CurriculumTopicProgressDetails.csv',
 'CurriculumQuizResponse': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_CurriculumQuizResponse.csv',
 'SleepEntry': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_SleepEntry.csv',
 'ElogKetoneEntry': '../_Data/0-Data_Raw/RawData2023_CVSDeR

In [23]:
# Build df_Human for the selected cohort with the pipeline helper.
from recfldtkn.pipeline_record import get_CohortLevel_df_Human2RawRecNum

# Aliases for the values computed in earlier cells. Note that RawName_to_dfRaw
# here maps each raw table name to its CSV path (see the previous cell's
# output), not to a loaded DataFrame.
OneCohort_config = OneCohort_Args
RawName_to_dfRaw = RawName_to_dfRawPath

df_Human = get_CohortLevel_df_Human2RawRecNum(OneCohort_config, 
                                                rft_config, 
                                                RawName_to_dfRaw)


# Judging by the displayed output: PID, PatientID, one column per raw table,
# TotalRecNum and CohortLabel.
df_Human

[INFO:2024-04-18 23:48:25,972:(pipeline_record.py@92 recfldtkn.pipeline_record)]: {'cohort_label': 3, 'cohort_name': 'RawData2023_CVSDeRxAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}
[INFO:2024-04-18 23:48:26,033:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:48:26,035:(pipeline_record.py@166 recfldtkn.pipeline_record)]: MedAdministration:'path-../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_MedAdministration.csv' # (36, 3)
[INFO:2024-04-18 23:48:26,041:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:48:26,042:(pipeline_record.py@166 recfldtkn.pipeline_record)]: CustomFood:'path-../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_CustomFood.csv' # (26, 3)
[INFO:2024-04-18 23:48:26,076:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:48:26,077:(pipeline_record.py@166 recfldtkn.pipeline_record)]: RecentElogBGEntry:'path-../_Da

RawName,PID,PatientID,BPMeter,CurriculumCourseBadgeDetails,CurriculumCourseProgressDetails,CurriculumLessonProgressDetails,CurriculumQuizResponse,CurriculumQuizResult,CurriculumSurveyResponse,CurriculumTopicProgressDetails,...,PatientReminder,PatientTargetSegment,RecentElogBGEntry,Rx,SleepEntry,StepEntry,UserDetail,WeightGoal,TotalRecNum,CohortLabel
0,3000001,2572,,,,,,,,,...,,1.0,,5.0,,,1.0,,2,3
1,3000002,2624,,,1.0,1.0,,,,1.0,...,,1.0,,5.0,109.0,,1.0,,708,3
2,3000003,2636,,,,,,,,,...,21.0,1.0,,5.0,,,1.0,1.0,895,3
3,3000004,2689,3.0,2.0,11.0,52.0,1.0,1.0,32.0,1.0,...,6.0,1.0,10392.0,14.0,2.0,,1.0,2.0,127301,3
4,3000005,2715,1.0,2.0,11.0,52.0,2.0,2.0,32.0,1.0,...,18.0,1.0,38.0,14.0,425.0,,1.0,2.0,10496,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,3000065,3549,2.0,2.0,11.0,55.0,1.0,1.0,30.0,1.0,...,1.0,1.0,,5.0,1174.0,187.0,1.0,,5120,3
65,3000066,3552,2.0,,,,,,,,...,7.0,1.0,3.0,15.0,38.0,170.0,1.0,4.0,2590,3
66,3000067,3625,,,,,,,,,...,,1.0,,5.0,,,1.0,,52,3
67,3000068,3686,1.0,,,,,,,,...,,1.0,,5.0,,,1.0,,31,3


In [24]:
[i for i in RawName_to_dfRawPath]

['MedAdministration',
 'CustomFood',
 'RecentElogBGEntry',
 'ImportedMedication',
 'ELogCarbsEntry',
 'CurriculumLessonProgressDetails',
 'CurriculumTopicProgressDetails',
 'CurriculumQuizResponse',
 'SleepEntry',
 'ElogKetoneEntry',
 'ElogBPEntry',
 'PatientReminder',
 'PatientLabTestResult',
 'Patient',
 'PatientTargetSegment',
 'UserDetail',
 'PatientBloodGlucoseTargets',
 'CurriculumSurveyResponse',
 'ELogSymptomTracker',
 'FavoriteFood',
 'StepEntry',
 'WeightMeter',
 'WeightGoal',
 'ElogBGEntry',
 'ElogWeightEntry',
 'ELogExerciseEntry',
 'Rx',
 'ELogCommentEntry',
 'CurriculumQuizResult',
 'PatientLastExam',
 'PatientCarbTarget',
 'PatientObservationSummary',
 'CurriculumCourseProgressDetails',
 'MedPrescription',
 'ELogFoodItem',
 'PatientMeal',
 'PatientMeter',
 'BPMeter',
 'PatientHeight',
 'CurriculumCourseBadgeDetails',
 'MedPrescriptionDaySchedule']

In [25]:


# Select the df_Human columns that are named after a raw table and cast them
# to float. NOTE: this overwrites those columns of df_Human in place.
columns = [i for i in RawName_to_dfRawPath if i in df_Human.columns] 
df_Human[columns] = df_Human[columns].astype(float)

## [Step 2] Check and Update excluded_raw_table_names

In [26]:
df_Human  

RawName,PID,PatientID,BPMeter,CurriculumCourseBadgeDetails,CurriculumCourseProgressDetails,CurriculumLessonProgressDetails,CurriculumQuizResponse,CurriculumQuizResult,CurriculumSurveyResponse,CurriculumTopicProgressDetails,...,PatientReminder,PatientTargetSegment,RecentElogBGEntry,Rx,SleepEntry,StepEntry,UserDetail,WeightGoal,TotalRecNum,CohortLabel
0,3000001,2572,,,,,,,,,...,,1.0,,5.0,,,1.0,,2,3
1,3000002,2624,,,1.0,1.0,,,,1.0,...,,1.0,,5.0,109.0,,1.0,,708,3
2,3000003,2636,,,,,,,,,...,21.0,1.0,,5.0,,,1.0,1.0,895,3
3,3000004,2689,3.0,2.0,11.0,52.0,1.0,1.0,32.0,1.0,...,6.0,1.0,10392.0,14.0,2.0,,1.0,2.0,127301,3
4,3000005,2715,1.0,2.0,11.0,52.0,2.0,2.0,32.0,1.0,...,18.0,1.0,38.0,14.0,425.0,,1.0,2.0,10496,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,3000065,3549,2.0,2.0,11.0,55.0,1.0,1.0,30.0,1.0,...,1.0,1.0,,5.0,1174.0,187.0,1.0,,5120,3
65,3000066,3552,2.0,,,,,,,,...,7.0,1.0,3.0,15.0,38.0,170.0,1.0,4.0,2590,3
66,3000067,3625,,,,,,,,,...,,1.0,,5.0,,,1.0,,52,3
67,3000068,3686,1.0,,,,,,,,...,,1.0,,5.0,,,1.0,,31,3


## [Step 3] Save ds_HumanRec as HFDS

In [27]:
import datasets
ds_HumanRec = datasets.Dataset.from_pandas(df_Human)
print(ds_HumanRec)

Dataset({
    features: ['PID', 'PatientID', 'BPMeter', 'CurriculumCourseBadgeDetails', 'CurriculumCourseProgressDetails', 'CurriculumLessonProgressDetails', 'CurriculumQuizResponse', 'CurriculumQuizResult', 'CurriculumSurveyResponse', 'CurriculumTopicProgressDetails', 'CustomFood', 'ELogCarbsEntry', 'ELogCommentEntry', 'ELogExerciseEntry', 'ELogFoodItem', 'ElogBGEntry', 'ElogBPEntry', 'ElogKetoneEntry', 'ElogWeightEntry', 'MedAdministration', 'MedPrescription', 'Patient', 'PatientBloodGlucoseTargets', 'PatientCarbTarget', 'PatientHeight', 'PatientLabTestResult', 'PatientLastExam', 'PatientMeal', 'PatientMeter', 'PatientObservationSummary', 'PatientReminder', 'PatientTargetSegment', 'RecentElogBGEntry', 'Rx', 'SleepEntry', 'StepEntry', 'UserDetail', 'WeightGoal', 'TotalRecNum', 'CohortLabel'],
    num_rows: 69
})


In [28]:
cohort_args['RecName']

'PRawRecNum'

In [29]:
# Save the dataset under <DATA_RFT>/<cohort_label>-<cohort_name>/<RecName>_data.
print(SPACE['DATA_RFT'])

full_cohort_name = f'{cohort_label}-{cohort_name}'
print(full_cohort_name)
# ------------------------------------------------------------------------- # 
# NOTE: this rebinds `path`, which earlier held the Cohort.yaml location.
path = os.path.join(SPACE['DATA_RFT'], full_cohort_name, cohort_args['RecName'] + '_data')
print(path)
ds_HumanRec.save_to_disk(path)
print(ds_HumanRec)

../_Data/1-Data_RFT
3-RawData2023_CVSDeRxAug
../_Data/1-Data_RFT\3-RawData2023_CVSDeRxAug\PRawRecNum_data


Saving the dataset (0/1 shards):   0%|          | 0/69 [00:00<?, ? examples/s]

Dataset({
    features: ['PID', 'PatientID', 'BPMeter', 'CurriculumCourseBadgeDetails', 'CurriculumCourseProgressDetails', 'CurriculumLessonProgressDetails', 'CurriculumQuizResponse', 'CurriculumQuizResult', 'CurriculumSurveyResponse', 'CurriculumTopicProgressDetails', 'CustomFood', 'ELogCarbsEntry', 'ELogCommentEntry', 'ELogExerciseEntry', 'ELogFoodItem', 'ElogBGEntry', 'ElogBPEntry', 'ElogKetoneEntry', 'ElogWeightEntry', 'MedAdministration', 'MedPrescription', 'Patient', 'PatientBloodGlucoseTargets', 'PatientCarbTarget', 'PatientHeight', 'PatientLabTestResult', 'PatientLastExam', 'PatientMeal', 'PatientMeter', 'PatientObservationSummary', 'PatientReminder', 'PatientTargetSegment', 'RecentElogBGEntry', 'Rx', 'SleepEntry', 'StepEntry', 'UserDetail', 'WeightGoal', 'TotalRecNum', 'CohortLabel'],
    num_rows: 69
})


In [32]:
import shutil
import datasets
from recfldtkn.pipeline_record import pipeline_for_FldTkn
from recfldtkn.configfn import load_cohort_args
from recfldtkn.configfn import load_record_args
from recfldtkn.pipeline_record import pipeline_record
from recfldtkn.configfn import load_rft_config
from recfldtkn.loadtools import filter_with_cohort_label, load_ds_rec_and_info
from recfldtkn.pipeline_record import get_parentRecord_info
# Imported here as well (earlier cells import them too) so that this cell
# does not depend on those cells having been run first.
from recfldtkn.pipeline_record import get_RawName_to_dfRawPath
from recfldtkn.pipeline_record import get_CohortLevel_df_Human2RawRecNum

# Build and save df_Human for every cohort listed in cohort_label_list.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)


############################
cohort_label_list = [1, 2, 3]
############################
# Labels are compared as strings, so int and str labels both resolve.
cohort_label_to_cohort_name = {str(v['cohort_label']): k for k, v in cohort_args['CohortInfo'].items()}

# Settings that do not change between cohorts are prepared once.
SPACE = cohort_args['SPACE']
rft_config = {
    'base_config': cohort_args, 
}

for requested_label in cohort_label_list:

    # Raises KeyError if the requested label is not defined in CohortInfo.
    cohort_name = cohort_label_to_cohort_name[str(requested_label)]
    OneCohort_config = cohort_args['CohortInfo'][cohort_name]

    logger.info(f'==================== {cohort_name}: {requested_label} =====================')
    logger.info(OneCohort_config)

    # From here on use the label and name recorded in the config entry itself.
    cohort_label = OneCohort_config['cohort_label']
    cohort_name = OneCohort_config['cohort_name']

    # Step 1: df_Human
    # Output folder: <DATA_RFT>/<cohort_label>-<cohort_name>/<RecName>_data
    path_dfHuman = os.path.join(SPACE['DATA_RFT'], 
                                f'{cohort_label}-{cohort_name}', 
                                cohort_args['RecName'] + '_data')

    RawName_to_dfRawPath = get_RawName_to_dfRawPath(OneCohort_config, rft_config)
    RawName_to_dfRaw = RawName_to_dfRawPath

    df_Human = get_CohortLevel_df_Human2RawRecNum(OneCohort_config, 
                                                    rft_config, 
                                                    RawName_to_dfRaw)

    print(df_Human.shape)
    logger.info(f'Save df_Human to: {path_dfHuman}')

    ds_HumanRec = datasets.Dataset.from_pandas(df_Human)
    ds_HumanRec.save_to_disk(path_dfHuman)

[INFO:2024-04-18 23:50:45,685:(976063671.py@26 __main__)]: {'cohort_label': 1, 'cohort_name': 'RawData2022_CGM', 'FolderPath': '../_Data/0-Data_Raw/RawData2022_CGM/'}
[INFO:2024-04-18 23:50:45,687:(pipeline_record.py@39 recfldtkn.pipeline_record)]: ../_Data/0-Data_Raw/RawData2022_CGM/ <-- FolderPath
[INFO:2024-04-18 23:50:45,688:(pipeline_record.py@40 recfldtkn.pipeline_record)]: 32 <--- fullfile_list
[INFO:2024-04-18 23:50:45,688:(pipeline_record.py@92 recfldtkn.pipeline_record)]: {'cohort_label': 1, 'cohort_name': 'RawData2022_CGM', 'FolderPath': '../_Data/0-Data_Raw/RawData2022_CGM/'}
[INFO:2024-04-18 23:50:45,690:(pipeline_record.py@131 recfldtkn.pipeline_record)]: '../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_ElogBGEntry.csv' # larger than 1GB
[INFO:2024-04-18 23:50:45,690:(pipeline_record.py@158 recfldtkn.pipeline_record)]: processing large file: '../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_ElogBGEntry.csv'


[INFO:2024-04-18 23:50:45,709:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:51:19,652:(pipeline_record.py@166 recfldtkn.pipeline_record)]: ElogBGEntry:'path-../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_ElogBGEntry.csv' # (1293, 3)
[INFO:2024-04-18 23:51:19,720:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:51:19,723:(pipeline_record.py@166 recfldtkn.pipeline_record)]: ElogBPEntry:'path-../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_ElogBPEntry.csv' # (558, 3)
[INFO:2024-04-18 23:51:19,745:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientId
[INFO:2024-04-18 23:51:19,746:(pipeline_record.py@166 recfldtkn.pipeline_record)]: WeightMeter:'path-../_Data/0-Data_Raw/RawData2022_CGM/05_12_2022_WeightMeter.csv' # (2, 3)
[INFO:2024-04-18 23:51:19,783:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:51:19,785:(pipeline_record.py@166 recfldt

(7379, 36)


Saving the dataset (0/1 shards):   0%|          | 0/7379 [00:00<?, ? examples/s]

[INFO:2024-04-18 23:51:22,249:(976063671.py@26 __main__)]: {'cohort_label': 2, 'cohort_name': 'RawData2023_CVSTDCAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSTDCAug/'}
[INFO:2024-04-18 23:51:22,252:(pipeline_record.py@39 recfldtkn.pipeline_record)]: ../_Data/0-Data_Raw/RawData2023_CVSTDCAug/ <-- FolderPath
[INFO:2024-04-18 23:51:22,253:(pipeline_record.py@40 recfldtkn.pipeline_record)]: 41 <--- fullfile_list
[INFO:2024-04-18 23:51:22,253:(pipeline_record.py@92 recfldtkn.pipeline_record)]: {'cohort_label': 2, 'cohort_name': 'RawData2023_CVSTDCAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSTDCAug/'}
[INFO:2024-04-18 23:51:22,333:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:51:22,335:(pipeline_record.py@166 recfldtkn.pipeline_record)]: ElogBPEntry:'path-../_Data/0-Data_Raw/RawData2023_CVSTDCAug/08_23_2023_ElogBPEntry.csv' # (712, 3)
[INFO:2024-04-18 23:51:22,337:(pipeline_record.py@121 recfldtkn.pipeline_record)]: '../_Da

(4256, 41)


Saving the dataset (0/1 shards):   0%|          | 0/4256 [00:00<?, ? examples/s]

[INFO:2024-04-18 23:51:34,814:(976063671.py@26 __main__)]: {'cohort_label': 3, 'cohort_name': 'RawData2023_CVSDeRxAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}
[INFO:2024-04-18 23:51:34,817:(pipeline_record.py@39 recfldtkn.pipeline_record)]: ../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/ <-- FolderPath
[INFO:2024-04-18 23:51:34,817:(pipeline_record.py@40 recfldtkn.pipeline_record)]: 41 <--- fullfile_list
[INFO:2024-04-18 23:51:34,818:(pipeline_record.py@92 recfldtkn.pipeline_record)]: {'cohort_label': 3, 'cohort_name': 'RawData2023_CVSDeRxAug', 'FolderPath': '../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/'}
[INFO:2024-04-18 23:51:34,880:(pipeline_record.py@63 recfldtkn.pipeline_record)]: id_column: PatientID
[INFO:2024-04-18 23:51:34,881:(pipeline_record.py@166 recfldtkn.pipeline_record)]: MedAdministration:'path-../_Data/0-Data_Raw/RawData2023_CVSDeRxAug/08_23_2023_MedAdministration.csv' # (36, 3)
[INFO:2024-04-18 23:51:34,893:(pipeline_record.py@63 recfldtkn.pipeline_r

(69, 40)


Saving the dataset (0/1 shards):   0%|          | 0/69 [00:00<?, ? examples/s]

# [Part 3] Select Patients with PID

In [None]:
# from recfldtkn.loadtools import load_ds_rec_and_info
# ds_Human, _ = load_ds_rec_and_info(cohort_args['RecName'], cohort_args)
# df_Human = ds_Human.to_pandas()
# df_Human
# RootID = cohort_args['RootID']
# RawRootID = cohort_args['RawRootID']
# PID_list = [1013405, 1002538, 1022279, 1004432, 1016032, 1032308, 1031363, 1001133, 1007343, 1026067]
# print(PID_list)
# def get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args):
#     d = {}
#     cohort_config = cohort_args['CohortInfo'][cohort_name]
#     FolderPath = cohort_config['FolderPath']
#     chunk_size = 100000

#     file_list = sorted(os.listdir(FolderPath))
#     file_list = [i for i in file_list if 'csv' in i]
#     for file in file_list:
#         full_file = os.path.join(FolderPath, file)
#         li = [chunk[chunk[RawRootID] == RawRootID_sample] 
#               for chunk in pd.read_csv(full_file, chunksize=chunk_size, low_memory=False)]
#         result = pd.concat(li)
#         logger.info(f'{result.shape}: {file}')
#         if len(result) == 0: continue
#         d[file] = result
        
#     return d

# for PID_sample in PID_list:
#     print('\n======== PID_sample:', PID_sample, '========')
#     PIDInfo_dict = df_Human[df_Human[RootID] == PID_sample].iloc[0].to_dict()
#     RawRootID_sample = PIDInfo_dict[RawRootID]
#     d = get_patient_records_Ri(RawRootID_sample, RawRootID, cohort_args)
#     folder = os.path.join(SPACE['DATA_RAW'], 'patient_sample', str(PID_sample))
#     if os.path.exists(folder) == False: os.makedirs(folder)
#     for file, df in d.items():
#         df.to_csv(os.path.join(folder, file), index = False)
#         print(file, df.shape)