# Space

In [4]:
import os
import logging
import pandas as pd 
from IPython.display import display, HTML
# Locate the workspace root: the notebook may be started from any sub-folder of
# the workspace, so cut the current path at the first occurrence of KEY and make
# that folder the working directory. Relative paths used later in this notebook
# (e.g. the raw_data_path entries) are resolved against it.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard, split() returns the whole cwd and KEY is glued onto its
    # end, producing a path that does not exist and an opaque chdir error.
    raise FileNotFoundError(
        f"Cannot locate the '{KEY}' folder: the current directory {_cwd!r} "
        f"does not contain '{KEY}' in its path."
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
os.chdir(WORKSPACE_PATH)
import sys
from proj_space import PROJECT, TaskName, SPACE
# Add SPACE['CODE_FN'] to the import search path. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
# Folder of the recfldtkn configuration; it is passed to load_cohort_args in Step 2.
recfldtkn_config_path = os.path.join(SPACE['CODE_RFT'], 'config_recfldtkn')

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')


# Part 1: Prepare Record Yaml

Expected outcome:

You will understand the raw data

You will get a record yaml file. 

## [Step 1]: assign RecName

Motivation: We utilize a yaml file to store information pertaining to our records or recommendations ('Rec'). In order to efficiently link our 'Rec' with the corresponding yaml file, it is necessary to assign a descriptive RecName. This name serves as an identifier, allowing for easy association and retrieval of information. Please select an appropriate RecName for this purpose.

Aim: assign RecName

Input: yaml file names 

Output: RecName

Instruction: 
change RecName for specific Rec :```RecName = 'P'# <-------- select your yaml file name```

In [6]:
###########################
RecName = 'P'# <-------- select your yaml file name
###########################

## [Step 2] Get Necessary Args
Motivation: Prepare necessary Args for future development.

Aim: get cohort_args and record_args

Input: ```recfldtkn_config_path, SPACE,RecName, cohort_args```

Output: ```cohort_args,record_args```

Instruction: 
Run following code.



In [7]:
from recfldtkn.configfn import load_cohort_args
from recfldtkn.configfn import load_record_args

# Load the cohort-level configuration from the recfldtkn config folder.
# Later cells read the keys 'CohortInfo', 'RecName', 'RootID' and 'RawRootID' from it.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)
# Load the record-level configuration for the record chosen in Step 1 (RecName).
# Later cells read the keys 'yaml_file_path', 'pypath', 'CohortInfo' and 'RecName' from it.
record_args = load_record_args(RecName, cohort_args)

## [Step 3] Create and Update Record Yaml
Motivation: To store configuration and information.

Aim: create Yaml file for rec

Input: information about raw_data_path, RawRootID, RecNumColumn and raw_columns

Output: Yaml file

Instruction: 
1. change COHORT_NAME_XXXXXX
2. change raw_data_path
3. change RawRootID
4. change raw_columns



**template**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TableBase sources from different cohorts.
CohortInfo: # Cohort
  COHORT_NAME_XXXXXX: # <---- change this.
    TABLE1: 
      raw_data_path: $DATA_RAW$/Cohort_Folder_XXXXXXX/raw_table_file_name1_XXXXXXXXX.csv
      RawRootID: XXXXXXXXX
      RecNumColumn: XXXXXXXXX # in Human2RecNum, the related raw table name
      raw_columns: 
        - XXXXX # <--- to update during RecAttr 
        - XXXXX

    TABLE2:  # <-------- IN MOST OF THE TIME, WE DON'T NEED TABLE2.
      raw_data_path: $DATA_RAW$/Cohort_Folder_XXXXXXX/raw_table_file_name2_XXXXXXXXX.csv
      RawRootID: XXXXXXXXX
      RecNumColumn: XXXXXXXXX # in Human2RecNum, the related raw table name
      raw_columns: 
        - XXXXX # <--- to update during RecAttr 
        - XXXXX
```

**write and update your own yaml**


```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TableBase sources from different cohorts.
CohortInfo: # Cohort
  
  RawData2022_CGM:
    TableFile1: 
      raw_data_path: '$DATA_RAW$/RawData2022_CGM/05_02_2022_Patient.csv'
      RawRootID: 'PatientID'  # for merging purpose
      RecNumColumn: 'Patient' # Column in PRawRecNum
      raw_columns: 
    TableFile2: 
      raw_data_path: '$DATA_RAW$/RawData2022_CGM/05_02_2022_UserDetail.csv'
      RawRootID: 'UserID'
      RecNumColumn: 'UserDetail'
      raw_columns: 

          
  RawData2023_CVSTDCAug:
    TableFile1: 
      raw_data_path: '$DATA_RAW$/RawData2023_CVSTDCAug/08_23_2023_Patient.csv'
      RawRootID: 'PatientID'
      RecNumColumn: 'Patient' # Column in PRawRecNum
      raw_columns: 
    TableFile2: 
      raw_data_path: '$DATA_RAW$/RawData2023_CVSTDCAug/08_23_2023_UserDetail.csv'
      RawRootID: 'UserID'
      RecNumColumn: 'UserDetail'
      raw_columns: 


  RawData2023_CVSDeRxAug:
    TableFile1: 
      raw_data_path: '$DATA_RAW$/RawData2023_CVSDeRxAug/08_23_2023_Patient.csv'
      RawRootID: 'PatientID'
      RecNumColumn: 'Patient' # Column in PRawRecNum
      raw_columns: 
    TableFile2: 
      raw_data_path: '$DATA_RAW$/RawData2023_CVSDeRxAug/08_23_2023_UserDetail.csv'
      RawRootID: 'UserID'
      RecNumColumn: 'UserDetail'
      raw_columns: 
```

In [10]:
# Create a HTML link and display it
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

## [Step 4] Update Yaml for record's Meta


In [11]:
# Create a HTML link and display it
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

**template**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
RecName: XXX    # name of the record.
RecID: XXX   # ID of the record. not necessary to be like this: RecID = RecName + 'ID'.
RawRecID: 
  - XXX
RecIDChain: 
  - XXX
ParentRecName:  # if no parent record, set it to empty. 
RecDT:          # if no RecDT, set it to empty. 
```

**your yaml**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 
RecName: P    # name of the record.
RecID: PID   # ID of the record. not necessary to be like this: RecID = RecName + 'ID'.
RawRecID: 
  - PatientID
RecIDChain: 
  - P
  - PatientID
ParentRecName:  # if no parent record, set it to empty. 
RecDT:          # if no RecDT, set it to empty. 
```

## [Step 5] Select One Cohort

Motivation: We want to choose one cohort and test our code in this one cohort.

Aim: Specify a cohort

Input: Cohort Yaml

Output: Cohort name and Cohort label of the cohort we want to choose.

Instruction: Change ```args_information = ['--cohort_label', '1'] ```

In [12]:
################### in notebook ###################
args_information = ['--cohort_label', '1']
###################################################

import argparse
my_parser = argparse.ArgumentParser(description='Process Input.')

# Add the arguments
my_parser.add_argument('--cohort_name',
                    metavar='cohort_name',
                    default = None, 
                    type=str,
                    help='the cohort_name to process')

my_parser.add_argument('--cohort_label',
                    metavar='cohort_label',
                    default = None, 
                    type=str,
                    help='the label for cohort_name to process')


args = my_parser.parse_args(args_information)

# Select the cohort by its label (takes precedence) or, if no label is given,
# by its name. Each entry of cohort_args['CohortInfo'] carries both keys.
cohort_infos = list(cohort_args['CohortInfo'].values())
if args.cohort_label is not None:
    cohort_label = int(args.cohort_label)
    matched_configs = [v for v in cohort_infos if v['cohort_label'] == cohort_label]
elif args.cohort_name is not None:
    matched_configs = [v for v in cohort_infos if v['cohort_name'] == args.cohort_name]
else:
    raise ValueError("Provide either '--cohort_label' or '--cohort_name' in args_information.")

# Fail with a readable message instead of a bare IndexError when nothing matches.
if not matched_configs:
    raise ValueError(
        f"No cohort matches {args_information}; "
        f"available cohorts: {list(cohort_args['CohortInfo'])}"
    )

cohort_config = matched_configs[0]
if args.cohort_label is None:
    # selected by name: take the label from the matched configuration
    cohort_label = cohort_config['cohort_label']
cohort_name = cohort_config['cohort_name']
print('\n========== cohort_config ==========')
# print(cohort_config)
print(cohort_label, cohort_name)


1 RawData2022_CGM


## [Step 6] df_Human, df_Prt and Save them in record_args for the selected OneCohort
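Motivation: Later steps merge the raw records with their parent record, so we load the Human table (`df_Human`) and the parent-record table (`df_Prt`) for the selected cohort and keep the parent information in `record_args`.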

Aim: Update record_args

Input:

Output: record_args['df_Prt'], 

Instruction:
1. Remember to restart the notebook to fully load the updated yaml files.
2. Run following code 

In [17]:
#######################
cohort_label_list = [cohort_label]
#######################

In [21]:
from recfldtkn.loadtools import filter_with_cohort_label, load_ds_rec_and_info
from recfldtkn.pipeline_record import get_parent_record_information

# Name of the root-ID column; it is used below to align df_Human with df_Prt.
RootID = cohort_args['RootID']

# Load the Human dataset for the chosen cohort label list; the second return
# value is not used in this notebook.
ds_Human, _ = load_ds_rec_and_info(
    cohort_args['RecName'],
    cohort_args,
    cohort_label_list = cohort_label_list,
)
df_Human = ds_Human.to_pandas()

# Parent-record table and its args, then restrict the parent table to the cohort.
prt_record_args, df_Prt, df_Human = get_parent_record_information(record_args, cohort_args, df_Human)
df_Prt = filter_with_cohort_label(df_Prt, cohort_label, cohort_args)

# Keep only the humans whose RootID still appears in the filtered parent table.
prt_root_ids = df_Prt[RootID].to_list()
in_prt_mask = df_Human[RootID].isin(prt_root_ids)
df_Human = df_Human[in_prt_mask].reset_index(drop = True)

# Store the parent information in record_args; the RecAttr code in Part 3 reads it back.
record_args['df_Prt'] = df_Prt
record_args['prt_record_args'] = prt_record_args

In [22]:
df_Human.head()

Unnamed: 0,PID,PatientID,ActivityTracker,BPMeter,CGMConnection,CustomFood,ELogCarbsEntry,ELogCommentEntry,ELogExerciseEntry,ELogFoodItem,...,PatientMeter,PatientWeeklyChallenge,QuestionResponse,SleepEntry,StepEntry,UserDetail,WeightGoal,WeightMeter,TotalRecNum,CohortLabel
0,1000001,6,,1.0,,,11.0,4.0,22.0,19.0,...,,2.0,,8.0,8.0,1.0,1.0,,397.0,1
1,1000002,10,,,,,2.0,,,2.0,...,,,,,,1.0,,,7.0,1
2,1000003,11,,,,,,,41.0,,...,,,,34.0,,1.0,,,76.0,1
3,1000004,13,,,,,2.0,,,3.0,...,,,,,,1.0,,,8.0,1
4,1000005,14,,,,,6.0,1.0,1058.0,7.0,...,,7.0,,3317.0,273.0,1.0,,,4731.0,1


In [23]:
df_Prt.head()

Unnamed: 0,PID,PatientID
0,1000001,6
1,1000002,10
2,1000003,11
3,1000004,13
4,1000005,14


## [Step 7] OneCohortRec_args

In [24]:
print(cohort_name)
# Record-level yaml entry of the selected cohort. The two bookkeeping keys are
# written into this entry itself, which is why later loops over it skip
# 'cohort_name' and 'cohort_label'.
OneCohortRec_args = record_args['CohortInfo'][cohort_name]
for _key, _value in (('cohort_name', cohort_name), ('cohort_label', cohort_label)):
    OneCohortRec_args[_key] = _value
OneCohortRec_args

RawData2022_CGM


{'TableFile1': {'raw_data_path': '../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_Patient.csv',
  'RawRootID': 'PatientID',
  'RecNumColumn': 'Patient',
  'raw_columns': ['PatientID',
   'MaritalStatusID',
   'RaceID',
   'EthinicityID',
   'LevelOfEducationID',
   'MRSegmentID',
   'MRSegmentModifiedDateTime',
   'DiseaseType',
   'DiseaseCombinationID',
   'PAPEligibility',
   'PAPStatus',
   'PAPStatusReason']},
 'TableFile2': {'raw_data_path': '../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_UserDetail.csv',
  'RawRootID': 'UserID',
  'RecNumColumn': 'UserDetail',
  'raw_columns': ['UserID',
   'Gender',
   'YearOfBirth',
   'Language',
   'Country',
   'RoleID',
   'StatusID',
   'StatusReason',
   'IsTrainingSkipped',
   'TrainingCompletedDate',
   'ActivationDate',
   'UserTimeZoneOffset',
   'UserTimeZone',
   'Description']},
 'cohort_name': 'RawData2022_CGM',
 'cohort_label': 1}

In [23]:
# Check that every raw table file of the selected cohort exists and list its
# columns, so the useful ones can be chosen in Step 8.
source_path_not_existence_flag = 0   # number of raw_data_path entries not found on disk
for tablename, tableinfo in OneCohortRec_args.items():
    # 'cohort_name' / 'cohort_label' are bookkeeping keys, not tables
    if tablename in ['cohort_name', 'cohort_label']: continue
    filename = tableinfo['raw_data_path']
    print(filename)
    if not os.path.exists(filename):
        # name the missing file explicitly instead of only counting it
        source_path_not_existence_flag += 1
        print(f'=== NOT FOUND: {filename} (table: {tablename})')
        continue

    if filename.endswith('.csv'):
        # nrows=0: only the header is needed to list the columns
        df = pd.read_csv(filename, nrows=0)
    elif filename.endswith('.p'):
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(filename)
    else:
        print(f'=== SKIPPED (unsupported file type): {filename}')
        continue

    raw_tables_columns = list(df.columns)
    print('\n=======================')
    print(filename)
    for i in raw_tables_columns:
        print('-', i)
    print('=======================\n\n')

if source_path_not_existence_flag > 0:
    print(f'=== source_path_not_existence_flag: {source_path_not_existence_flag}')

OneCohortRec_args = record_args['CohortInfo'][cohort_name]
print('\n========== OneCohortRec_args ==========')
print(OneCohortRec_args)

../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_Patient.csv

../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_Patient.csv
- PatientID
- PatientRegistrationPageID
- AlertSystemEnabled
- PatientLastMobileActivationID
- MaritalStatusID
- RaceID
- EthinicityID
- IsEligible
- MedicationReminderEnabled
- AppointmentReminderEnabled
- PatientCreatedDate
- PatientModifiedDate
- RowVersionID
- MedicationViewMode
- InPersonTrainingStatus
- InPersonTrainingStatusDatetime
- InPersonTrainingContactNumber
- InPersonTrainingScheduledSlot
- IsRefillRequired
- IsRefillRequiredAnsweredDate
- PAPEligibility
- PAPStatus
- PAPStatusReason
- MRSegmentID
- RefillReason
- LastPushNotificationID
- LastPushNotificationDate
- IsTermsAgreed
- NextWeeklyReportGeneratedDatetime
- AllowMarketingMessages
- IsWeeklyChallengeStartShown
- WeeklyChallengeCount
- IsWeeklyChallengeTwelveWeekShown
- MRSegmentModifiedDateTime
- IsWeeklyChallengeEnabled
- NextWeeklyReportGeneratedTimeZoneOffset
- NextEmailReminderCheckDateTime
- 

## [Step 8] **Important** Select useful Raw Columns

Motivation: Based on understanding of the data, choose useful Raw Columns

AIM: Select useful Raw Columns

Input: ```raw_data_path```

Output:```raw_columns```

Instruction: Run the following code and choose raw_columns based on the specific project.

In [33]:
# Show the yaml entry of every table of the cohort; the two bookkeeping keys are not tables.
for tablename, tableinfo in OneCohortRec_args.items():
    if tablename not in ('cohort_name', 'cohort_label'):
        # one item per line, followed by the same blank lines as three separate prints
        print(tablename, tableinfo, '\n', sep='\n')

TableFile1
{'raw_data_path': '../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_Patient.csv', 'RawRootID': 'PatientID', 'RecNumColumn': 'Patient', 'raw_columns': ['PatientID', 'MaritalStatusID', 'RaceID', 'EthinicityID', 'LevelOfEducationID', 'MRSegmentID', 'MRSegmentModifiedDateTime', 'DiseaseType', 'DiseaseCombinationID', 'PAPEligibility', 'PAPStatus', 'PAPStatusReason']}


TableFile2
{'raw_data_path': '../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_UserDetail.csv', 'RawRootID': 'UserID', 'RecNumColumn': 'UserDetail', 'raw_columns': ['UserID', 'Gender', 'YearOfBirth', 'Language', 'Country', 'RoleID', 'StatusID', 'StatusReason', 'IsTrainingSkipped', 'TrainingCompletedDate', 'ActivationDate', 'UserTimeZoneOffset', 'UserTimeZone', 'Description']}




In [34]:
tablename_list =  [i for i in OneCohortRec_args if i not in ['cohort_name', 'cohort_label']]
tablename_list

['TableFile1', 'TableFile2']

In [35]:
tablename = tablename_list[0]
tableinfo = OneCohortRec_args[tablename]
raw_data_path = tableinfo['raw_data_path']
print(raw_data_path)

../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_Patient.csv


In [38]:
# After checking the columns, you will find some useful raw columns
###########################################################
df = pd.read_csv(raw_data_path, low_memory=False)

raw_columns = ['PatientID',
 'MaritalStatusID',
 'RaceID',
 'EthinicityID',
 'LevelOfEducationID',
 'MRSegmentID',
 'MRSegmentModifiedDateTime',
 'DiseaseType',
 'DiseaseCombinationID',
 'PAPEligibility',
 'PAPStatus',
 'PAPStatusReason']

# Guard against typos: warn about any selected column that is not in the raw
# file, before the list is copied into the yaml.
missing_columns = [col for col in raw_columns if col not in df.columns]
if missing_columns:
    print(f'=== WARNING: columns not found in {raw_data_path}: {missing_columns}')

for i in raw_columns:
    print('-', i)
###########################################################

- PatientID
- MaritalStatusID
- RaceID
- EthinicityID
- LevelOfEducationID
- MRSegmentID
- MRSegmentModifiedDateTime
- DiseaseType
- DiseaseCombinationID
- PAPEligibility
- PAPStatus
- PAPStatusReason


In [40]:
tablename = tablename_list[1]
tableinfo = OneCohortRec_args[tablename]
raw_data_path = tableinfo['raw_data_path']
print(raw_data_path)

../_Data/0-Data_Raw/RawData2022_CGM/05_02_2022_UserDetail.csv


In [43]:
# After checking the columns, you will find some useful raw columns
###########################################################
df = pd.read_csv(raw_data_path, low_memory=False)

raw_columns = ['UserID',
 'Gender',
 'YearOfBirth',
 'Language',
 'Country',
 'RoleID',
 'StatusID',
 'StatusReason',
 'IsTrainingSkipped',
 'TrainingCompletedDate',
 'ActivationDate',
 'UserTimeZoneOffset',
 'UserTimeZone',
 'Description']

# Guard against typos: warn about any selected column that is not in the raw
# file, before the list is copied into the yaml.
missing_columns = [col for col in raw_columns if col not in df.columns]
if missing_columns:
    print(f'=== WARNING: columns not found in {raw_data_path}: {missing_columns}')

for i in raw_columns:
    print('-', i)
###########################################################

- UserID
- Gender
- YearOfBirth
- Language
- Country
- RoleID
- StatusID
- StatusReason
- IsTrainingSkipped
- TrainingCompletedDate
- ActivationDate
- UserTimeZoneOffset
- UserTimeZone
- Description


## [Step 9] Update Yaml: OneCohort's Table raw_columns
Motivation: update Yaml file

Instruction: Copy the raw_columns printed above into the `raw_columns` attribute of the corresponding table in the yaml file.



In [44]:
# Create a HTML link and display it
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

**example**

```yaml
RawData2022_CGM:
    TableFile1: 
      raw_data_path: '$DATA_RAW$/RawData2022_CGM/05_02_2022_Patient.csv'
      RawRootID: 'PatientID'  # for merging purpose
      RecNumColumn: 'Patient' # Column in PRawRecNum
      raw_columns: 
        - PatientID
        - MaritalStatusID
        - RaceID
        - EthinicityID
        - LevelOfEducationID
        - MRSegmentID
        - MRSegmentModifiedDateTime
        - DiseaseType
        - DiseaseCombinationID
        - PAPEligibility
        - PAPStatus
        - PAPStatusReason
```

# [Part 2] Load HumanRecRaw


We have a pipeline fn to do it. 

If you are interested in how this pipeline works, look at these two pipeline functions: `get_df_HumanSelected_from_OneCohortRecArgs` and `get_HumanRawRec_for_HumanGroup`.

They use the information from the record yaml and then load the data as `df_HumanRawRec`.

Before this part, you must make your yaml file ready. 

In [46]:
print([tablename for tablename in OneCohortRec_args])

['TableFile1', 'TableFile2', 'cohort_name', 'cohort_label']


## [Step 1] Load the df_HumanRawRec
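Motivation: Load the raw records of the selected humans so that they can be inspected before writing the RawRec-to-RecAttr code.

Input: `df_Human`, `record_args`, `OneCohortRec_args`, `cohort_args`

Output: `df_HumanRawRec` (for the first `index_group` only)

Instruction: Run the following code.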

In [28]:
from recfldtkn.pipeline_record import get_df_HumanSelected_from_OneCohortRecArgs
from recfldtkn.pipeline_record import get_HumanRawRec_for_HumanGroup

# Select the humans of this cohort to process.
# NOTE(review): per the log message below, the helper keeps humans of the chosen
# CohortLabel with RecordNum > 0 — confirm against the helper's implementation.
df_HumanSelected = get_df_HumanSelected_from_OneCohortRecArgs(df_Human, record_args, OneCohortRec_args, cohort_args)
logger.info(f'{df_HumanSelected.shape} === df_HumanSelected <-- df_Human: selected from in CohortLabel {cohort_label}: {cohort_name} and with RecordNum > 0')

######################
# Passed to get_HumanRawRec_for_HumanGroup below.
# presumably a cache of already-loaded raw tables keyed by file path — TODO confirm
filepath_to_rawdf = {}
######################

# Name of the raw root-ID column, used to match raw records to the selected humans.
RawRootID = cohort_args['RawRootID']
for index_group, df_HumanGroup in df_HumanSelected.groupby('index_group'): 
    logger.info(f'current index_group: {index_group} ...')

    # ---------------------- this is the core part of the pipeline ----------------------
    # 7.1 get the df_HumanRawRec
    #     this function can be used independently to get the raw df_HumanRawRec. 
    df_HumanRawRec = get_HumanRawRec_for_HumanGroup(df_HumanGroup, OneCohortRec_args, RawRootID, filepath_to_rawdf)
    # keep only the raw records whose RawRootID belongs to a selected human
    index = df_HumanRawRec[RawRootID].isin(df_HumanSelected[RawRootID].to_list())
    df_HumanRawRec = df_HumanRawRec[index].reset_index(drop = True)
    logger.info(f'current df_HumanRawRec: {df_HumanRawRec.shape} ...')

    # stop after the first index_group: df_HumanRawRec of that group is what the next cells use
    break

[INFO:2024-02-11 18:36:12,769:(3388162799.py@5 __main__)]: (7296, 6) === df_HumanSelected <-- df_Human: selected from in CohortLabel 1: RawData2022_CGM and with RecordNum > 0
[INFO:2024-02-11 18:36:12,777:(3388162799.py@13 __main__)]: current index_group: 0 ...


[INFO:2024-02-11 18:36:13,296:(3388162799.py@21 __main__)]: current df_HumanRawRec: (7296, 25) ...


## [Step 2] Display df_HumanRawRec

In [29]:
df_HumanRawRec

Unnamed: 0,PatientID,MaritalStatusID,RaceID,EthinicityID,LevelOfEducationID,MRSegmentID,MRSegmentModifiedDateTime,DiseaseType,DiseaseCombinationID,PAPEligibility,...,Country,RoleID,StatusID,StatusReason,IsTrainingSkipped,TrainingCompletedDate,ActivationDate,UserTimeZoneOffset,UserTimeZone,Description
0,6,0.0,0,0,0,MR_6,1/15/2021 3:18:26 PM,1,0,,...,1,1,1,,True,11/21/2020 4:23:38 PM,11/21/2020 4:23:38 PM,330,Asia/Kolkata,
1,10,,0,0,0,MR_1,11/25/2020 10:08:01 PM,2,0,,...,1,1,1,,True,11/25/2020 10:07:54 PM,11/25/2020 10:07:54 PM,-240,America/New_York,
2,11,,0,0,0,MR_1,11/30/2020 1:43:22 PM,2,0,,...,1,1,2,Patient initiated stop,True,11/30/2020 1:43:22 PM,11/30/2020 1:43:22 PM,-300,America/New_York,
3,13,,0,0,0,MR_1,11/30/2020 4:28:36 PM,2,0,,...,1,1,1,,True,11/30/2020 4:28:34 PM,11/30/2020 4:28:34 PM,-420,America/Los_Angeles,
4,14,,0,0,0,MR_5.6,6/28/2021 11:42:25 AM,2,0,,...,1,1,1,,True,12/1/2020 6:35:59 PM,12/1/2020 6:35:59 PM,330,Asia/Kolkata,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291,14711,,0,0,0,MR_0,5/1/2022 2:29:13 PM,2,0,,...,1,1,1,,True,5/1/2022 2:29:12 PM,5/1/2022 2:29:12 PM,-240,America/New_York,
7292,14712,,0,0,0,MR_0,5/1/2022 2:47:54 PM,2,0,,...,1,1,1,,True,5/1/2022 2:47:52 PM,5/1/2022 2:47:52 PM,-300,America/Chicago,
7293,14713,,0,0,0,MR_0,5/1/2022 3:47:44 PM,2,0,,...,1,1,1,,True,5/1/2022 3:47:43 PM,5/1/2022 3:47:43 PM,-300,America/Chicago,
7294,14718,,0,0,0,MR_0,5/1/2022 6:45:25 PM,2,0,,...,1,1,1,,True,5/1/2022 6:45:25 PM,5/1/2022 6:45:25 PM,-300,America/Chicago,


# [Part 3] HumanRecAttr

In [30]:
# Create a HTML link and display it
path = record_args['pypath']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

## [Step 1] **Important** RawRec_to_RecAttr Code

Motivation: To prepare and organize the raw data into a structured format that is suitable for further analysis or processing.

AIM: This determines how you map the raw_columns to clean attr_columns.

Input: df_Prt and df_HumanRawRec from last step

Output: df with clean attribute

Instruction: Depending on the specific project, usually we will need the last three steps.
Refer to the Welldoc example below.




In [31]:
#------------------- Template
# df = df_HumanRawRec

# -. filter out the records we don't need (optional) 

# -. create a new column for raw record id (optional)

# -. have a check that the raw record id is unique

# -. update datetime columns
   
# -. select a DT. TODO: you might need to localize the datetime to local timezone. 

# -. merge with the parent record (a must except Human Records)

# -. sort the table by Parent IDs and DT

# -. create a new column for RecID

# df = df 
#-------------------

In [32]:
#-------------------
# Work on a copy: with a plain alias, the datetime assignments below would
# overwrite the raw string values inside df_HumanRawRec itself, so the frame
# displayed in Part 2 would no longer match what is in memory.
df = df_HumanRawRec.copy()

# 1. filter out the records we don't need (optional) 

# 2. create a new column for raw record id (optional)

# 3. update datetime columns 
column = 'ActivationDate'
df[column] = pd.to_datetime(df[column], format='mixed')
column = 'MRSegmentModifiedDateTime'
df[column] = pd.to_datetime(df[column], format = 'mixed')

# 4. select a DT as the RecDT

# 5. merge with the parent record 
print(df.shape)
df_Prt = record_args['df_Prt']
print(df_Prt.shape)
prt_record_args = record_args['prt_record_args']
# inner join: raw records without a parent row are dropped
df_merged = pd.merge(df_Prt, df, how = 'inner', on = prt_record_args['RawRecID'])
print(df_merged.shape)
df = df_merged

# 6. sort the table by RootID (no RecDT is selected for this record)
RootID = cohort_args['RootID']
df = df.sort_values(RootID).reset_index(drop = True)
#-------------------
df

(7296, 25)
(7379, 2)
(7296, 26)


Unnamed: 0,PID,PatientID,MaritalStatusID,RaceID,EthinicityID,LevelOfEducationID,MRSegmentID,MRSegmentModifiedDateTime,DiseaseType,DiseaseCombinationID,...,Country,RoleID,StatusID,StatusReason,IsTrainingSkipped,TrainingCompletedDate,ActivationDate,UserTimeZoneOffset,UserTimeZone,Description
0,1000001,6,0.0,0,0,0,MR_6,2021-01-15 15:18:26,1,0,...,1,1,1,,True,11/21/2020 4:23:38 PM,2020-11-21 16:23:38,330,Asia/Kolkata,
1,1000002,10,,0,0,0,MR_1,2020-11-25 22:08:01,2,0,...,1,1,1,,True,11/25/2020 10:07:54 PM,2020-11-25 22:07:54,-240,America/New_York,
2,1000003,11,,0,0,0,MR_1,2020-11-30 13:43:22,2,0,...,1,1,2,Patient initiated stop,True,11/30/2020 1:43:22 PM,2020-11-30 13:43:22,-300,America/New_York,
3,1000004,13,,0,0,0,MR_1,2020-11-30 16:28:36,2,0,...,1,1,1,,True,11/30/2020 4:28:34 PM,2020-11-30 16:28:34,-420,America/Los_Angeles,
4,1000005,14,,0,0,0,MR_5.6,2021-06-28 11:42:25,2,0,...,1,1,1,,True,12/1/2020 6:35:59 PM,2020-12-01 18:35:59,330,Asia/Kolkata,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291,1007292,14711,,0,0,0,MR_0,2022-05-01 14:29:13,2,0,...,1,1,1,,True,5/1/2022 2:29:12 PM,2022-05-01 14:29:12,-240,America/New_York,
7292,1007293,14712,,0,0,0,MR_0,2022-05-01 14:47:54,2,0,...,1,1,1,,True,5/1/2022 2:47:52 PM,2022-05-01 14:47:52,-300,America/Chicago,
7293,1007294,14713,,0,0,0,MR_0,2022-05-01 15:47:44,2,0,...,1,1,1,,True,5/1/2022 3:47:43 PM,2022-05-01 15:47:43,-300,America/Chicago,
7294,1007295,14718,,0,0,0,MR_0,2022-05-01 18:45:25,2,0,...,1,1,1,,True,5/1/2022 6:45:25 PM,2022-05-01 18:45:25,-300,America/Chicago,


## [Step 2] Pin Down the Attr Cols and Update Them in the Yaml

Motivation: Choose the final attr cols 

Aim: Update the final attr cols in the Yaml file

Input: attr_cols

Output: Yaml file

Instruction: Change the following code according to the specific project.

**example**

```yaml
attr_cols:
  - PID
  - PatientID
  - YearOfBirth
  - ActivationDate
  - MRSegmentModifiedDateTime
  - UserTimeZone
  - UserTimeZoneOffset
  - Gender
  - MRSegmentID
  - DiseaseType
```

In [55]:
# Create a HTML link and display it
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

In [33]:
# attr_cols = [

# ###################################
# # ---- TOADD ATTRIBUTE COLUMNS ----
# ###################################

# ]

attr_cols = [
# RID
'PID', 'PatientID', 
 
# DT
'YearOfBirth', 'ActivationDate',
'MRSegmentModifiedDateTime', 
'UserTimeZone', 'UserTimeZoneOffset', # fields
    
# Values
'Gender',  'MRSegmentID', 'DiseaseType', 
# 'BPMeterConnected', 'WeightMeterConnected', 'PatientMeterConnected',

]

for i in attr_cols: print('-', i)

df[attr_cols].head()

- PID
- PatientID
- YearOfBirth
- ActivationDate
- MRSegmentModifiedDateTime
- UserTimeZone
- UserTimeZoneOffset
- Gender
- MRSegmentID
- DiseaseType


Unnamed: 0,PID,PatientID,YearOfBirth,ActivationDate,MRSegmentModifiedDateTime,UserTimeZone,UserTimeZoneOffset,Gender,MRSegmentID,DiseaseType
0,1000001,6,1990,2020-11-21 16:23:38,2021-01-15 15:18:26,Asia/Kolkata,330,2,MR_6,1
1,1000002,10,1980,2020-11-25 22:07:54,2020-11-25 22:08:01,America/New_York,-240,2,MR_1,2
2,1000003,11,1980,2020-11-30 13:43:22,2020-11-30 13:43:22,America/New_York,-300,1,MR_1,2
3,1000004,13,1980,2020-11-30 16:28:34,2020-11-30 16:28:36,America/Los_Angeles,-420,1,MR_1,2
4,1000005,14,1980,2020-12-01 18:35:59,2021-06-28 11:42:25,Asia/Kolkata,330,2,MR_5.6,2


## [Step 3] Write down RawRec_to_RecAttr_fn

Motivation: Saving the code as a RawRec_to_RecAttr_fn function gives clean, efficient, and maintainable code that can be easily shared and reused.

Aim: Save RawRec_to_RecAttr_fn

Input: df_HumanRawRec, df_Human, cohort_args, record_args, attr_cols

Output: RawRec_to_RecAttr_fn

Instruction: Copy the code from above and run it.

In [57]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables
import inspect

###########################
def RawRec_to_RecAttr_fn(df_HumanRawRec, df_Human, cohort_args, record_args, attr_cols):
    """Convert the raw records of one human group into the clean attribute table.

    Parameters
    ----------
    df_HumanRawRec : pandas.DataFrame
        Raw records. Must contain 'ActivationDate', 'MRSegmentModifiedDateTime'
        and the parent's RawRecID column(s). It is not modified.
    df_Human : pandas.DataFrame
        Not used by this function.
    cohort_args : dict
        Only cohort_args['RootID'] (name of the root-ID column) is read.
    record_args : dict
        record_args['df_Prt'] (parent table) and
        record_args['prt_record_args']['RawRecID'] (merge keys) are read.
    attr_cols : list of str
        Columns to keep in the result, in this order.

    Returns
    -------
    pandas.DataFrame
        The columns `attr_cols` of the merged table, sorted by RootID, with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a required column or an entry of `attr_cols` is missing.
    """
    #-------------------
    # Copy first: assigning the parsed datetime columns on the caller's frame
    # would silently overwrite its raw string values.
    df = df_HumanRawRec.copy()

    # 1. filter out the records we don't need (optional) 

    # 2. create a new column for raw record id (optional)

    # 3. update datetime columns 
    for column in ('ActivationDate', 'MRSegmentModifiedDateTime'):
        df[column] = pd.to_datetime(df[column], format='mixed')

    # 4. select a DT as the RecDT

    # 5. merge with the parent record (inner join drops records without a parent row)
    df_Prt = record_args['df_Prt']
    prt_record_args = record_args['prt_record_args']
    df = pd.merge(df_Prt, df, how = 'inner', on = prt_record_args['RawRecID'])

    # 6. sort the table by RootID (no RecDT is selected for this record)
    RootID = cohort_args['RootID']
    df = df.sort_values(RootID).reset_index(drop = True)
    #-------------------

    df_HumanRecFld = df[attr_cols].reset_index(drop = True)
    return df_HumanRecFld
###########################

RawRec_to_RecAttr_fn.fn_string = inspect.getsource(RawRec_to_RecAttr_fn)

## [Step 4] Save as the pipeline fn

Instruction:  Run the following code.


In [58]:
# Serialize RawRec_to_RecAttr_fn into the record's pipeline module, then load
# it back from that file so the next step tests the saved version.
prefix = ['import pandas as pd', 'import numpy as np']
pycode = convert_variables_to_pystirng(fn_variables = [RawRec_to_RecAttr_fn], prefix = prefix)
RecName = record_args['RecName']
pypath = record_args['pypath']
print(pypath)

# exist_ok replaces the check-then-create pattern; a bare file name has no
# folder to create, and os.makedirs('') would raise.
pydir = os.path.dirname(pypath)
if pydir: os.makedirs(pydir, exist_ok = True)
with open(pypath, 'w') as file: file.write(pycode)

module = load_module_variables(pypath)
RawRec_to_RecAttr_fn = module.MetaDict['RawRec_to_RecAttr_fn']

../pipeline/fn_recattr/P.py


## [Step 5] Test the saved pipeline fn

In [59]:

df_HumanRecAttr = RawRec_to_RecAttr_fn(df_HumanRawRec, df_Human, cohort_args, record_args, attr_cols)
df_HumanRecAttr

Unnamed: 0,PID,PatientID,YearOfBirth,ActivationDate,MRSegmentModifiedDateTime,UserTimeZone,UserTimeZoneOffset,Gender,MRSegmentID,DiseaseType
0,1000001,6,1990,2020-11-21 16:23:38,2021-01-15 15:18:26,Asia/Kolkata,330,2,MR_6,1
1,1000002,10,1980,2020-11-25 22:07:54,2020-11-25 22:08:01,America/New_York,-240,2,MR_1,2
2,1000003,11,1980,2020-11-30 13:43:22,2020-11-30 13:43:22,America/New_York,-300,1,MR_1,2
3,1000004,13,1980,2020-11-30 16:28:34,2020-11-30 16:28:36,America/Los_Angeles,-420,1,MR_1,2
4,1000005,14,1980,2020-12-01 18:35:59,2021-06-28 11:42:25,Asia/Kolkata,330,2,MR_5.6,2
...,...,...,...,...,...,...,...,...,...,...
7291,1007292,14711,1961,2022-05-01 14:29:12,2022-05-01 14:29:13,America/New_York,-240,2,MR_0,2
7292,1007293,14712,1957,2022-05-01 14:47:52,2022-05-01 14:47:54,America/Chicago,-300,2,MR_0,2
7293,1007294,14713,1962,2022-05-01 15:47:43,2022-05-01 15:47:44,America/Chicago,-300,1,MR_0,2
7294,1007295,14718,1965,2022-05-01 18:45:25,2022-05-01 18:45:25,America/Chicago,-300,1,MR_0,2
