# Space

In [None]:
# --- Notebook bootstrap: locate the workspace root, make it the working directory,
# --- load the project settings and configure logging.
import os
import logging
import pandas as pd
from IPython.display import display, HTML
# Folder name that marks the workspace root inside the current working directory path.
KEY = 'WorkSpace'
# Keep everything in the current path up to and including the first occurrence of KEY.
# NOTE(review): if KEY does not occur in the current path, KEY is simply appended to it,
# and the chdir below will then point at a directory that probably does not exist.
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
# print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
import sys
# Imported only after the chdir above; presumably proj_space is resolved relative to
# the workspace root — TODO confirm.
from proj_space import PROJECT, TaskName, SPACE
sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
# Folder holding the recfldtkn yaml configuration; passed to the config loaders later on.
recfldtkn_config_path = os.path.join(SPACE['CODE_RFT'], 'config_recfldtkn')

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')


# Part 1: Prepare Record Yaml

Expected outcome:

You will understand the raw data

You will get a record yaml file.

## [Step 1]: assign RecName

Motivation: We utilize a yaml file to store information pertaining to our records or recommendations ('Rec'). In order to efficiently link our 'Rec' with the corresponding yaml file, it is necessary to assign a descriptive RecName. This name serves as an identifier, allowing for easy association and retrieval of information. Please select an appropriate RecName for this purpose.

Aim: assign RecName

Input: yaml file names

Output: RecName

Instruction:
change RecName for specific Rec :```RecName = 'P'# <-------- select your yaml file name```

In [None]:
###########################
# RecName identifies the record being built; it is passed to load_record_args below.
RecName = 'P'# <-------- select your yaml file name
###########################

## [Step 2] Get Necessary Args
Motivation: Prepare necessary Args for future development.

Aim: get cohort_args and record_args

Input: ```recfldtkn_config_path, SPACE,RecName, cohort_args```

Output: ```cohort_args,record_args```

Instruction:
Run following code.



In [None]:
from recfldtkn.configfn import load_cohort_args, load_record_args

# Cohort-level configuration first; the record configuration is derived from it.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)
record_args = load_record_args(RecName, cohort_args)

# Show the loaded record configuration as the cell output.
record_args

## [Step 3] Create and Update Record Yaml
Motivation: To store configuration and information.

Aim: create Yaml file for rec

Input: information about raw_data_path, RawRootID, RecNumColumn and raw_columns

Output: Yaml file

Instruction:
1. change COHORT_NAME_XXXXXX
2. change raw_data_path
3. change RawRootID
4. change raw_columns



**template**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TableBase sources from different cohorts.
CohortInfo: # Cohort
  COHORT_NAME_XXXXXX: # <---- change this.
    {YourRawName1} :
      raw_data_path: $DATA_RAW$/Cohort_Folder_XXXXXXX/raw_table_file_name1_XXXXXXXXX.csv
      
    {YourRawName2} :  # <-------- MOST OF THE TIME, WE DON'T NEED TABLE2.
      raw_data_path: $DATA_RAW$/Cohort_Folder_XXXXXXX/raw_table_file_name2_XXXXXXXXX.csv


RawInfo:
  {YourRawName1}:
    RawRootID: XXXX
    RawName: YourRawName1 # in Human2RecNum, the related raw table name
    raw_columns:
  {YourRawName2}:
    RawRootID: XXXX
    RawName: YourRawName2 # in Human2RecNum, the related raw table name
    raw_columns:
```

In [None]:
# Build an HTML link to the record's yaml file and render it in the notebook.
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

## [Step 4] Update Yaml for record's Meta


In [None]:
# Build an HTML link to the record's yaml file and render it in the notebook.
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

**template**

```yaml
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
RecName: XXX    # name of the record.
RecID: XXX   # ID of the record. not necessary to be like this: RecID = RecName + 'ID'.
RawRecID:
  - XXX
RecIDChain:
  - XXX
ParentRecName:  # if no parent record, set it to empty.
RecDT:          # if no RecDT, set it to empty.
```

## [Step 5] Select One Cohort

Motivation: We want to choose one cohort and test our code in this one cohort.

Aim: Specify a cohort

Input: Cohort Yaml

Output: Cohort name and Cohort label of the cohort we want to choose.

Instruction: Change ```args_information = ['--cohort_label', '1'] ```

In [None]:
################### in notebook ###################
args_information = ['--cohort_label', '1']
###################################################

import argparse

# args_information stands in for the command-line argument list here.
my_parser = argparse.ArgumentParser(description='Process Input.')

# Add the arguments
# NOTE: --cohort_name is accepted but not used for the selection below;
# the cohort is looked up by its label only.
my_parser.add_argument('--cohort_name',
                    metavar='cohort_name',
                    default = None,
                    type=str,
                    help='the cohort_name to process')

my_parser.add_argument('--cohort_label',
                    metavar='cohort_label',
                    default = None,
                    type=str,
                    help='the label for cohort_name to process')


args = my_parser.parse_args(args_information)

# Fail with a readable message instead of the TypeError that int(None) would raise.
if args.cohort_label is None:
    raise ValueError("--cohort_label is required, e.g. args_information = ['--cohort_label', '1']")
cohort_label = int(args.cohort_label)

# Every cohort entry whose 'cohort_label' equals the requested label; the first one is used.
matching_cohort_configs = [v for v in cohort_args['CohortInfo'].values() if v['cohort_label'] == cohort_label]
if not matching_cohort_configs:
    # Fail with the list of known labels instead of a bare IndexError.
    known_cohort_labels = [v['cohort_label'] for v in cohort_args['CohortInfo'].values()]
    raise ValueError(f'No cohort with cohort_label {cohort_label}; available labels: {known_cohort_labels}')
cohort_config = matching_cohort_configs[0]
cohort_name = cohort_config['cohort_name']
print('\n========== cohort_config ==========')
# print(cohort_config)
print(cohort_label, cohort_name)

## [Step 6] df_Human, df_Prt and Save them in record_args for the selected OneCohort
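Motivation: The later steps merge each record with its parent record, so we load df_Human and df_Prt for the selected cohort and store df_Prt in record_args.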

Aim: Update record_args

Input: cohort_args, record_args, cohort_label

Output: record_args['df_Prt'], record_args['prt_record_args']

Instruction:
1. Remember to restart the notebook to fully load the updated yaml files.
2. Run following code

In [None]:
#######################
# The loaders below take a list of cohort labels; only the selected cohort is used here.
cohort_label_list = [cohort_label]
#######################

In [None]:
#
# cohort_args base_config

In [None]:
from recfldtkn.loadtools import filter_with_cohort_label, load_ds_rec_and_info
from recfldtkn.pipeline_record import get_parentRecord_info

# Name of the root ID column; used in later cells as the merge and filter key.
RootID = cohort_args['RootID']

# Load the record named by cohort_args['RecName'] for the selected cohort label(s).
# The second return value is not needed here.
ds_Human, _ = load_ds_rec_and_info(cohort_args['RecName'], cohort_args, cohort_label_list = cohort_label_list)
df_Human = ds_Human.to_pandas()

df_Human

In [None]:

#########--------
# Best-effort enrichment: attach the 'UserTimeZoneOffset' column of the 'P' record to
# df_Human as `user_tz`. When the 'P' record cannot be loaded or lacks the expected
# columns, df_Human is left unchanged and the notebook continues.
try:
    ds_P, _ = load_ds_rec_and_info('P', cohort_args, cohort_label_list = cohort_label_list)
    print(ds_P)
    df_P = ds_P.to_pandas()[[RootID, 'UserTimeZoneOffset']].rename(columns = {'UserTimeZoneOffset': 'user_tz'})
    df_Human = pd.merge(df_Human, df_P, how = 'left', on = RootID)
    print('SUCCESS ------> user_tz is available')
except Exception as err:
    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate; the reason is logged instead of being discarded.
    print("No user_timezone available")
    logger.info('user_tz was not added (%s: %s)', type(err).__name__, err)
#########--------


In [None]:
from recfldtkn.pipeline_record import get_parentRecord_info

In [None]:
record_args['ParentRecName']

In [None]:

################
# Wrap cohort_args under the 'base_config' key before handing it to get_parentRecord_info.
rft_config = {'base_config': cohort_args}

# The result is indexed by key below; three of its entries become notebook variables.
parentResult = get_parentRecord_info(record_args, rft_config, df_Human)
prt_record_args = parentResult['prt_record_args']
df_Prt = parentResult['df_Prt']
# Note: df_Human is replaced by the version returned from get_parentRecord_info.
df_Human = parentResult['df_Human']
##########################

df_Prt

In [None]:

# Print the shape before and after filtering df_Prt with the selected cohort label,
# so the effect of the filter is visible.
print(df_Prt.shape)
df_Prt = filter_with_cohort_label(df_Prt, cohort_label, cohort_args)
print(df_Prt.shape)

In [None]:

# Keep only the humans whose root ID appears in the filtered parent table.
is_in_parent_table = df_Human[RootID].isin(df_Prt[RootID].to_list())
df_Human = df_Human[is_in_parent_table].reset_index(drop = True)

# Store the parent table and its configuration on record_args.
record_args['prt_record_args'] = prt_record_args
record_args['df_Prt'] = df_Prt

In [None]:
df_Prt

In [None]:
cohort_args['RootIDLength']

In [None]:
rft_config['base_config']['RootID']

In [None]:
df_Human.head()

In [None]:
df_Prt.head()

## [Step 7] OneCohortRec_args

In [None]:
print(cohort_name)
# This is the same dict object that lives inside record_args (no copy is made), so the
# two keys added below also appear in record_args['CohortInfo'][cohort_name].
OneCohortRec_args = record_args['CohortInfo'][cohort_name]
OneCohortRec_args['cohort_name'] = cohort_name
OneCohortRec_args['cohort_label'] = cohort_label
OneCohortRec_args

In [None]:
# Print the column names of every raw table configured for the selected cohort and
# report the raw_data_path entries that do not exist on disk.
source_path_not_existence_flag = 0   # number of configured paths that are missing
missing_raw_data_paths = []          # the missing paths themselves, listed at the end
for tablename, tableinfo in OneCohortRec_args.items():
    # 'cohort_name' and 'cohort_label' are bookkeeping keys, not raw tables.
    if tablename in ['cohort_name', 'cohort_label']: continue
    filename = tableinfo['raw_data_path']
    print(filename)
    if not os.path.exists(filename):
        source_path_not_existence_flag += 1
        missing_raw_data_paths.append(filename)
        continue

    if filename.endswith(('.csv', '.csv.gz')):
        # nrows=0 parses only the header row; the same call handles plain and gzipped csv.
        df = pd.read_csv(filename, nrows=0)
    elif filename.endswith('.p'):
        # NOTE: unpickling can execute arbitrary code — only point this at trusted files.
        df = pd.read_pickle(filename)
    else:
        # Other file types are not inspected.
        continue

    raw_tables_columns = list(df.columns)
    print('\n=======================')
    print(filename)
    for i in raw_tables_columns:
        print('-', i)
    print('=======================\n\n')

if source_path_not_existence_flag > 0:
    print(f'=== source_path_not_existence_flag: {source_path_not_existence_flag}')
    # Name the missing files, not just their count.
    for missing_path in missing_raw_data_paths:
        print('- missing:', missing_path)

OneCohortRec_args = record_args['CohortInfo'][cohort_name]
print('\n========== OneCohortRec_args ==========')
print(OneCohortRec_args)

## [Step 8] **Important** Select useful Raw Columns

Motivation: Based on understanding of the data, choose useful Raw Columns

AIM: Select useful Raw Columns

Input: ```raw_data_path```

Output:```raw_columns```

Instruction: Run the following code and choose raw_columns based on the specific project.

In [None]:
# Show the configuration of each raw table; the two cohort bookkeeping keys are skipped.
for tablename, tableinfo in OneCohortRec_args.items():
    if tablename not in ('cohort_name', 'cohort_label'):
        print(tablename)
        print(tableinfo)
        print('\n')

In [None]:
# Names of the raw tables of this cohort (every key except the bookkeeping ones).
tablename_list = [
    key for key in OneCohortRec_args.keys()
    if key not in ('cohort_name', 'cohort_label')
]
tablename_list

In [None]:
# Work with the first raw table of the cohort; change the index to inspect another one.
tablename = tablename_list[0]
tableinfo = OneCohortRec_args[tablename]
raw_data_path = tableinfo['raw_data_path']
print(raw_data_path)

In [None]:
# After checking the columns, you will find some useful raw columns
###########################################################
# Only the first 10 rows are read: enough to see the column names quickly.
# NOTE(review): read_csv is used regardless of the file extension; a '.p' (pickle)
# raw_data_path would presumably need pd.read_pickle instead — confirm.
df = pd.read_csv(raw_data_path, low_memory=False, nrows = 10)

print(df.columns)

######################## <- you need to test this
# Fill in the column names you want to keep, chosen from the list printed above.
raw_columns = []
########################

print('Your selected raw columns are:')
for i in raw_columns:
    print('-', i)
###########################################################

## [Step 9] Update Yaml: OneCohort's Table raw_columns
Motivation: update Yaml file

Instruction: Copy the raw_columns selected above into the `raw_columns` attribute of the corresponding table in the yaml file.



In [None]:
# Build an HTML link to the record's yaml file and render it in the notebook.
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

**example**

```yaml
RawData2022_CGM:
    TableFile1:
      raw_data_path: '$DATA_RAW$/RawData2022_CGM/05_02_2022_Patient.csv'
      RawRootID: 'PatientID'  # for merging purpose
      RecNumColumn: 'Patient' # Column in PRawRecNum
      raw_columns:
        - PatientID
        - MaritalStatusID
        - RaceID
        - EthinicityID
        - LevelOfEducationID
        - MRSegmentID
        - MRSegmentModifiedDateTime
        - DiseaseType
        - DiseaseCombinationID
        - PAPEligibility
        - PAPStatus
        - PAPStatusReason
```

# [Part 2] Load HumanRecRaw


We have a pipeline fn to do it.

If you are interested in understanding this pipeline, look at these two functions:

`get_df_HumanSelected_from_OneCohortRecArgs` and `get_HumanRawRec_for_HumanGroup`

They take the `record.yaml` configuration and load the data as `df_HumanRawRec`.

Before this part, you must make your yaml file ready.

In [None]:
# List every key of the cohort configuration (tables plus bookkeeping keys).
print(list(OneCohortRec_args))

## [Step 1] Load the df_HumanRawRec
Motivation:

Input:

Output:

Instruction:

In [None]:
from recfldtkn.pipeline_record import (
    get_df_HumanSelected_from_OneCohortRecArgs,
    get_HumanRawRec_for_HumanGroup,
)

rec_config = record_args
RawName_to_RawConfig = record_args['RawInfo']

# Maps each raw table name to its raw_data_path. Despite the "dfRaw" in the name,
# the values are path strings, not DataFrames.
RawName_to_dfRaw = {
    RawName: table_cfg['raw_data_path']
    for RawName, table_cfg in OneCohortRec_args.items()
    if RawName not in ('cohort_name', 'cohort_label')
}

RawName_to_dfRaw

In [None]:
RawName_to_RawConfig

In [None]:

# Aliases under the argument names used by the pipeline functions.
OneCohort_config = OneCohortRec_args
base_config = cohort_args
df_HumanSelected = get_df_HumanSelected_from_OneCohortRecArgs(rec_config,
                                                                RawName_to_RawConfig,
                                                                OneCohort_config,
                                                                df_Human,
                                                                base_config)

logger.info(f'{df_HumanSelected.shape} === df_HumanSelected <-- df_Human: selected from in CohortLabel {cohort_label}: {cohort_name} and with RecordNum > 0')

df_HumanSelected.head()

In [None]:

# Name of the raw root ID column, used below to filter the loaded raw records.
RawRootID = cohort_args['RawRootID']
# df_HumanSelected carries an 'index_group' column; the loop walks its groups.
for index_group, df_HumanGroup in df_HumanSelected.groupby('index_group'):
    logger.info(f'current index_group: {index_group} ...')

    # ---------------------- this is the core part of the pipeline ----------------------
    # 7.1 get the df_HumanRawRec
    #     this function can be used independently to get the raw df_HumanRawRec.
    df_HumanRawRec = get_HumanRawRec_for_HumanGroup(df_HumanGroup,
                                                    RawName_to_RawConfig,
                                                    RawName_to_dfRaw,
                                                    base_config)
    # Keep only the raw records whose raw root ID belongs to a selected human.
    index = df_HumanRawRec[RawRootID].isin(df_HumanSelected[RawRootID].to_list())
    df_HumanRawRec = df_HumanRawRec[index].reset_index(drop = True)
    logger.info(f'current df_HumanRawRec: {df_HumanRawRec.shape} ...')

    # Only the first group is processed in this notebook.
    break

## [Step 2] Display df_HumanRawRec

In [None]:
df_HumanRawRec

# [Part 3] HumanRecAttr

In [None]:
# Build an HTML link to the record's python file (record_args['pypath']) and render it.
path = record_args['pypath']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

## [Step 1] **Important** RawRec_to_RecAttr Code

Motivation: To prepare and organize the raw data into a structured format that is suitable for further analysis or processing.

Aim: This determines how you map the raw_columns to clean attr_columns.

Input: df_Prt and df_HumanRawRec from last step

Output: df with clean attribute

Instruction: Depending on the specific project, we usually need the last three steps.
Refer to the Welldoc example below.




In [None]:
#------------------- Template
from recfldtkn.pipeline_record import post_record_process

# Work on a copy so the column assignment below does not modify df_HumanRawRec in place.
df = df_HumanRawRec.copy()

# -. filter out the records we don't need (optional)

# -. create a new column for raw record id (optional)

# -. have a check that the raw record id is unique

# -. update datetime columns
df['dod'] = pd.to_datetime(df['dod'])
# -. select a DT. TODO: you might need to localize the datetime to local timezone.

# x. merge with the parent record (a must except Human Records)
# xyz: merge parent, sort records, and generate RecID. 
df = post_record_process(df, record_args)
#-------------------
df

In [None]:
df.columns

## [Step 2] Pin Down the Attr Cols and Update Them in the Yaml

Motivation: Choose the final attr cols

Aim: Update the final attr cols in the Yaml file

Input: attr_cols

Output: Yaml file

Instruction: Change the following code according to the specific project.

**example**

```yaml
attr_cols:
  - PID
  - PatientID
  - YearOfBirth
  - ActivationDate
  - MRSegmentModifiedDateTime
  - UserTimeZone
  - UserTimeZoneOffset
  - Gender
  - MRSegmentID
  - DiseaseType
```

In [None]:
# Build an HTML link to the record's yaml file and render it in the notebook.
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
link = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(link))

In [None]:
# Empty template, kept for reference:
# attr_cols = [

# ###################################
# # ---- TOADD ATTRIBUTE COLUMNS ----
# ###################################

# ]

# Final attribute columns for this record.
attr_cols = [
    'PID',
    'subject_id',
    'gender',
    'anchor_age',
    'anchor_year',
    'anchor_year_group',
    'dod',
]

# Print them as a dash list.
for i in attr_cols:
    print('-', i)

# Preview the selected columns on the processed frame.
df[attr_cols].head()

## [Step 3] Write down RawRec_to_RecAttr_fn

Motivation: Saving the logic as RawRec_to_RecAttr_fn gives us clean, efficient, and maintainable code that can be easily shared and reused.

Aim: Save RawRec_to_RecAttr_fn

Input: df_HumanRawRec, df_Human, cohort_args, record_args, attr_cols

Output: RawRec_to_RecAttr_fn

Instruction: Copy the code from above and run it.

In [None]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables
import inspect

###########################
def RawRec_to_RecAttr_fn(df_HumanRawRec, df_Human, cohort_args, record_args, attr_cols):
    """Turn the raw records into the clean attribute table of this record.

    Args:
        df_HumanRawRec: DataFrame of raw records; must contain a 'dod' column.
            It is not modified (the function works on a copy).
        df_Human: not used by this implementation; kept so the signature stays unchanged.
        cohort_args: not used by this implementation; kept so the signature stays unchanged.
        record_args: record configuration, forwarded to post_record_process.
        attr_cols: list of column names to keep in the result.

    Returns:
        DataFrame holding the attr_cols columns, with a fresh default index.

    Raises:
        KeyError: if 'dod' or one of attr_cols is not a column of the frame.
    """

    #------------------- Template
    from recfldtkn.pipeline_record import post_record_process

    # Copy first: assigning a column on the caller's frame would change it in place.
    df = df_HumanRawRec.copy()

    # -. filter out the records we don't need (optional)

    # -. create a new column for raw record id (optional)

    # -. have a check that the raw record id is unique

    # -. update datetime columns
    df['dod'] = pd.to_datetime(df['dod'])
    # -. select a DT. TODO: you might need to localize the datetime to local timezone.

    # x. merge with the parent record (a must except Human Records)
    # xyz: merge parent, sort records, and generate RecID.
    df = post_record_process(df, record_args)
    #-------------------

    df_HumanRecAttr = df[attr_cols].reset_index(drop = True)
    return df_HumanRecAttr
###########################

# Attach the function's own source text to it as an attribute; presumably
# convert_variables_to_pystirng reads `fn_string` when generating the file — TODO confirm.
RawRec_to_RecAttr_fn.fn_string = inspect.getsource(RawRec_to_RecAttr_fn)

## [Step 4] Save as the pipeline fn

Instruction:  Run the following code.


In [None]:
# Lines placed at the top of the generated file, ahead of the function source.
prefix = ['import pandas as pd', 'import numpy as np']
pycode = convert_variables_to_pystirng(fn_variables = [RawRec_to_RecAttr_fn], prefix = prefix)
RecName = record_args['RecName']
pypath = record_args['pypath']
print(pypath)

# Create the target folder if needed. A bare file name has no directory part, and
# os.makedirs('') would raise, so that case is skipped; exist_ok avoids the
# check-then-create race.
pydir = os.path.dirname(pypath)
if pydir:
    os.makedirs(pydir, exist_ok = True)
with open(pypath, 'w') as file:
    file.write(pycode)

# Reload the function from the file just written, so the next cells exercise the
# saved version rather than the one defined in the notebook.
module = load_module_variables(pypath)
RawRec_to_RecAttr_fn = module.MetaDict['RawRec_to_RecAttr_fn']

## [Step 5] Test the saved pipeline fn

In [None]:
# Run the function reloaded from the saved file on the notebook's data and show the result.
df_HumanRecAttr = RawRec_to_RecAttr_fn(df_HumanRawRec, df_Human, cohort_args, record_args, attr_cols)
df_HumanRecAttr

# Save to RFT

In [None]:
from recfldtkn.pipeline_record import pipeline_record
from recfldtkn.configfn import load_rft_config

# Only this record is processed; its list is left empty.
record_to_recfldtkn_list = {RecName: []}
RecName_list = [RecName]
FldTknName_list = None

# Load the full rft configuration; this replaces the minimal rft_config built earlier
# in the notebook.
rft_config = load_rft_config(
    recfldtkn_config_path,
    RecName_list,
    FldTknName_list,
    SPACE,
    use_inference = False,
)

# Rebuild the record from scratch and write the result to disk.
results = pipeline_record(
    record_to_recfldtkn_list,
    OneCohort_config,
    rft_config,
    df_Human,
    RawName_to_dfRaw,
    load_from_disk = False,
    reuse_old_rft = False,
    save_to_disk = True,
)

# Keys available in the result.
list(results)

In [None]:
results['RecName_to_dsRec']