# Space


version: 2024-02-11

In [1]:
import os
import sys 
import pandas as pd 
from IPython.display import display, HTML
# Locate the workspace root: the prefix of the current directory that ends with KEY.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard, split() returns the whole cwd and KEY is blindly appended,
    # producing a path that does not exist and an opaque error from os.chdir below.
    raise FileNotFoundError(
        f"Cannot locate the workspace: {KEY!r} does not appear in the current directory {_cwd!r}. "
        "Start the notebook from inside the workspace folder."
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
if WORKSPACE_PATH not in sys.path:  # re-running the cell must not keep growing sys.path
    sys.path.append(WORKSPACE_PATH)

# proj_space is imported only after sys.path has been extended
# (presumably because it lives under the workspace root -- TODO confirm).
from proj_space import PROJECT, TaskName, SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
# Folder passed to load_cohort_args in a later cell.
recfldtkn_config_path = os.path.join(SPACE['CODE_RFT'], 'config_recfldtkn')

/Users/xiruihan/Library/CloudStorage/GoogleDrive-rui.han.cdhai@gmail.com/Shared drives/CDHAI-WellDoc/2024-WellDocTest-SPACE/_RFT-WorkSpace


# [Part 1] Load dfHumanRecAttr

## [Step 1]** Create FldTknName

Motivation: To provide a flexible way to handle different types of configuration files or data records

Aim: FldTknName is set to the name of a file ('P-DemoCateTkn' in this case); FldType is set to 'Cate' in this case.

Input:

Output: FldTknName and FldType



<span style="color:red;">Instruction:</span>
1. Change `'P-DemoCateTkn'` to your own FldTknName. In this example, `P` is the record name and `DemoCateTkn` is the specific token name.
2. Change `FldType`. It must be one of `'Cate'`, `'N2C'`, `'Nume'`, or `'External'`.

In [2]:
# ------------------ user settings ------------------
FldTknName = 'P-DemoCateTkn'  # <-------- select your yaml file name
FldType = 'Cate'
# ---------------------------------------------------

# Field types this notebook knows how to handle.
FLD_TYPE_LIST = ['Cate', 'N2C', 'Nume', 'External']
assert FldType in FLD_TYPE_LIST

# The record name is whatever precedes the first '-' in FldTknName.
RecName = FldTknName.partition('-')[0]
print(RecName)

P


## [Step 2] Open  Rec yaml file 


In [3]:
import pandas as pd
from recfldtkn.configfn import load_cohort_args, load_record_args, load_fldtkn_args

# Prerequisite (original note): create the FldTkn yaml file in recfldtkn_config_path.
# Then load the cohort-, record- and field-token-level arguments.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)
record_args = load_record_args(RecName, cohort_args)
fldtkn_args = load_fldtkn_args(RecName, FldTknName, cohort_args)

# Render a link to the record's yaml file so it can be opened from the notebook.
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
yaml_link_html = f'{path} <a href="{full_path}" target="_blank">Open File</a>'
display(HTML(yaml_link_html))

## [Step 3]**: Add FldTknInfo to Record Yaml

``` yaml
FldTknInfo:
  # <---- updated information >
  P-DemoCateTkn:  # RecName-FldTkn 
    value_cols:   # value columns
      - Gender
      - DiseaseType
      - MRSegmentID
  # <--------------------->
```


## [Step 4] Load FldTkn Args (from Record yaml's FldTkn part)

In [4]:
from recfldtkn.loadtools import load_ds_rec_and_info
# Load fldtkn_args again (presumably so the FldTknInfo entry added to the record yaml
# in the previous step is picked up -- TODO confirm).
fldtkn_args = load_fldtkn_args(RecName, FldTknName, cohort_args)

# Columns used to build dfHumanRecAttr:
#   value_cols -- the columns the tokenizer reads
#   attr_cols  -- the columns selected from the record dataset
value_cols = fldtkn_args['value_cols']
attr_cols  = fldtkn_args['attr_cols'] # record_args['RecIDChain'] + value_cols

print(value_cols)
print(attr_cols)

['Gender', 'DiseaseType', 'MRSegmentID']
['PID', 'PatientID', 'Gender', 'DiseaseType', 'MRSegmentID']


## [Step 5] Prepare dfHumanRecAttr

In [5]:

# Load the record dataset (the second return value is not needed here),
# keep only the attribute columns and convert the result to a pandas DataFrame.
dsHumanRecAttr, _ = load_ds_rec_and_info(RecName, cohort_args)
ds_attr_only = dsHumanRecAttr.select_columns(attr_cols)
dfHumanRecAttr = ds_attr_only.to_pandas()
print(dfHumanRecAttr.shape)
dfHumanRecAttr.head()

(7296, 5)


Unnamed: 0,PID,PatientID,Gender,DiseaseType,MRSegmentID
0,1000001,6,2,1,MR_6
1,1000002,10,2,2,MR_1
2,1000003,11,1,2,MR_1
3,1000004,13,1,2,MR_1
4,1000005,14,2,2,MR_5.6


# [Part 2]: Design $\phi$ pipeline

## [Step 1]* [Pre-defined Token Vocab]

In [6]:
print(fldtkn_args['attr_cols'])
print(fldtkn_args['value_cols'])
print('FldType:', FldType)

# Both settings default to empty; only the branch matching FldType fills one in.
column_to_top_values = {}
item_to_configs = {}

if FldType == 'Cate':
    # ---- Cate Tkn only: keep the TOP_NUM most frequent values of each value column.
    TOP_NUM = 30
    cols = fldtkn_args['value_cols']
    for col in cols:
        value_freq = dfHumanRecAttr[col].value_counts()
        top_tkn = list(value_freq.head(TOP_NUM).index)
        print(col, len(top_tkn), top_tkn)
        column_to_top_values[col] = top_tkn

elif FldType == 'N2C':
    # ---- N2C Tkn only, you need to modify this part.
    # The summary statistics are printed to help choose Max / Min / INTERVAL per column.
    cols = fldtkn_args['value_cols']
    descp = dfHumanRecAttr[cols].astype(float).describe().round(2)
    print(descp)
    item_to_configs = {
       #  'XXX': {'Max': 600, 'Min': 1, 'INTERVAL': 1}, # <--- you need to modify this part
    }

elif FldType == 'External':
    # ---- External only: preview the external lookup table.
    df_db = fldtkn_args['external_source']
    display(HTML(df_db.head().to_html()))

else:
    # The remaining valid type ('Nume') needs nothing here; unknown types fail the assert.
    assert FldType in FLD_TYPE_LIST

# Stored in fldtkn_args because tokenizer_fn reads its settings from there.
fldtkn_args['column_to_top_values'] = column_to_top_values
fldtkn_args['item_to_configs'] = item_to_configs

['PID', 'PatientID', 'Gender', 'DiseaseType', 'MRSegmentID']
['Gender', 'DiseaseType', 'MRSegmentID']
FldType: Cate
Gender 2 [1, 2]
DiseaseType 2 [2, 1]
MRSegmentID 17 ['MR_6', 'MR_0', 'MR_1', 'MR_7', 'MR_2', 'MR_4', 'MR_5.6', 'MR_5.0', 'MR_5.7', 'MR_3', 'MR_5.4', 'MR_5.8', 'MR_5.1', 'MR_4.1', 'MR_5.2', 'MR_-1', 'MR_5.5']


## [Step 2]* Tokenizer

In [7]:
import inspect

################################## You might need to change it a bit. 
def tokenizer_fn(rec, fldtkn_args):
    """Turn one record into a list of tokens and a parallel list of weights.

    Only the categorical ('Cate') section is active; the N2C, Nume and
    ExternalSource sections are kept below as commented templates to adapt.

    Args:
        rec: one record; any object with a dict-style ``.get`` (a dict or a
            pandas Series row, as used in this notebook).
        fldtkn_args: dict that must contain ``'column_to_top_values'``, a mapping
            from column name to the list of values that get their own token.

    Returns:
        dict: ``{'tkn': [...], 'wgt': [...]}`` where ``tkn[i]`` has weight ``wgt[i]``.
        For every column in ``column_to_top_values`` one token ``"<col>_<value>"``
        with weight 1 is emitted; the value is replaced by ``'unk'`` when it is
        absent or missing, and by ``'minor'`` when it is not one of the kept values.
    """
    d = {}

    # #----------- Cate
    column_to_top_values = fldtkn_args['column_to_top_values']
    for key, top_values in column_to_top_values.items():
        value = rec.get(key, 'unk')
        # A column that exists but holds None/NaN/NA does not trigger the .get default,
        # so map missing scalars to 'unk' explicitly instead of letting them fall
        # into 'minor'. is_scalar guards against list-like values, for which
        # pd.isnull would return an array.
        if pd.api.types.is_scalar(value) and pd.isnull(value):
            value = 'unk'
        elif value != 'unk' and value not in top_values:
            value = 'minor'
        key_value = f"{key}_{value}"  # Concatenate key and value
        d[key_value] = 1

    # #------------ N2C with interval and interval level.
    # item_to_configs = fldtkn_args['item_to_configs']
    # for item, configs in item_to_configs.items():
    #     Max = configs['Max']
    #     Min = configs['Min']
    #     INTERVAL = configs['INTERVAL']
    #     if pd.isnull(rec.get(item, None)):
    #         d[f"{item}:None"] = 1
    #     elif float(rec[item]) > Max:
    #         d[ f"{item}:Above{Max}"] = 1
    #     elif float(rec[item]) < Min:
    #         d[ f"{item}:Below{Min}"] = 1
    #     else:
    #         lower_bound = int((float(rec[item]) // INTERVAL) * INTERVAL)
    #         upper_bound = int(lower_bound + INTERVAL)
    #         # Calculate the proportion of value within the interval
    #         proportion = (float(rec[item]) - lower_bound) / INTERVAL
    #         # Construct the keys
    #         key1 = f"{item}:{lower_bound}~{upper_bound}"
    #         key2 = f"{key1}Level"
    #         # Add them to the dictionary with appropriate weights
    #         d[key1] = 1
    #         d[key2] = proportion

    # #------------ Nume
    # for col in fldtkn_args['value_cols']:
    #     x = rec[col]
    #     if pd.isnull(x):
    #         d[f'{col}_None'] = 1
    #     else:
    #         d[col] = float(x)

    # #------------ ExternalSource: zip3 as an example. This is case by case. 
    ##############################################################
    ##     this part can be moved to big Phi in the future.     ##
    ##############################################################
    # df_db = fldtkn_args['external_source']
    # try:
    #     external_id = str(int(rec['patient_zipcode_3']))
    # except:
    #     external_id = str(rec['patient_zipcode_3'])

    # if external_id not in df_db['Zip3'].to_list():
    #     return {'tkn': ['zip3-None'], 'wgt':[1]}
    # row = df_db[df_db['Zip3'] == external_id].iloc[0].to_dict()
    # tkn_col = [i for i in df_db.columns if 'tkn' in i][0]
    # wgt_col = [i for i in df_db.columns if 'wgt' in i][0]
    # d = dict(zip(row[tkn_col], row[wgt_col]))


    # Dict insertion order keeps tokens and weights aligned.
    tkn = list(d.keys())
    wgt = list(d.values())
    output = {'tkn': tkn, 'wgt': wgt}
    return output
##################################

# Attach the function's own source text to it as an attribute
# (presumably read when the function is written to the generated py file -- TODO confirm).
tokenizer_fn.fn_string = inspect.getsource(tokenizer_fn)

# Quick check: tokenize the first record and show input and output side by side.
print('show tokenizer_fn result')
rec = dfHumanRecAttr.iloc[0]
rec_as_dict = rec.to_dict()
print(rec_as_dict)
first_rec_tokens = tokenizer_fn(rec, fldtkn_args)
print(first_rec_tokens)

show tokenizer_fn result
{'PID': 1000001, 'PatientID': 6, 'Gender': 2, 'DiseaseType': 1, 'MRSegmentID': 'MR_6'}
{'tkn': ['Gender_2', 'DiseaseType_1', 'MRSegmentID_MR_6'], 'wgt': [1, 1, 1]}


## [Step 3]* Vocab

In [8]:
import itertools

print('show idx2tkn for FldType:', FldType )
if FldType == 'Cate':
    ############################################## Cate Tkn only
    # Per column: the two fallback tokens first, then one token per kept value.
    idx2tkn = []
    for col, values in column_to_top_values.items():
        idx2tkn = idx2tkn + [f'{col}_unk', f'{col}_minor']
        for val in values:
            idx2tkn.append(f"{col}_{val}")
    ##############################################

elif FldType == 'N2C':
    ############################################## for N2C only
    # Enumerate the tokens by running tokenizer_fn over simulated records that
    # cover None plus every integer in [Min, Max) for each configured item.
    Min, Max = -10, 3000 # <---- keep this as default. don't change it.
    df_simu = pd.DataFrame({
        col: [None] + list(range(Min, Max )) for col in item_to_configs.keys()
    })
    df_sim = pd.DataFrame(df_simu.apply(lambda rec: tokenizer_fn(rec, fldtkn_args), axis = 1).to_list())
    idx2tkn = sorted(set(itertools.chain(*df_sim['tkn'].to_list())))
    ##############################################

elif FldType == 'Nume':
    ############################################## for Nume
    # One token per value column plus its '<col>_None' counterpart.
    idx2tkn = fldtkn_args['value_cols'] + [f'{col}_None' for col in fldtkn_args['value_cols']]
    ##############################################

elif FldType == 'External':
    none_tkn = 'zip3-None' # <--- you need to change this.
    # Read the table from fldtkn_args here instead of relying on a df_db global
    # left behind by an earlier cell.
    df_db = fldtkn_args['external_source']
    tkn_col = [i for i in df_db.columns if 'tkn' in i][0]
    idx2tkn = [none_tkn] + sorted(set(itertools.chain(*df_db[tkn_col].to_list())))

else:
    assert FldType in FLD_TYPE_LIST

# Same summary for every FldType: vocabulary size, then the first ten tokens.
# (The External branch used to print len(idx2tkn[:10]), i.e. at most 10, and no tokens.)
print(len(idx2tkn))
print(idx2tkn[:10])

show idx2tkn for FldType: Cate
27
['Gender_unk', 'Gender_minor', 'Gender_1', 'Gender_2', 'DiseaseType_unk', 'DiseaseType_minor', 'DiseaseType_2', 'DiseaseType_1', 'MRSegmentID_unk', 'MRSegmentID_minor']


In [9]:
from recfldtkn.pipeline_record import get_and_save_vocab_from_idx2tkn
# Build the vocabulary object from idx2tkn; every entry of fldtkn_args is forwarded as a
# keyword argument. The helper's name suggests it also saves the vocab -- where it is
# written is decided inside the helper (not visible here; TODO confirm).
Vocab = get_and_save_vocab_from_idx2tkn(idx2tkn, **fldtkn_args)
# Display it: the captured output shows the entries idx2tkn, tkn2idx and tkn2fld.
Vocab

idx2tkn    [Gender_unk, Gender_minor, Gender_1, Gender_2,...
tkn2idx    {'Gender_unk': 0, 'Gender_minor': 1, 'Gender_1...
tkn2fld    {'Gender_unk': 'P-DemoCateTkn', 'Gender_minor'...
dtype: object

# Part 4: Application

## [Step 1] Save PyFile

In [10]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Pieces of the generated Python file: import lines, plain variables, and functions.
prefix = ['import pandas as pd', 'import numpy as np']
iterative_variables = [column_to_top_values, item_to_configs, idx2tkn] # <-- don't forget to update this.
fn_variables = [tokenizer_fn]
pycode = convert_variables_to_pystirng(iterative_variables = iterative_variables, fn_variables = fn_variables, prefix = prefix)
RecName = record_args['RecName']
pypath = fldtkn_args['pypath']
# print(pypath)
# Python decodes source files as UTF-8 by default, so write with that encoding explicitly
# instead of the platform default; otherwise non-ASCII tokens could yield an unreadable file.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Create a HTML link and display it
path = fldtkn_args['pypath']
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))


## [Step 2] Load PyFile

In [11]:
# Read back the generated file and pull the saved objects out of its MetaDict.
module = load_module_variables(pypath)
saved_objects = module.MetaDict

# The tokenizer and the token list are always expected to be present.
tokenizer_fn = saved_objects['tokenizer_fn']
idx2tkn = saved_objects['idx2tkn']

# Optional tokenizer settings: copy into fldtkn_args only the ones that were saved.
for setting_name in ('column_to_top_values', 'item_to_configs'):
    if setting_name in saved_objects:
        fldtkn_args[setting_name] = saved_objects[setting_name]

## [Step 3] Application

In [12]:
from recfldtkn.pipeline_record import tokenizer_dfHumanRecAttr
from datetime import datetime

# Tokenize all records with use_tknidx=True (the result carries a '<FldTknName>_tknidx'
# column, as the captured output shows). Start/end timestamps are printed around the call.
print('s', datetime.now())
RootID = cohort_args['RootID']
RecID = record_args['RecID']
df_fld = tokenizer_dfHumanRecAttr(
    dfHumanRecAttr, RootID, RecID, FldTknName,
    tokenizer_fn, Vocab, fldtkn_args,
    use_tknidx = True,
)
print('e', datetime.now())

# Report the size of the result in MB, then preview it.
total_memory = df_fld.memory_usage(index=True).sum()
print(f"Total memory usage: {total_memory / 1024**2:.2f} MB")
df_fld.head()

s 2024-02-11 21:38:28.086439
e 2024-02-11 21:38:28.242044
Total memory usage: 0.17 MB


Unnamed: 0,PID,P-DemoCateTkn_tknidx,P-DemoCateTkn_wgt
0,1000001,"[3, 7, 10]","[1, 1, 1]"
1,1000002,"[3, 6, 12]","[1, 1, 1]"
2,1000003,"[2, 6, 12]","[1, 1, 1]"
3,1000004,"[2, 6, 12]","[1, 1, 1]"
4,1000005,"[3, 6, 16]","[1, 1, 1]"


In [13]:
# Same run with use_tknidx=False (the result carries a '<FldTknName>_tkn' column of
# token strings, as the captured output shows). Start/end timestamps are printed around the call.
print('s', datetime.now())
RootID = cohort_args['RootID']
RecID = record_args['RecID']
df_fld = tokenizer_dfHumanRecAttr(
    dfHumanRecAttr, RootID, RecID, FldTknName,
    tokenizer_fn, Vocab, fldtkn_args,
    use_tknidx = False,
)
print('e', datetime.now())

# Report the size of the result in MB, then preview it.
total_memory = df_fld.memory_usage(index=True).sum()
print(f"Total memory usage: {total_memory / 1024**2:.2f} MB")
df_fld.head()

s 2024-02-11 21:38:29.031967
e 2024-02-11 21:38:29.311077
Total memory usage: 0.17 MB


Unnamed: 0,PID,P-DemoCateTkn_tkn,P-DemoCateTkn_wgt
0,1000001,"[Gender_2, DiseaseType_1, MRSegmentID_MR_6]","[1, 1, 1]"
1,1000002,"[Gender_2, DiseaseType_2, MRSegmentID_MR_1]","[1, 1, 1]"
2,1000003,"[Gender_1, DiseaseType_2, MRSegmentID_MR_1]","[1, 1, 1]"
3,1000004,"[Gender_1, DiseaseType_2, MRSegmentID_MR_1]","[1, 1, 1]"
4,1000005,"[Gender_2, DiseaseType_2, MRSegmentID_MR_5.6]","[1, 1, 1]"
