# Space


version: 2024-02-11

In [1]:
import os
import sys 
import pandas as pd 
from IPython.display import display, HTML
# Locate the workspace root: the part of the current working directory up to
# and including the first occurrence of KEY.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard the expression below would silently produce
    # '<cwd>WorkSpace' and os.chdir would fail with a confusing error.
    raise RuntimeError(
        f'Cannot locate the workspace: {KEY!r} does not appear in the current directory {_cwd!r}'
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
# Only extend sys.path once so that re-running this cell stays idempotent.
if WORKSPACE_PATH not in sys.path:
    sys.path.append(WORKSPACE_PATH)
import sys
from proj_space import PROJECT, TaskName, SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
# Folder holding the recfldtkn configuration (cohort / record yaml files).
recfldtkn_config_path = os.path.join(SPACE['CODE_RFT'], 'config_recfldtkn')

g:\Shared drives\CDHAI-WellDoc\2024-WellDocTest-SPACE\_WellDoc-RFT-WorkSpace


# [Part 1] Load dfHumanRecAttr

## [Step 1]** Create FldTknName

Motivation: To provide a flexible way to handle different types of configuration files or data records

Aim: Set FldTknName to the name of the field-tokenizer file ('WeightU-N2CTkn' in this notebook) and FldType to its type ('N2C' in this notebook).

Input:

Output: FldTknName and FldType



<span style="color:red;">Instruction:</span>
1. Change FldTknName. It has the form `RecName-TknName`; for example, in 'P-DemoCateTkn', P is the record name and DemoCateTkn is the specific token name.
2. Change FldType to match. The supported types are 'Cate', 'N2C', 'Nume', and 'External'.

In [2]:
# ------------------------- user settings -------------------------
# FldTknName has the form '<RecName>-<TknName>'.
FldTknName = 'WeightU-N2CTkn'  # <-------- select your yaml file name
FldType = 'N2C'
# ------------------------------------------------------------------

# Field-token types that the later cells have a branch for.
FLD_TYPE_LIST = ['Cate', 'N2C', 'Nume', 'External']
assert FldType in FLD_TYPE_LIST

# The record name is everything in front of the first hyphen.
RecName = FldTknName.partition('-')[0]
print(RecName)

WeightU


## [Step 2] Open  Rec yaml file 


In [3]:
import pandas as pd
from recfldtkn.configfn import load_cohort_args, load_record_args, load_fldtkn_args

# Load the cohort-, record- and field-token-level arguments from the config
# folder. The three loaders are called in this order because each later one
# takes the result of the first (cohort_args) as input.
# NOTE(review): load_fldtkn_args presumably requires the Record yaml to exist
# under recfldtkn_config_path -- confirm against recfldtkn.configfn.
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)
record_args = load_record_args(RecName, cohort_args)
fldtkn_args = load_fldtkn_args(RecName, FldTknName, cohort_args)

# Render the Record yaml path together with an "Open File" link. The link
# target is the yaml path joined onto the workspace root.
path = record_args['yaml_file_path']
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))

[INFO:2024-04-19 13:25:13,566:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn\Record\WeightU.yaml


## [Step 3]**: Add FldTknInfo to Record Yaml

``` yaml
FldTknInfo:
  # <---- updated information >
  P-DemoCateTkn:  # RecName-FldTkn 
    value_cols:   # value columns
      - Gender
      - DiseaseType
      - MRSegmentID
  # <--------------------->
```


## [Step 4] Load FldTkn Args (from Record yaml's FldTkn part)

In [4]:
from recfldtkn.loadtools import load_ds_rec_and_info
# Re-load the FldTkn arguments. The previous step asks for the Record yaml to
# be edited, so the arguments are read again here rather than reused.
fldtkn_args = load_fldtkn_args(RecName, FldTknName, cohort_args)

# Columns used from here on:
#   value_cols -- the columns the tokenizer turns into tokens
#   attr_cols  -- the columns selected when loading dfHumanRecAttr
#                 (presumably record_args['RecIDChain'] + value_cols -- TODO confirm)
value_cols = fldtkn_args['value_cols']
attr_cols = fldtkn_args['attr_cols']

print(value_cols)
print(attr_cols)

[INFO:2024-04-19 13:25:13,867:(config.py@58 datasets)]: PyTorch version 2.1.2+cu121 available.
[INFO:2024-04-19 13:25:14,140:(configfn.py@116 recfldtkn.configfn)]: file_path in load_fldtkn_args: ../pipeline\config_recfldtkn\Record\WeightU.yaml


['Weight']
['PID', 'WeightUID', 'DT_s', 'Weight']


## [Step 5] Prepare dfHumanRecAttr

In [5]:
############################
cohort_label_list = [1]
# Upper bound on the number of rows kept in memory while designing the tokenizer.
MAX_PREVIEW_ROWS = 100000
############################


# Load the record dataset for the selected cohort(s); the second return value
# is not needed in this notebook.
dsHumanRecAttr, _ = load_ds_rec_and_info(RecName, cohort_args, cohort_label_list)
# Keep only the columns the tokenizer needs and convert to pandas.
dfHumanRecAttr = dsHumanRecAttr.select_columns(attr_cols).to_pandas()
print(dfHumanRecAttr.shape)
del dsHumanRecAttr  # drop the reference to the dataset object; only the frame is used below


# Truncate very large cohorts so the design steps below stay fast.
if len(dfHumanRecAttr) > MAX_PREVIEW_ROWS:
    dfHumanRecAttr = dfHumanRecAttr.head(MAX_PREVIEW_ROWS)

dfHumanRecAttr.head()

(28171, 4)


Unnamed: 0,PID,WeightUID,DT_s,Weight
0,1000001,1000001-0,2018-06-25 16:43:50,209.439149
1,1000001,1000001-1,2018-08-16 06:51:00,207.234526
2,1000001,1000001-2,2018-09-04 07:34:00,207.234526
3,1000001,1000001-3,2018-12-12 15:06:00,208.998225
4,1000001,1000001-4,2018-12-13 14:54:00,209.439149


# [Part 2]: Design $\phi$ pipeline

## [Step 1]* [Pre-defined Token Vocab]

In [6]:
print(fldtkn_args['attr_cols'])
print(fldtkn_args['value_cols'])
print('FldType:', FldType)

# Defaults: only the branch matching FldType fills one of these in; the other
# stays empty so that both keys can always be stored in fldtkn_args below.
column_to_top_values = {}
item_to_configs = {}

if FldType == 'Cate':
    ############################### for Cate Tkn only
    # For every value column keep its TOP_NUM most frequent values.
    TOP_NUM = 30
    cols = fldtkn_args['value_cols']
    for col in cols:
        top_tkn = list(dfHumanRecAttr[col].value_counts().iloc[:TOP_NUM].index)
        print(col, len(top_tkn), top_tkn)
        column_to_top_values[col] = top_tkn # tolist()
    ###############################

elif FldType == 'N2C':
    ############################### for N2C Tkn only, you need to modify this part.
    # Print summary statistics of the value columns to help choose Min / Max / INTERVAL.
    cols = fldtkn_args['value_cols']
    descp = dfHumanRecAttr[cols].astype(float).describe().round(2)#.to_dict()
    print(descp)
    item_to_configs = {
       'Weight': {'Max': 300, 'Min': 100, 'INTERVAL': 10}, # <--- you need to modify this part
    }
    # A key that is not a column of the data would make the tokenizer emit
    # '<item>:None' for every record, so fail early on typos.
    _unknown_items = [item for item in item_to_configs if item not in dfHumanRecAttr.columns]
    assert not _unknown_items, f'item_to_configs keys not found in dfHumanRecAttr: {_unknown_items}'
    ###############################

elif FldType == 'External':
    # Preview the external lookup table supplied through the FldTkn arguments.
    df_db = fldtkn_args['external_source']
    display(HTML(df_db.head().to_html()))

else:
    # 'Nume' needs no pre-defined configuration; anything else is invalid.
    assert FldType in FLD_TYPE_LIST

# Store both configurations in fldtkn_args, where tokenizer_fn reads them.
fldtkn_args['column_to_top_values'] = column_to_top_values
fldtkn_args['item_to_configs'] = item_to_configs

['PID', 'WeightUID', 'DT_s', 'Weight']
['Weight']
FldType: N2C
         Weight
count  28171.00
mean     208.19
std       44.73
min        1.00
25%      174.52
50%      209.80
75%      236.60
max      555.00


## [Step 2]* Tokenizer

In [7]:
import inspect

################################## You might need to change it a bit. 
def tokenizer_fn(rec, fldtkn_args):
    """Turn one record into weighted tokens for an N2C (numeric-to-category) field.

    For every item configured in ``fldtkn_args['item_to_configs']`` the record's
    value is mapped as follows:

    * missing or null value       -> ``"{item}:None"`` with weight 1
    * value strictly above ``Max`` -> ``"{item}:Above{Max}"`` with weight 1
    * value strictly below ``Min`` -> ``"{item}:Below{Min}"`` with weight 1
    * otherwise two tokens: the bin ``"{item}:{lo}~{hi}"`` with weight 1 and
      ``"{item}:{lo}~{hi}Level"`` whose weight is the relative position of the
      value inside the bin, ``(value - lo) / INTERVAL``.

    Bins are ``INTERVAL`` wide and start at multiples of ``INTERVAL`` (they are
    not anchored at ``Min``). Both range tests are strict, so a value exactly
    equal to ``Max`` is binned (e.g. ``300 -> "300~310"`` for INTERVAL 10)
    rather than reported as ``Above``.

    Args:
        rec: one record; must support ``rec.get(key, default)`` (a dict or a
            pandas Series both work).
        fldtkn_args: dict holding ``'item_to_configs'``, a mapping
            ``{item: {'Max': ..., 'Min': ..., 'INTERVAL': ...}}``.

    Returns:
        dict with two aligned lists: ``'tkn'`` (token strings) and ``'wgt'``
        (their weights).
    """
    d = {}

    # #----------- Cate
    # column_to_top_values = fldtkn_args[f'column_to_top_values']
    # for key in column_to_top_values:
    #     top_values = column_to_top_values[key]
    #     value = rec.get(key, 'unk')
    #     if value not in top_values and value != 'unk': value = 'minor'
    #     key_value = f"{key}_{value}"  # Concatenate key and value
    #     d[key_value] = 1

    #------------ N2C with interval and interval level.
    item_to_configs = fldtkn_args['item_to_configs']
    for item, configs in item_to_configs.items():
        Max = configs['Max']
        Min = configs['Min']
        INTERVAL = configs['INTERVAL']

        # Look the value up once; a missing key is treated like a null value.
        raw_value = rec.get(item, None)
        if pd.isnull(raw_value):
            d[f"{item}:None"] = 1
            continue

        # Convert once instead of re-parsing the value in every comparison.
        value = float(raw_value)
        if value > Max:
            d[f"{item}:Above{Max}"] = 1
        elif value < Min:
            d[f"{item}:Below{Min}"] = 1
        else:
            lower_bound = int((value // INTERVAL) * INTERVAL)
            upper_bound = int(lower_bound + INTERVAL)
            # Proportion of the way through the interval, from 0 at the lower bound.
            proportion = (value - lower_bound) / INTERVAL
            # Construct the keys
            key1 = f"{item}:{lower_bound}~{upper_bound}"
            key2 = f"{key1}Level"
            # Add them to the dictionary with appropriate weights
            d[key1] = 1
            d[key2] = proportion

    # #------------ Nume
    # for col in fldtkn_args['value_cols']:
    #     x = rec[col]
    #     if pd.isnull(x):
    #         d[f'{col}_None'] = 1
    #     else:
    #         d[col] = float(x)

    # #------------ ExternalSource: zip3 as an example. This is case by case.
    ##############################################################
    ##     this part can be moved to big Phi in the future.     ##
    ##############################################################
    # df_db = fldtkn_args['external_source']
    # try:
    #     external_id = str(int(rec['patient_zipcode_3']))
    # except:
    #     external_id = str(rec['patient_zipcode_3'])

    # if external_id not in df_db['Zip3'].to_list():
    #     return {'tkn': ['zip3-None'], 'wgt':[1]}
    # row = df_db[df_db['Zip3'] == external_id].iloc[0].to_dict()
    # tkn_col = [i for i in df_db.columns if 'tkn' in i][0]
    # wgt_col = [i for i in df_db.columns if 'wgt' in i][0]
    # d = dict(zip(row[tkn_col], row[wgt_col]))


    # Dict insertion order keeps tokens and weights aligned.
    tkn = list(d.keys())
    wgt = list(d.values())
    output = {'tkn': tkn, 'wgt': wgt}
    return output
##################################

# Attach the function's own source text to it as an attribute.
# NOTE(review): presumably read later when the tokenizer is exported to a .py
# file -- confirm in recfldtkn.loadtools.
tokenizer_fn.fn_string = inspect.getsource(tokenizer_fn)

# Smoke test: run the tokenizer on the first record and print input and output.
print('show tokenizer_fn result')
rec = dfHumanRecAttr.iloc[0]
print(rec.to_dict())
print(tokenizer_fn(rec, fldtkn_args))

show tokenizer_fn result
{'PID': 1000001, 'WeightUID': '1000001-0', 'DT_s': Timestamp('2018-06-25 16:43:50'), 'Weight': 209.439149075634}
{'tkn': ['Weight:200~210', 'Weight:200~210Level'], 'wgt': [1, 0.9439149075633992]}


## [Step 3]* Vocab

In [8]:
import itertools

def sort_fn(s):
    """Sort key that orders tokens by the lower bound of their numeric range.

    The part after the last ':' and before the first '~' is parsed as an
    integer, e.g. ``'Weight:100~110Level' -> 100``. Tokens without such a
    number (``'Weight:None'``, ``'Weight:Above300'``, non-string values, ...)
    get ``float('inf')`` and therefore sort after all ranged tokens.

    Args:
        s: token string.

    Returns:
        int lower bound, or ``float('inf')`` when none can be parsed.
    """
    try:
        return int(s.split(':')[-1].split('~')[0])
    # Only swallow parse failures; a bare except would also hide
    # KeyboardInterrupt / SystemExit.
    except (AttributeError, TypeError, ValueError):
        return float('inf')
    
print('show idx2tkn for FldType:', FldType )
if FldType == 'Cate':
    ############################################## Cate Tkn only
    # Every column contributes '<col>_unk', '<col>_minor' and one token per top value.
    idx2tkn = []
    for col, values in column_to_top_values.items():
        idx2tkn = idx2tkn + [f'{col}_unk', f'{col}_minor']
        for val in values:
            idx2tkn.append(f"{col}_{val}")
    print(len(idx2tkn))
    print(idx2tkn[:10])
    ##############################################

elif FldType == 'N2C':
    ############################################## for N2C only
    # Enumerate the vocabulary by running the tokenizer over a simulated frame
    # holding None plus every integer in [Min, Max) for each configured item.
    # NOTE(review): only integer inputs in this range are simulated, so configs
    # whose bins fall outside it would yield an incomplete vocabulary -- confirm.
    Min, Max = -10, 3000 # <---- keep this as default. don't change it.
    df_simu = pd.DataFrame({
        col: [None] + list(range(Min, Max )) for col in item_to_configs.keys()
    })
    df_sim = pd.DataFrame(df_simu.apply(lambda rec: tokenizer_fn(rec, fldtkn_args), axis = 1).to_list())
    idx2tkn = sorted(set(itertools.chain(*df_sim['tkn'].to_list())))
    print(len(idx2tkn))
    print(idx2tkn[:10])
    ##############################################

elif FldType == 'Nume':
    ############################################## for Nume
    # One token per value column plus its '<col>_None' counterpart.
    idx2tkn = fldtkn_args['value_cols'] + [f'{col}_None' for col in fldtkn_args['value_cols']]
    print(len(idx2tkn))
    print(idx2tkn[:10])
    ##############################################

elif FldType == 'External':
    none_tkn = 'zip3-None' # <--- you need to change this.
    # Read the table from fldtkn_args instead of relying on a df_db variable
    # left over from an earlier cell.
    df_db = fldtkn_args['external_source']
    tkn_col = [i for i in df_db.columns if 'tkn' in i][0]
    idx2tkn = [none_tkn] + sorted(set(itertools.chain(*df_db[tkn_col].to_list())))
    # Report the vocabulary size and a preview, like the other branches.
    print(len(idx2tkn))
    print(idx2tkn[:10])

else:
    assert FldType in FLD_TYPE_LIST


# Order tokens by numeric lower bound (stable, so ties keep their previous
# order), then reserve index 0 for the unknown token.
idx2tkn = sorted(idx2tkn, key = sort_fn)
idx2tkn = ['unk'] + idx2tkn
print(len(idx2tkn))
idx2tkn

show idx2tkn for FldType: N2C
45
['Weight:100~110', 'Weight:100~110Level', 'Weight:110~120', 'Weight:110~120Level', 'Weight:120~130', 'Weight:120~130Level', 'Weight:130~140', 'Weight:130~140Level', 'Weight:140~150', 'Weight:140~150Level']
46


['unk',
 'Weight:100~110',
 'Weight:100~110Level',
 'Weight:110~120',
 'Weight:110~120Level',
 'Weight:120~130',
 'Weight:120~130Level',
 'Weight:130~140',
 'Weight:130~140Level',
 'Weight:140~150',
 'Weight:140~150Level',
 'Weight:150~160',
 'Weight:150~160Level',
 'Weight:160~170',
 'Weight:160~170Level',
 'Weight:170~180',
 'Weight:170~180Level',
 'Weight:180~190',
 'Weight:180~190Level',
 'Weight:190~200',
 'Weight:190~200Level',
 'Weight:200~210',
 'Weight:200~210Level',
 'Weight:210~220',
 'Weight:210~220Level',
 'Weight:220~230',
 'Weight:220~230Level',
 'Weight:230~240',
 'Weight:230~240Level',
 'Weight:240~250',
 'Weight:240~250Level',
 'Weight:250~260',
 'Weight:250~260Level',
 'Weight:260~270',
 'Weight:260~270Level',
 'Weight:270~280',
 'Weight:270~280Level',
 'Weight:280~290',
 'Weight:280~290Level',
 'Weight:290~300',
 'Weight:290~300Level',
 'Weight:300~310',
 'Weight:300~310Level',
 'Weight:Above300',
 'Weight:Below100',
 'Weight:None']

In [9]:
from recfldtkn.pipeline_record import get_and_save_vocab_from_idx2tkn
# Build the vocabulary from the ordered token list; fldtkn_args is expanded
# into keyword arguments of the helper.
# NOTE(review): per its name the helper also saves the vocabulary; where it is
# written is decided inside recfldtkn.pipeline_record -- confirm.
Vocab = get_and_save_vocab_from_idx2tkn(idx2tkn, **fldtkn_args)
# Display the result (entries idx2tkn, tkn2idx and tkn2fld in the output below).
Vocab

idx2tkn    [unk, Weight:100~110, Weight:100~110Level, Wei...
tkn2idx    {'unk': 0, 'Weight:100~110': 1, 'Weight:100~11...
tkn2fld    {'unk': 'WeightU-N2CTkn', 'Weight:100~110': 'W...
dtype: object

# Part 4: Application

## [Step 1] Save PyFile

In [10]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Lines placed at the top of the generated file.
# NOTE(review): presumably these make `pd` / `np` available to the exported
# tokenizer_fn source -- confirm in convert_variables_to_pystirng.
prefix = ['import pandas as pd', 'import numpy as np']
iterative_variables = [column_to_top_values, item_to_configs, idx2tkn] # <-- don't forget to update this.
fn_variables = [tokenizer_fn]
pycode = convert_variables_to_pystirng(iterative_variables = iterative_variables, fn_variables = fn_variables, prefix = prefix)
RecName = record_args['RecName']
pypath = fldtkn_args['pypath']
# print(pypath)

# Make sure the target folder exists before writing.
_pydir = os.path.dirname(pypath)
if _pydir:
    os.makedirs(_pydir, exist_ok=True)
# Python source files are UTF-8 by default, so write UTF-8 explicitly instead
# of the platform's locale encoding (which differs on Windows).
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Create a HTML link and display it
path = fldtkn_args['pypath']
full_path = os.path.join(WORKSPACE_PATH, path)
display(HTML(f'{path} <a href="{full_path}" target="_blank">Open File</a>'))


## [Step 2] Load PyFile

In [11]:
# Load the generated file back and use the objects it exposes through MetaDict.
module = load_module_variables(pypath)
_meta = module.MetaDict

# The tokenizer function and the token list always come from the file.
tokenizer_fn = _meta['tokenizer_fn']
idx2tkn = _meta['idx2tkn']

# The two configuration dicts are optional; copy over whichever ones exist.
for _cfg_name in ('column_to_top_values', 'item_to_configs'):
    if _cfg_name in _meta:
        fldtkn_args[_cfg_name] = _meta[_cfg_name]

## [Step 3] Application

In [12]:
from recfldtkn.pipeline_record import tokenizer_dfHumanRecAttr
from datetime import datetime

# Tokenize the preview frame, returning token indices, and time the call.
print('s', datetime.now())
RootID = cohort_args['RootID']
RecID = record_args['RecID']
df_fld = tokenizer_dfHumanRecAttr(
    dfHumanRecAttr, RootID, RecID, FldTknName,
    tokenizer_fn, Vocab, fldtkn_args,
    use_tknidx = True,
)
print('e', datetime.now())

# Report the size of the result in MB (bytes / 1024**2), index included.
total_memory = df_fld.memory_usage(index=True).sum()
print(f"Total memory usage: {total_memory / 1024**2:.2f} MB")
df_fld.head()

s 2024-04-19 13:25:15.004504
e 2024-04-19 13:25:15.304026
Total memory usage: 0.86 MB


Unnamed: 0,PID,WeightUID,WeightU-N2CTkn_tknidx,WeightU-N2CTkn_wgt
0,1000001,1000001-0,"[21, 22]","[1, 0.9439149075633992]"
1,1000001,1000001-1,"[21, 22]","[1, 0.7234526453785008]"
2,1000001,1000001-2,"[21, 22]","[1, 0.7234526453785008]"
3,1000001,1000001-3,"[21, 22]","[1, 0.8998224551264002]"
4,1000001,1000001-4,"[21, 22]","[1, 0.9439149075633992]"


In [13]:
# Same run as the previous cell, but returning token strings instead of indices.
print('s', datetime.now())
RootID = cohort_args['RootID']
RecID = record_args['RecID']
df_fld = tokenizer_dfHumanRecAttr(
    dfHumanRecAttr, RootID, RecID, FldTknName,
    tokenizer_fn, Vocab, fldtkn_args,
    use_tknidx = False,
)
print('e', datetime.now())

# Report the size of the result in MB (bytes / 1024**2), index included.
total_memory = df_fld.memory_usage(index=True).sum()
print(f"Total memory usage: {total_memory / 1024**2:.2f} MB")
df_fld.head()

s 2024-04-19 13:25:15.336731
e 2024-04-19 13:25:15.621844
Total memory usage: 0.86 MB


Unnamed: 0,PID,WeightUID,WeightU-N2CTkn_tkn,WeightU-N2CTkn_wgt
0,1000001,1000001-0,"[Weight:200~210, Weight:200~210Level]","[1, 0.9439149075633992]"
1,1000001,1000001-1,"[Weight:200~210, Weight:200~210Level]","[1, 0.7234526453785008]"
2,1000001,1000001-2,"[Weight:200~210, Weight:200~210Level]","[1, 0.7234526453785008]"
3,1000001,1000001-3,"[Weight:200~210, Weight:200~210Level]","[1, 0.8998224551264002]"
4,1000001,1000001-4,"[Weight:200~210, Weight:200~210Level]","[1, 0.9439149075633992]"


# Save RFT

In [14]:
import shutil
from recfldtkn.pipeline_record import pipeline_for_FldTkn
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

############################
cohort_label_list = [1, 2, 3]
############################


# Map cohort label -> cohort name from the folder names '<label>-<name>' found
# under DATA_RFT. Split on the first hyphen only, so a cohort name that itself
# contains '-' is kept whole and cohort_full_name below matches the folder.
cohort_label_to_cohort_name = dict(
    i.split('-', 1) for i in os.listdir(SPACE['DATA_RFT']) if '-' in i
)

# Same for every cohort, so set it once before the loop.
fldtkn_args['Name'] = FldTknName

for cohort_label in cohort_label_list:
    cohort_name = cohort_label_to_cohort_name[str(cohort_label)]
    cohort_full_name = f'{cohort_label}-{cohort_name}'
    logger.info(f'\n\n================={cohort_full_name}=================\n')


    data_folder = os.path.join(SPACE['DATA_RFT'], cohort_full_name,  RecName)
    logger.info(data_folder)
    ds_rec, _ = load_ds_rec_and_info(RecName, cohort_args, [cohort_label])
    logger.info(ds_rec)

    # Drop a stale 'tkn' column if one is present.
    if 'tkn' in ds_rec.column_names:
        logger.info(ds_rec.column_names)
        ds_rec = ds_rec.remove_columns('tkn')

    # Add this field tokenizer's columns to the record dataset.
    ds_rec = pipeline_for_FldTkn(ds_rec, fldtkn_args)
    # ds_rec = ds_rec.map(lambda x: tokenizer_fn(x, fldtkn_args))
    logger.info(ds_rec)

    # Write to a temporary folder first, then swap it in for '<RecName>_data'.
    # NOTE(review): ds_rec is presumably loaded from that very folder, which is
    # why it is not overwritten in place -- confirm in load_ds_rec_and_info.
    ds_rec.save_to_disk(data_folder + '_data_tmp')
    # ds_rec_info.save_to_disk(data_folder + '_info_tmp')
    del ds_rec # , ds_rec_info
    directory_to_remove = data_folder + '_data'
    # Guard the removal so a missing '_data' folder does not abort the swap.
    if os.path.isdir(directory_to_remove):
        shutil.rmtree(directory_to_remove)
    os.rename(data_folder + '_data_tmp', data_folder + '_data')

# Reload without a cohort filter and display the resulting dataset.
ds_rec, _ = load_ds_rec_and_info(RecName, cohort_args)
ds_rec

[INFO:2024-04-19 13:25:15,663:(3358717945.py@17 __main__)]: 


[INFO:2024-04-19 13:25:15,664:(3358717945.py@21 __main__)]: ../_Data/1-Data_RFT\1-RawData2022_CGM\WeightU
[INFO:2024-04-19 13:25:15,731:(3358717945.py@23 __main__)]: Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx'],
    num_rows: 28171
})
[INFO:2024-04-19 13:25:15,732:(pipeline_record.py@477 recfldtkn.pipeline_record)]: load fldtkn pipeline from: ../pipeline/fn_fldtkn/WeightU_N2CTkn.py ...


Map (num_proc=4):   0%|          | 0/28171 [00:00<?, ? examples/s]

[INFO:2024-04-19 13:25:18,411:(pipeline_record.py@494 recfldtkn.pipeline_record)]: ds_rec.column_names: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'] ...
[INFO:2024-04-19 13:25:18,412:(3358717945.py@33 __main__)]: Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'],
    num_rows: 28171
})


Saving the dataset (0/1 shards):   0%|          | 0/28171 [00:00<?, ? examples/s]

[INFO:2024-04-19 13:25:18,654:(3358717945.py@17 __main__)]: 


[INFO:2024-04-19 13:25:18,654:(3358717945.py@21 __main__)]: ../_Data/1-Data_RFT\2-RawData2023_CVSTDCAug\WeightU
[INFO:2024-04-19 13:25:18,744:(3358717945.py@23 __main__)]: Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx'],
    num_rows: 83107
})
[INFO:2024-04-19 13:25:18,744:(pipeline_record.py@477 recfldtkn.pipeline_record)]: load fldtkn pipeline from: ../pipeline/fn_fldtkn/WeightU_N2CTkn.py ...


Map (num_proc=4):   0%|          | 0/83107 [00:00<?, ? examples/s]

[INFO:2024-04-19 13:25:21,929:(pipeline_record.py@494 recfldtkn.pipeline_record)]: ds_rec.column_names: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'] ...
[INFO:2024-04-19 13:25:21,930:(3358717945.py@33 __main__)]: Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'],
    num_rows: 83107
})


Saving the dataset (0/1 shards):   0%|          | 0/83107 [00:00<?, ? examples/s]

[INFO:2024-04-19 13:25:22,410:(3358717945.py@17 __main__)]: 


[INFO:2024-04-19 13:25:22,410:(3358717945.py@21 __main__)]: ../_Data/1-Data_RFT\3-RawData2023_CVSDeRxAug\WeightU
[INFO:2024-04-19 13:25:22,481:(3358717945.py@23 __main__)]: Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx'],
    num_rows: 5194
})
[INFO:2024-04-19 13:25:22,482:(pipeline_record.py@477 recfldtkn.pipeline_record)]: load fldtkn pipeline from: ../pipeline/fn_fldtkn/WeightU_N2CTkn.py ...


Map (num_proc=4):   0%|          | 0/5194 [00:00<?, ? examples/s]

[INFO:2024-04-19 13:25:24,913:(pipeline_record.py@494 recfldtkn.pipeline_record)]: ds_rec.column_names: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'] ...
[INFO:2024-04-19 13:25:24,913:(3358717945.py@33 __main__)]: Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'],
    num_rows: 5194
})


Saving the dataset (0/1 shards):   0%|          | 0/5194 [00:00<?, ? examples/s]

Dataset({
    features: ['PID', 'PatientID', 'WeightUID', 'DT_r', 'DT_s', 'DT_tz', 'Weight', 'WeightU-NumeTkn_wgt', 'WeightU-NumeTkn_tknidx', 'WeightU-N2CTkn_wgt', 'WeightU-N2CTkn_tknidx'],
    num_rows: 116472
})