# Space

In [None]:
import os
import sys 
import logging
import random
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML

# WorkSpace
# Derive the workspace root from the current working directory: keep everything
# before the first occurrence of KEY and append KEY itself.
# NOTE(review): when KEY does not occur in os.getcwd(), split() returns the whole
# cwd, so KEY is simply glued onto the end of it - confirm the notebook is always
# started from inside the workspace tree.
KEY = 'WorkSpace'; WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY; print(WORKSPACE_PATH)
# Make the workspace root the working directory for the rest of the notebook.
os.chdir(WORKSPACE_PATH)
# Put the workspace root on sys.path so `proj_space` (imported right below) resolves.
sys.path.append(WORKSPACE_PATH)

# Pipeline Space
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
sys.path.append(SPACE['CODE_FN'])
pprint(SPACE)

# Available Packages
import argparse
import datasets
import pandas as pd
from datetime import datetime 

from recfldtkn.ckpd_obs import Ckpd_ObservationS
from recfldtkn.configfn import load_cohort_args
from recfldtkn.loadtools import load_module_variables, update_args_to_list
from recfldtkn.observer import get_RecObsName_to_RecObsInfo, CaseObserverTransformer

logger = logging.getLogger(__name__)
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')


# [Part 1] Get a Caseset: Case Examples

## [Step 1]

In [None]:
from recfldtkn.loadtools import fetch_TriggerEvent_tools


####################
TriggerCaseMethod = 'CGM5MinEntry'
####################

Trigger_tools = fetch_TriggerEvent_tools(TriggerCaseMethod, SPACE)
Trigger_tools 

In [None]:
##################################
CaseSetName = TriggerCaseMethod
case_id_columns = Trigger_tools['case_id_columns']
special_columns = Trigger_tools['special_columns']
##################################

TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'{CaseSetName}.p')
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)
cohort_args['Ckpd_ObservationS'] = Ckpd_ObservationS
print(cohort_args)
print(TriggerCasePath)

df_case = pd.read_pickle(TriggerCasePath) # (1000)
# Work on a reproducible sub-sample of at most 100 cases. The size is capped at
# len(df_case) because DataFrame.sample (without replacement) raises ValueError
# when asked for more rows than the frame holds.
n_sample = min(100, len(df_case))
df_case = df_case.sample(n_sample, random_state=0).reset_index(drop=True)
ds_case = datasets.Dataset.from_pandas(df_case)
ds_case


## [Step 2] RO: Record Observation & CasePhi


In [None]:
###########################
ROName_List = ['CGM5Min-Bf24H-N2Cin1']
name_CasePhi = '1TknIn5Mintid'
###########################

In [None]:
from recfldtkn.obsname import convert_RONameList_to_COName

CaseObsName = convert_RONameList_to_COName(ROName_List, name_CasePhi)
print(CaseObsName)

In [None]:
# part of fetch_caseobs_Phi_tools
# Folder of this case observation under SPACE['DATA_CaseObs'].
CO_Folder = os.path.join(SPACE['DATA_CaseObs'], CaseObsName)
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(CO_Folder, exist_ok=True)
print(CO_Folder)

# [Part 2] Tools for ds_case_obs

## [Step 1] Prepare examples

In [None]:
case_examples = ds_case[:5]
print(case_examples)

In [None]:
length = len(case_examples[list(case_examples.keys())[0]])
case_examples_list = [{k: v[i] for k, v in case_examples.items()} for i in range(length)]
idx_to_examples = {i: case_examples_list[i] for i in range(length)}
pprint(idx_to_examples, sort_dicts=False)

In [None]:
case_example = idx_to_examples[0]
case_example

## [Step 2] get_caseobs_id

In [None]:
import inspect
#################################################
def get_CO_id(case_example, CaseObsName):
    """Build the identifier string of one case observation.

    Args:
        case_example: mapping with a 'PID' entry and an 'ObsDT' entry; the
            'ObsDT' value must provide ``isoformat()``.
        CaseObsName: accepted for interface compatibility; not used here.

    Returns:
        str: ``'<PID>&<ObsDT in ISO format>'``.
    """
    id_parts = (case_example['PID'], case_example['ObsDT'].isoformat())
    return '{}&{}'.format(*id_parts)

get_CO_id.fn_string = inspect.getsource(get_CO_id)
#################################################

In [None]:
caseobs_id = get_CO_id(case_example, CaseObsName)
caseobs_id # CO_id



## [Step 3] RO List and CasePhi

In [None]:
name_CasePhi 

In [None]:
## we need to prepare these five arguments.
# CaseTkn     # <------- we have this.
# cohort_args # <------- we have this.

# RecObs_Name 
# rec_args
# column_names # attribute name of R^recordname

In [None]:
######### RecObs_Name
print(ROName_List) # RO_List
# Let's focus on one RO
ROName = ROName_List[0]
print(ROName)

In [None]:
# RO = 'Rx-Bf2M-InsCate'
# RO = 'Rx-Bf2M'
# RO = 'P-Zip3DemoNume'

## [Step 4] RO Information

In [None]:
# decompose RO to RecName, CkpdName, FldName (small phi)
from recfldtkn.obsname import parse_RecObsName

d = parse_RecObsName(ROName)
RecName = d['RecName']
CkpdName = d['CkpdName']
FldName = d['FldName']
print(RecName, CkpdName, FldName)

In [None]:
RecName

In [None]:
# load all information about the R^record_name. 
from recfldtkn.configfn import load_record_args

print(RecName)
rec_args = load_record_args(RecName, cohort_args)
[i for i in rec_args]

## [Step 5] Load ds_rec

In [None]:
from recfldtkn.loadtools import load_ds_rec_and_info

ds_rec, ds_rec_info = load_ds_rec_and_info(RecName, cohort_args)
# R^Rx
# ROdf # Rx-Bf2M (i, t, Bf2M)

In [None]:
######### column_names (or we can call it attribute columns)
column_names = ds_rec.column_names # attr_columns + phi_columns
column_names
# ROdf_ij --(Phi: RecNum)-> COdf_ij 

## [Step 6] get_selected_columns (attr + phi) from ds_rec

In [None]:
column_names

In [None]:
from recfldtkn.obsname import parse_RecObsName
# RecName = RecObs_Name.split('-')[0]
base_columns = [cohort_args['RootID'], rec_args['RecID'], rec_args['RecDT']]
base_columns

In [None]:
RecName = parse_RecObsName(ROName)['RecName']
FldName = parse_RecObsName(ROName)['FldName']
RecFldTknName = RecName + '-' + FldName + 'Tkn'
# rec_args['FldTknInfo'][RecFldTknName]
RecFldTknName

In [None]:
###############################
def get_selected_columns(ROName, column_names, cohort_args, rec_args, CaseTkn):
    """Choose the record-dataset columns to keep for one record observation.

    The result always starts with the three base columns (root ID, record ID,
    record datetime). They are followed by every entry of ``column_names`` that
    contains the RO's field name; when no column matches, the raw value columns
    configured for the field tokenizer are used instead.

    Args:
        ROName: record observation name, parsed with ``parse_RecObsName``.
        column_names: column names available in the record dataset.
        cohort_args: mapping providing ``'RootID'``.
        rec_args: mapping providing ``'RecID'``, ``'RecDT'`` and, for the
            fallback, ``['FldTknInfo'][<RecName>-<FldName>Tkn]['value_cols']``.
        CaseTkn: accepted for interface compatibility; not used here.

    Returns:
        list: base columns followed by the selected field columns.
    """
    # Imported locally so the function stays self-contained when its source is
    # captured through `fn_string` and written to a standalone module.
    from recfldtkn.obsname import parse_RecObsName

    parsed = parse_RecObsName(ROName)  # parse once, reuse both parts
    RecName = parsed['RecName']
    FldName = parsed['FldName']
    base_columns = [cohort_args['RootID'], rec_args['RecID'], rec_args['RecDT']]

    # Substring match: any column whose name contains the field name.
    FldName_cols = [col for col in column_names if FldName in col]
    if FldName_cols:
        return base_columns + FldName_cols

    # No matching column: fall back to the value columns configured for the
    # field tokenizer.
    RecFldTknName = RecName + '-' + FldName + 'Tkn'
    val_cols = rec_args['FldTknInfo'][RecFldTknName]['value_cols']
    return base_columns + val_cols

get_selected_columns.fn_string = inspect.getsource(get_selected_columns)
###############################

In [None]:
# big question here
# if small phi is not applied in the record level
# how do we deploy the tools of small phi within this process?
get_selected_columns(ROName, column_names, cohort_args, rec_args, name_CasePhi)

In [None]:
print(ROName_List)
print(name_CasePhi)
print(get_selected_columns)

In [None]:
# RecObsName_to_RecObsInfo # RO_to_ROinfo

from recfldtkn.observer import get_RecObsName_to_RecObsInfo

record_to_ds_rec = {}        # set this to empty dictionary, then we will load data from disk
record_to_ds_rec_info = {}   # set this to empty dictionary, then we will load data from disk
ROName_to_ROInfo = get_RecObsName_to_RecObsInfo(ROName_List, 
                                                name_CasePhi, 
                                                get_selected_columns,
                                                cohort_args, 
                                                cohort_args['Ckpd_ObservationS'], 
                                                record_to_ds_rec, 
                                                record_to_ds_rec_info)
    
ROName = ROName_List[0] # Rx-Bf2M  
ROInfo = ROName_to_ROInfo[ROName] # RxInfo + R^Rx (with necessary columns) + Ckpd + FldTkn (phi)

In [None]:
[i for i in ROInfo]
# pprint(ROInfo)

In [None]:
ROInfo['ds_rec'] # get_selected_columns

## [Step 7] get_CO_vocab

In [None]:
def get_CO_vocab(RecObsName_to_RecObsInfo):
    """Build the token vocabulary of a case observation.

    Args:
        RecObsName_to_RecObsInfo: mapping RO name -> RO info; each info provides
            'RecName', 'FldName' (may be None) and 'FldIdx2Tkn'.

    Returns:
        dict: ``{'tid': {'tid2tkn': {...}, 'tkn2tid': {...}}}`` built from the
        first RO's 'FldIdx2Tkn' with the extra token 'NoObs' appended.

    Raises:
        AssertionError: if the ROs refer to more than one distinct
            '<RecName>-<FldName>' field.
    """
    # Distinct record-field names among the ROs that carry a field (phi).
    rec_fld_names = set()
    for info in RecObsName_to_RecObsInfo.values():
        if info['FldName'] is not None:
            rec_fld_names.add(info['RecName'] + '-' + info['FldName'])

    # Current version: a Phi works with at most one phi.
    assert len(rec_fld_names) <= 1

    # The vocabulary comes from the first RO; 'NoObs' takes the last index.
    first_name = list(RecObsName_to_RecObsInfo)[0]
    tokens = RecObsName_to_RecObsInfo[first_name]['FldIdx2Tkn'] + ['NoObs']

    tid2tkn = dict(enumerate(tokens))
    tkn2tid = {tkn: tid for tid, tkn in enumerate(tokens)}

    # 'wgt' vocabulary is not used in this version.
    return {'tid': {'tid2tkn': tid2tkn, 'tkn2tid': tkn2tid}}

get_CO_vocab.fn_string = inspect.getsource(get_CO_vocab)

In [None]:
CO_vocab = get_CO_vocab(ROName_to_ROInfo)
pprint(CO_vocab, sort_dicts=False)


## [Step 8] Fetch ds_rec

In [None]:
case_example = idx_to_examples[4]
case_example

In [None]:
ROName

In [None]:
pprint(idx_to_examples, sort_dicts=False)

In [None]:
for ROName, ROInfo in ROName_to_ROInfo.items():
    print(ROName, ':', [i for i in ROInfo])

In [None]:
# RO_to_ROdf: RecObsName_to_RecObsDS
from recfldtkn.observer import CaseObserverTransformer

get_Record_P = CaseObserverTransformer.get_Record_P
RecName_to_REC_P = get_Record_P(idx_to_examples, ROName_to_ROInfo)
for RecName, REC_P in RecName_to_REC_P.items():
    print(f'============ {RecName} ============')
    pprint(REC_P)

## [Step 9] Fetch ROdf: $R_i^{recname}$

In [None]:
get_idx_to_RecObsName_to_RecObsDS = CaseObserverTransformer.get_idx_to_RecObsName_to_RecObsDS
idx_to_ROName_to_ROds = get_idx_to_RecObsName_to_RecObsDS(idx_to_examples, ROName_to_ROInfo, RecName_to_REC_P)

for idx, case in idx_to_examples.items():
    print('\n====================')
    print('idx:', idx)
    print('case:', case)
    print('ROdf:', idx_to_ROName_to_ROds[idx]) # R_i^{RecNum, Ckpd}
    print('====================')

In [None]:
[i for i in ROName_to_ROInfo]

In [None]:
idx = 1
case_example = idx_to_examples[idx]
print('case_example:', case_example)
ROName_to_ROds = idx_to_ROName_to_ROds[idx] # RO_to_ROdf # RO_ij
print('RO_to_ROdf (RO_ij):', ROName_to_ROds)
print([i for i in ROName_to_ROds])

## [Step 10] Develop $\Phi$

In [None]:
ObsDTValue = case_example['ObsDT'] 
ObsDTValue

In [None]:
ROName_to_ROds

In [None]:
assert len(ROName_to_ROds) == 1 

In [None]:
RecObsName = [i for i in ROName_to_ROds][0] # RO
RecObsName


In [None]:
RecObsDS   = ROName_to_ROds[RecObsName]
RecObsDS

In [None]:
RecObsInfo = ROName_to_ROInfo[RecObsName] 
RecObsInfo

In [None]:
RecDT = RecObsInfo['rec_args']['RecDT'] 
RecDT

In [None]:
CkpdInfo = RecObsInfo['CkpdInfo']
CkpdInfo

In [None]:
StartIdx5Min = CkpdInfo['StartIdx5Min']
StartIdx5Min

In [None]:
RO_ds = RecObsDS
RO_ds

In [None]:
df = RO_ds.to_pandas()
df

In [None]:
df[RecDT]

In [None]:
ObsDTValue

In [None]:
case_example

In [None]:
df['5MinInCP'] = ((df[RecDT] - ObsDTValue).dt.total_seconds() / (60 * 5)).astype(int)
df

In [None]:
EndIdx5Min = CkpdInfo['EndIdx5Min']
new_index = range(StartIdx5Min, EndIdx5Min + 1)  # Include 24
new_index

In [None]:
desired_range_df = pd.DataFrame({'5MinInCP': new_index})
desired_range_df

In [None]:
df = pd.merge(df, desired_range_df, on='5MinInCP', how='right')
df

In [None]:
df = df.fillna(0)
df

In [None]:
columns = ['tknidx', '5MinInCP']
df = df.rename(columns = {i: i.split('Tkn_')[-1] for i in df.columns if 'Tkn_' in i})
df = df[columns]
df

In [None]:
EXPLODE_COLS = ['tknidx']  # 'TknInFld' included as it's okay even if not used in the model
EXPLODE_COLS

In [None]:
df = df.apply(lambda col: col.explode() if col.name in EXPLODE_COLS else col).reset_index(drop=True)
df

In [None]:
output = df.to_dict(orient='list')
output

In [None]:
CO = {'tid': output['tknidx']}

In [None]:
import inspect

##################################
def fn_CasePhi(case_example, ROName_to_ROds, ROName_to_ROInfo, CO_vocab):
    """Turn the record observation of one case into a case observation (CO).

    The records of the single RO are placed on a grid of 5-minute slots
    relative to the case's observation time. Every slot in the inclusive range
    [StartIdx5Min, EndIdx5Min] appears in the output; a slot without a record
    gets token index 0.

    Args:
        case_example: mapping holding the case's 'ObsDT' (observation datetime).
        ROName_to_ROds: mapping RO name -> RO dataset exposing ``to_pandas()``;
            exactly one entry is supported.
        ROName_to_ROInfo: mapping RO name -> RO info providing
            ``['rec_args']['RecDT']`` (name of the record datetime column) and
            ``['CkpdInfo']['StartIdx5Min']`` / ``['CkpdInfo']['EndIdx5Min']``.
        CO_vocab: accepted for interface compatibility; not used here.

    Returns:
        dict: ``{'tid': [...]}`` - token indices ordered by 5-minute slot.

    Raises:
        AssertionError: if ``ROName_to_ROds`` does not hold exactly one RO.
    """
    ObsDTValue = case_example['ObsDT']   # T_ij value

    # This Phi handles exactly one RO.
    assert len(ROName_to_ROds) == 1
    RecObsName = list(ROName_to_ROds)[0]            # RO
    RecObsDS = ROName_to_ROds[RecObsName]           # RO_ds (RecDT is not the case datetime)
    RecObsInfo = ROName_to_ROInfo[RecObsName]
    RecDT = RecObsInfo['rec_args']['RecDT']

    CkpdInfo = RecObsInfo['CkpdInfo']               # Ckpd
    StartIdx5Min = CkpdInfo['StartIdx5Min']
    EndIdx5Min = CkpdInfo['EndIdx5Min']

    # 1. Records of the RO as a DataFrame, e.g. CGM5Min_Bf24H with token indices.
    #    Expected layout: one row per 5-minute record, each tknidx cell a list
    #    holding a single token.
    df = RecObsDS.to_pandas()

    # 2. Slot index of each record relative to ObsDT, in units of 5 minutes.
    #    astype(int) truncates toward zero.
    df['5MinInCP'] = ((df[RecDT] - ObsDTValue).dt.total_seconds() / (60 * 5)).astype(int)

    # 3. Strip the '<...>Tkn_' prefix from token columns and keep only what is
    #    needed, so the fill below never touches datetime / ID columns.
    df = df.rename(columns={col: col.split('Tkn_')[-1] for col in df.columns if 'Tkn_' in col})
    df = df[['tknidx', '5MinInCP']]

    # 4. Right-merge onto the full slot range (both ends included) so that
    #    slots without a record are present as well.
    full_range_df = pd.DataFrame({'5MinInCP': range(StartIdx5Min, EndIdx5Min + 1)})
    df = pd.merge(df, full_range_df, on='5MinInCP', how='right')
    # NOTE(review): empty slots are filled with index 0, not with the 'NoObs'
    # token that get_CO_vocab appends to the vocabulary - confirm this is intended.
    df = df.fillna(0)

    # 5. Flatten the list-valued token column; DataFrame.explode keeps the
    #    other columns aligned row by row.
    df = df.explode('tknidx').reset_index(drop=True)
    output = df.to_dict(orient='list')

    CO = {'tid': output['tknidx']}
    # CaseObservation = {'tid': output['tknidx'],  '5MinInCP': output['5MinInCP']} # <-- in the future
    return CO
##################################

fn_CasePhi.fn_string = inspect.getsource(fn_CasePhi)

In [None]:
ROName_to_ROds

In [None]:
fn_CasePhi(case_example, ROName_to_ROds, ROName_to_ROInfo, CO_vocab)

In [None]:
CO_vocab

## Save Files

In [None]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Import lines handed to the converter as `prefix` for the generated module.
prefix = ['import pandas as pd', 'import numpy as np']
# The four functions developed in this notebook; each had its source captured
# in `.fn_string` via inspect.getsource when it was defined.
fn_variables = [get_CO_id, get_selected_columns, get_CO_vocab, fn_CasePhi]
# presumably concatenates the prefix and the captured function sources into one
# Python source string - verify against recfldtkn.loadtools
pycode = convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
# Target file: <CODE_FN>/fn_casephi/phi_<name_CasePhi>.py
# NOTE(review): the 'fn_casephi' directory is not created here - open() fails if it is missing.
pypath = os.path.join(SPACE['CODE_FN'], 'fn_casephi', f'phi_{name_CasePhi}.py')
with open(pypath, 'w') as file: file.write(pycode)

# Create a HTML link and display it
# os.path.join returns `pypath` unchanged when it is already absolute.
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Application

In [None]:
CaseObsName

In [None]:
from recfldtkn.observer import get_CaseObsInfo_for_a_CaseObsName

CaseObsInfo = get_CaseObsInfo_for_a_CaseObsName(CaseObsName,
                                                SPACE, 
                                                cohort_args, 
                                                record_to_ds_rec = {}, 
                                                record_to_ds_rec_info = {})
[i for i in CaseObsInfo]

In [None]:
import random 
from pprint import pprint
from datetime import datetime 
from recfldtkn.observer import get_RecObsName_to_RecObsInfo, CaseObserverTransformer


ro_to_ROName = {} # leave it empty for now 
ROName_to_ROInfo = CaseObsInfo['ROName_to_ROInfo']
name_CasePhi = CaseObsInfo['name_CasePhi']
fn_CasePhi = CaseObsInfo['fn_CasePhi']
CO_vocab = CaseObsInfo['CO_vocab']
get_CO_id = CaseObsInfo['get_CO_id']
CO_Folder = CaseObsInfo['CO_Folder']
df_case = None 
use_CO_from_disk = False
batch_size = CaseObsInfo.get('batch_size', 1000)

fn_caseobs_Phi = CaseObserverTransformer(ro_to_ROName,
                                         ROName_to_ROInfo, 
                                         name_CasePhi, 
                                         fn_CasePhi, 
                                         CO_vocab, 
                                         get_CO_id,
                                         CO_Folder, 
                                         df_case,
                                         use_CO_from_disk)

# Check Cache Functions

In [None]:
fn_caseobs_Phi.new_COs

In [None]:
start = datetime.now()
ds_casetkn = ds_case.map(fn_caseobs_Phi, 
                         batched = True, 
                         batch_size= batch_size, 
                         load_from_cache_file = False, 
                         new_fingerprint = CaseObsName)
end = datetime.now()
print('Elipse Time: ', end - start)
print(ds_casetkn)

In [None]:
print(len(fn_caseobs_Phi.new_COs))

In [None]:
start = datetime.now()
ds_casetkn = ds_case.map(fn_caseobs_Phi, 
                         batched = True, 
                         batch_size= batch_size, 
                         load_from_cache_file = False, 
                         new_fingerprint = CaseObsName)
end = datetime.now()
print('Elipse Time: ', end - start)
print(ds_casetkn)

In [None]:
print(CaseObsName)
# Spot-check one random row. random.randint includes BOTH endpoints, so
# randint(0, len(ds_casetkn)) could return len(ds_casetkn) and index one past
# the end; randrange excludes the stop value and always yields a valid index.
random_int = random.randrange(len(ds_casetkn))
print(random_int)
pprint(ds_casetkn[random_int])
print(CO_vocab)

In [None]:
ds_casetkn[4]

## Save

In [None]:
fn_caseobs_Phi.df_CO_info

In [None]:
CO_Folder_data = fn_caseobs_Phi.CO_Folder_data 
CO_Folder_data

In [None]:
fn_caseobs_Phi.save_new_COs_to_disk(CO_Folder_data)

In [None]:
fn_caseobs_Phi.df_CO_info

In [None]:
fn_caseobs_Phi.ds_CO_data

# Save Vocab

In [None]:
CO_vocab = fn_caseobs_Phi.CO_vocab
CO_vocab

In [None]:
CO_Folder_vocab = fn_caseobs_Phi.CO_Folder_vocab
CO_Folder_vocab

In [None]:
df_Vocab = pd.DataFrame({CaseObsName: CO_vocab})
df_Vocab.to_pickle(CO_Folder_vocab)
df_Vocab