# Space

In [None]:
import os
import sys 
import logging
import random
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML

# WorkSpace
# Locate the workspace root: cut the current working directory at the first
# occurrence of KEY and re-append KEY, then make that directory both the
# process cwd and an import root.
# NOTE(review): if KEY is absent from os.getcwd(), split() returns the whole
# path unchanged and KEY is appended straight onto it; nothing validates the
# result before os.chdir — confirm the notebook always runs below a KEY folder.
KEY = 'WorkSpace'; WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY; print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
sys.path.append(WORKSPACE_PATH)

# Pipeline Space
# SPACE is imported from proj_space and indexed by key throughout this
# notebook. The workspace path is recorded in it, and SPACE['CODE_FN'] is added
# to sys.path — presumably so the project modules imported below resolve; confirm.
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
sys.path.append(SPACE['CODE_FN'])
# pprint(SPACE)

# Available Packages
import argparse
import datasets
import pandas as pd
from datetime import datetime 


from recfldtkn.configfn import load_cohort_args
from recfldtkn.loadtools import load_module_variables, update_args_to_list
from recfldtkn.observer import get_RecObsName_to_RecObsInfo, CaseObserverTransformer
from config_observer.CKPD import ckpd_to_CkpdObsConfig

logger = logging.getLogger(__name__)
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')


# [Part 1] Get a Caseset: Case Examples

## [Step 1] Trigger Event

In [None]:
from recfldtkn.loadtools import fetch_TriggerEvent_tools

####################
# Name of the trigger event; it is reused below as CaseSetName, i.e. the file
# stem of the case-set pickle that gets loaded.
TriggerCaseMethod = 'TrulicityRx'
####################

# The returned mapping is read below for 'case_id_columns' and 'special_columns'.
Trigger_tools = fetch_TriggerEvent_tools(TriggerCaseMethod, SPACE)
Trigger_tools 

In [None]:
##################################
CaseSetName = TriggerCaseMethod
case_id_columns = Trigger_tools['case_id_columns']
special_columns = Trigger_tools['special_columns']
##################################

# The case set is stored as '<CaseSetName>.p' under SPACE['DATA_CaseSet'].
TriggerCasePath = os.path.join(SPACE['DATA_CaseSet'], f'{CaseSetName}.p')
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

# Extend the loaded cohort arguments with the settings used by this notebook.
cohort_args['ckpd_to_CkpdObsConfig'] = ckpd_to_CkpdObsConfig
cohort_args['case_id_columns'] = case_id_columns
cohort_args['ObsDTName'] = 'ObsDT'
cohort_args['PID_ObsDT_columns'] = [cohort_args['RootID'], cohort_args['ObsDTName']]

print(cohort_args)
print(TriggerCasePath)

# NOTE: unpickling can execute arbitrary code; only load case sets produced by
# this pipeline.
df_case = pd.read_pickle(TriggerCasePath)
# Reproducible subsample of at most 1000 cases. Capping n at len(df_case)
# avoids the ValueError that DataFrame.sample raises (without replacement)
# when the case set holds fewer than 1000 rows.
df_case = df_case.sample(n=min(1000, len(df_case)), random_state=0).reset_index(drop=True)
ds_case = datasets.Dataset.from_pandas(df_case)
ds_case

## [Step 2] Generate Arguments

In [None]:
#################################
# case_observations = [
#     'PDemo:ro.P-Demo_ct.InCaseInfo',
#     'PZip3Demo:ro.P-Zip3DemoNume_ct.InCaseInfo',
#     'RxInCase6:ro.Rx-ObsPnt-SysCate_ct.InCaseInfo',
#     'RxObsPntNum:ro.Rx-ObsPnt_ct.RecNum',
# ]

# Case observations combined into this case feature. Each entry is written as
# '<co>:ro.<...>_ct.<...>'; the strings are parsed further down by
# convert_case_observations_to_co_to_observation.
case_observations = [
    'Af1wClick:ro.EgmClick-Af1Wlft_ct.InvEgmInfo',
    'Af1wAuthen:ro.EgmAuthen-Af1Wlft_ct.InvEgmInfo',
    
    'Af1wCallPhm:ro.EgmCallPharm-Af1Wlft_ct.InvEgmInfo',
    
    'Af1wEdu:ro.EgmEdu-Af1Wlft_ct.InvEgmInfo',
    'Af1wRmd:ro.EgmRmd-Af1Wlft_ct.InvEgmInfo',
    'Af1wCpy:ro.EgmCopay-Af1Wlft_ct.InvEgmInfo',
]
# Name of the Gamma function developed in this notebook; it is used to build
# the case-feature name and the file name of the generated module
# (gamma_<name_CaseGamma>.py).
name_CaseGamma = 'CatUnseqTknsOneTS'
#################################

In [None]:
from pprint import pprint 
from recfldtkn.obsname import convert_case_observations_to_co_to_observation
from recfldtkn.obsname import get_RecNameList_and_FldTknList

co_to_COName, co_to_CONameInfo = convert_case_observations_to_co_to_observation(case_observations)
co_to_COName

In [None]:
# check whether this information is ready. 
PipelineInfo = get_RecNameList_and_FldTknList(co_to_CONameInfo, ckpd_to_CkpdObsConfig)
pprint(PipelineInfo, sort_dicts=False)

In [None]:
# Case-observation names, in the insertion order of co_to_COName.
COName_List = list(co_to_COName.values())
COName_List

In [None]:
from recfldtkn.obsname import convert_CONameList_to_CFName

CaseFeatName = convert_CONameList_to_CFName(COName_List, name_CaseGamma)
print(CaseFeatName)

In [None]:
# part of fetch_caseobs_Phi_tools
CF_Folder = os.path.join(SPACE['DATA_CaseFeat'], CaseFeatName)
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of testing os.path.exists first.
os.makedirs(CF_Folder, exist_ok=True)
print(CF_Folder)

# [Part 2] Tools for ds_case_obs

## [Step 1] Prepare examples

In [None]:
case_examples = ds_case[:5]
print(case_examples)

In [None]:
# Pivot the column-oriented batch (column name -> list of values) into one
# dict per case, then key those dicts by their position in the batch.
first_column = list(case_examples.keys())[0]
length = len(case_examples[first_column])
case_examples_list = [
    {column: values[row] for column, values in case_examples.items()}
    for row in range(length)
]
idx_to_examples = dict(enumerate(case_examples_list))
pprint(idx_to_examples, sort_dicts=False)

In [None]:
case_example = idx_to_examples[0]
case_example

## [Step 2] get_CF_id

In [None]:
# check whether this information is ready. 
PipelineInfo = get_RecNameList_and_FldTknList(co_to_CONameInfo, ckpd_to_CkpdObsConfig)
pprint(PipelineInfo, sort_dicts=False)

In [None]:
CaseFeatName

In [None]:
case_id_columns

In [None]:
case_id_columns

In [None]:
import inspect
#################################################
def get_CF_id(case_example, case_id_columns, cohort_args):
    """Build the string identifier of one case.

    The id is the '_'-joined sequence of '<column>:<value>' pairs, one pair per
    case-id column, in column order.

    Args:
        case_example: mapping from column name to value for a single case.
        case_id_columns: fallback list of id columns; used only when
            ``cohort_args`` has no 'case_id_columns' entry.
        cohort_args: cohort configuration. Its 'case_id_columns' entry, when
            present, takes precedence (e.g. ['PID', 'ObsDT', 'PInvID', 'RxID']).

    Returns:
        The id string, e.g. 'PID:1_ObsDT:2020-01-01'.

    Raises:
        KeyError: if a selected id column is missing from ``case_example``.
    """
    # cohort_args stays authoritative, exactly as before. The argument is now a
    # fallback instead of being overwritten unconditionally, so a cohort_args
    # without the entry no longer raises KeyError.
    case_id_columns = cohort_args.get('case_id_columns', case_id_columns)

    parts = [col + ':' + str(case_example[col]) for col in case_id_columns]
    return '_'.join(parts)

get_CF_id.fn_string = inspect.getsource(get_CF_id)
#################################################

In [None]:
get_CF_id(case_example, case_id_columns, cohort_args)

## [Step 3] get_CaseObsName_to_CaseObsInfo

In [None]:
from recfldtkn.observer import get_CaseObsName_to_CaseObsInfo

record_to_ds_rec = {}        # set this to empty dictionary, then we will load data from disk
record_to_ds_rec_info = {}   # set this to empty dictionary, then we will load data from disk
COName_to_COInfo = get_CaseObsName_to_CaseObsInfo(COName_List,
                                                  SPACE, 
                                                  cohort_args, 
                                                  record_to_ds_rec, 
                                                  record_to_ds_rec_info)

In [None]:
# List the keys available in each case-observation info dict.
# NOTE: CaseObsName stays bound to the last key once the loop ends and is read
# again further down the notebook.
for CaseObsName, CaseObsInfo in COName_to_COInfo.items():
    print(CaseObsName)
    print([i for i in CaseObsInfo])

In [None]:
COName_to_co = {v: k for k, v in co_to_COName.items()}
COName_to_co

In [None]:
co_to_COvocab = {COName_to_co[COName]: CaseObsInfo['CO_vocab'] for COName, CaseObsInfo in COName_to_COInfo.items()}

for co, vocab in co_to_COvocab.items():
    print(co)
    print(vocab)

## [Step 3] get_CF_vocab

In [None]:
##################################
def get_CF_vocab(co_to_COvocab):
    """Assemble the case-feature vocabulary from the per-observation vocabularies.

    The token list starts with seven special tokens. After them come the
    tokens of every case observation, in the iteration order of
    ``co_to_COvocab`` and of each observation's 'tid2tkn' mapping, each token
    prefixed with '<co>:'. A token's position in that list is its id.

    Args:
        co_to_COvocab: mapping co -> vocabulary dict; only
            ``vocab['tid']['tid2tkn']`` is read.

    Returns:
        {'input_ids': {'tid2tkn': ..., 'tkn2tid': ...}, 'input_wgts': {}}.
    """
    tokens = ['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]', '[BOS]', '[EOS]']
    for co in co_to_COvocab:
        tid2tkn_co = co_to_COvocab[co]['tid']['tid2tkn']
        # Namespace each token with its observation so names never collide.
        tokens = tokens + [co + ':' + tkn for _, tkn in tid2tkn_co.items()]

    return {
        'input_ids': {
            'tid2tkn': dict(enumerate(tokens)),
            'tkn2tid': {tkn: tid for tid, tkn in enumerate(tokens)},
        },
        'input_wgts': {},
    }
##################################

get_CF_vocab.fn_string = inspect.getsource(get_CF_vocab)

In [None]:
CF_vocab = get_CF_vocab(co_to_COvocab)
for SeqType, vocab in CF_vocab.items():
    print(SeqType, ':', vocab)
    print('\n')

## [Step 4] Fetch $o_{ij}$ List

In [None]:
pprint(idx_to_examples, sort_dicts=False)

In [None]:
# Build one CaseObserverTransformer per case observation, keyed by COName.
# Every transformer receives the same cohort_args, df_case and
# use_CO_from_disk flag; the remaining arguments come from that observation's
# COInfo entry.
use_CO_from_disk = False
COName_to_FnCaseObsPhi = {}
for COName, COInfo in COName_to_COInfo.items():
    # NOTE(review): ro_to_ROName is passed as None — presumably the transformer
    # works from ROName_to_ROInfo alone; confirm against CaseObserverTransformer.
    ro_to_ROName = None 
    FnCaseObsPhi = CaseObserverTransformer(ro_to_ROName,
                                            COInfo['ROName_to_ROInfo'], 
                                            COInfo['name_CasePhi'], 
                                            COInfo['fn_CasePhi'], 
                                            COInfo['CO_vocab'], 
                                            COInfo['get_CO_id'],
                                            cohort_args,
                                            COInfo['CO_Folder'], 
                                            df_case,
                                            use_CO_from_disk)
    COName_to_FnCaseObsPhi[COName] = FnCaseObsPhi

In [None]:
from recfldtkn.observer import CaseFeatureTransformer

# The method is taken off the class and called as a plain function, without a
# CaseFeatureTransformer instance.
fetch_examples_with_complete_COs = CaseFeatureTransformer.fetch_examples_with_complete_COs

# NOTE: this flag is re-assigned here but is not passed to the call below.
use_CO_from_disk = False 
results = fetch_examples_with_complete_COs(idx_to_examples, 
                                            COName_to_co, 
                                            COName_to_COInfo, 
                                            COName_to_FnCaseObsPhi)

# The returned pair rebinds both inputs, so re-running this cell feeds it its
# own previous output.
# NOTE(review): presumably the examples come back with the '<co>_tid' /
# '<co>_wgt' entries that fn_CaseGamma reads — confirm.
idx_to_examples, COName_to_FnCaseObsPhi = results

In [None]:
for i, example in idx_to_examples.items():
    print(i)
    print(example)

## [Step 3]* Develop $\Gamma$. (To Dev)

In [None]:
idx = 1
case_example = idx_to_examples[idx]
# print('case_example:', case_example)
pprint(case_example, compact= True, sort_dicts=False)

In [None]:
##################################
def fn_CaseGamma(case_example, co_to_COvocab, CF_vocab, cohort_args):
    """Merge the per-observation token sequences of one case into one bag.

    For every case observation ``co`` in ``co_to_COvocab`` the example must
    carry ``co + '_tid'`` (token ids in that observation's vocabulary) and
    ``co + '_wgt'`` (the matching weights). Each id is translated to its token,
    prefixed with ``co + ':'`` and re-indexed through the case-feature
    vocabulary. The sequences are concatenated in observation order. When a
    case-feature id occurs more than once, the last weight wins while the id
    keeps its first position. Ids whose final weight is not positive are dropped.

    Args:
        case_example: mapping holding the '<co>_tid' / '<co>_wgt' lists.
        co_to_COvocab: mapping co -> vocabulary; ``['tid']['tid2tkn']`` is read.
        CF_vocab: case-feature vocabulary; ``['input_ids']['tkn2tid']`` is read.
        cohort_args: not read by this function.

    Returns:
        dict with aligned lists 'input_ids' and 'input_wgts'.
    """
    tkn_to_CFtid = CF_vocab['input_ids']['tkn2tid']

    merged_tids = []
    merged_wgts = []
    for co in co_to_COvocab:
        vocab_co = co_to_COvocab[co]
        tids_co = case_example[co + '_tid']
        wgts_co = case_example[co + '_wgt']
        # observation-local id -> token -> namespaced token -> case-feature id
        tkns_co = [co + ':' + vocab_co['tid']['tid2tkn'][tid] for tid in tids_co]
        merged_tids = merged_tids + [tkn_to_CFtid[tkn] for tkn in tkns_co]
        merged_wgts = merged_wgts + wgts_co

    # A repeated id overwrites its weight but keeps its original slot.
    wgt_by_tid = {}
    for tid, wgt in zip(merged_tids, merged_wgts):
        wgt_by_tid[tid] = wgt

    # version 1: for DL and ML
    kept = [(tid, wgt) for tid, wgt in wgt_by_tid.items() if wgt > 0]
    return {
        'input_ids': [tid for tid, _ in kept],
        'input_wgts': [wgt for _, wgt in kept],
    }
##################################

fn_CaseGamma.fn_string = inspect.getsource(fn_CaseGamma)

In [None]:
result = fn_CaseGamma(case_example, co_to_COvocab, CF_vocab, cohort_args)
for SeqType, SeqValue in result.items():
    print(SeqType, ':', len(SeqValue), SeqValue)
    print('\n')

In [None]:
CF_vocab

## Save to Files

In [None]:
from recfldtkn.loadtools import convert_variables_to_pystirng, load_module_variables

# Serialize the three functions developed above into one generated module.
# NOTE(review): `prefix` presumably becomes the import header of the generated
# file — confirm against convert_variables_to_pystirng.
prefix = ['import pandas as pd', 'import numpy as np']
fn_variables = [get_CF_id, get_CF_vocab, fn_CaseGamma]
pycode = convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
pypath = os.path.join(SPACE['CODE_FN'], 'fn_casegamma', f'gamma_{name_CaseGamma}.py')
# Create the target folder first; open(..., 'w') does not create missing
# parent directories and would fail with FileNotFoundError.
os.makedirs(os.path.dirname(pypath), exist_ok=True)
with open(pypath, 'w') as file:
    file.write(pycode)

# Create a HTML link and display it
full_path = os.path.join(WORKSPACE_PATH, pypath)
display(HTML(f'{pypath} <a href="{full_path}" target="_blank">Open File</a>'))

# Application

In [None]:
CaseFeatName

In [None]:
from recfldtkn.observer import get_CaseFeatInfo_for_a_CaseFeatName


record_to_ds_rec = {}
record_to_ds_rec_info = {}

CaseFeatInfo = get_CaseFeatInfo_for_a_CaseFeatName(name_CaseGamma,
                                                   case_observations,
                                                   SPACE, 
                                                   cohort_args, 
                                                   record_to_ds_rec, 
                                                   record_to_ds_rec_info)
[i for i in CaseFeatInfo]

In [None]:
import random 
from pprint import pprint
from datetime import datetime 
from recfldtkn.observer import CaseFeatureTransformer

In [None]:

# Unpack CaseFeatInfo into the arguments of CaseFeatureTransformer.
# NOTE: this rebinds name_CaseGamma, fn_CaseGamma, CF_vocab, get_CF_id and
# CF_Folder, replacing the objects of the same names defined earlier in the
# notebook with the ones returned by get_CaseFeatInfo_for_a_CaseFeatName.
co_to_COName = CaseFeatInfo['co_to_COName']
COName_to_COInfo = CaseFeatInfo['COName_to_COInfo']
name_CaseGamma = CaseFeatInfo['name_CaseGamma']
fn_CaseGamma = CaseFeatInfo['fn_CaseGamma']
CF_vocab = CaseFeatInfo['CF_vocab']
get_CF_id = CaseFeatInfo['get_CF_id']
CF_Folder = CaseFeatInfo['CF_Folder']
df_case = ds_case.to_pandas()
use_CF_from_disk = False
use_CO_from_disk = False
# Not passed to the transformer; the map cells below assign their own
# batch_size before using it.
batch_size = CaseFeatInfo.get('batch_size', 1000)

FnCaseFeatGamma = CaseFeatureTransformer(co_to_COName,
                                          COName_to_COInfo, 
                                          name_CaseGamma, 
                                          fn_CaseGamma, 
                                          CF_vocab, 
                                          get_CF_id,
                                          cohort_args,
                                          CF_Folder, 
                                          df_case,
                                          use_CF_from_disk,
                                          use_CO_from_disk)

# Check Cache Functions

In [None]:
FnCaseFeatGamma.new_CFs

In [None]:
# Time one full pass of the case-feature transformer over ds_case, applied in
# batches of batch_size examples. load_from_cache_file=False tells datasets not
# to reuse a previously cached map result, and new_fingerprint labels the
# resulting dataset with CaseFeatName.
start = datetime.now()
batch_size = 100
ds_casetkn = ds_case.map(FnCaseFeatGamma, 
                         batched = True, 
                         batch_size = batch_size, 
                         load_from_cache_file = False, 
                         new_fingerprint = CaseFeatName)
end = datetime.now()
print('Elipse Time: ', end - start)
print(ds_casetkn)

In [None]:
# Same timed map call as the previous cell, executed a second time.
# NOTE(review): presumably repeated to compare timings once FnCaseFeatGamma has
# already processed these cases (section title: "Check Cache Functions") —
# confirm; the datasets-level cache is disabled by load_from_cache_file=False.
start = datetime.now()
batch_size = 100
ds_casetkn = ds_case.map(FnCaseFeatGamma, 
                         batched = True, 
                         batch_size = batch_size, 
                         load_from_cache_file = False, 
                         new_fingerprint = CaseFeatName)
end = datetime.now()
print('Elipse Time: ', end - start)
print(ds_casetkn)

In [None]:
# Inspect one randomly chosen transformed case together with the vocabulary.
# Print the case-feature name rather than CaseObsName, which is only bound as a
# leftover loop variable of an earlier cell.
print(CaseFeatName)
# randrange excludes its upper bound. randint(0, len(ds_casetkn)) includes it
# and could therefore raise IndexError on the lookup below.
random_int = random.randrange(len(ds_casetkn))
print(random_int)
pprint(ds_casetkn[random_int])
print(CF_vocab)

In [None]:
ds_casetkn[8]

# Save

In [None]:
FnCaseFeatGamma.df_CF_info

In [None]:
CF_Folder_data = FnCaseFeatGamma.CF_Folder_data
CF_Folder_data

In [None]:
FnCaseFeatGamma.save_new_CFs_to_disk(CF_Folder_data)

In [None]:
FnCaseFeatGamma.df_CF_info

In [None]:
FnCaseFeatGamma.ds_CF_data

In [None]:
CFids = FnCaseFeatGamma.CFids
ds_CF_data, df_CF_info = FnCaseFeatGamma.load_CFs_from_disk(CF_Folder_data, CFids)

In [None]:
ds_CF_data

In [None]:
df_CF_info

# Save Vocab

In [None]:
CF_vocab = FnCaseFeatGamma.CF_vocab
CF_vocab

In [None]:
CF_Folder_vocab = FnCaseFeatGamma.CF_Folder_vocab
CF_Folder_vocab

In [None]:
# Persist the case-feature vocabulary as a one-column DataFrame pickle at
# CF_Folder_vocab.
# NOTE(review): the column label CaseObsName is not assigned in this section;
# it is only bound as the loop variable of an earlier cell, so it holds the
# last case-observation name. CaseFeatName may be the intended label — confirm
# against whatever reads CF_Folder_vocab before changing it.
df_Vocab = pd.DataFrame({CaseObsName: CF_vocab})
df_Vocab.to_pickle(CF_Folder_vocab)
df_Vocab