
# Space

In [None]:
import os
import sys 
import logging
import pandas as pd 
from pprint import pprint 

# WorkSpace
# Resolve the workspace root as the part of the current working directory up
# to and including the first occurrence of KEY, then run everything from there.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard str.split() returns the path unchanged and the code
    # would build a bogus '<cwd>WorkSpace' path that only fails later in chdir.
    raise RuntimeError(
        f'Cannot locate the workspace root: {KEY!r} is not part of the current directory {_cwd!r}'
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
if WORKSPACE_PATH not in sys.path:
    # Guarded so that re-running the cell does not pile up duplicate entries.
    sys.path.append(WORKSPACE_PATH)

# Pipeline Space
# proj_space is presumably importable because of the sys.path entry above — TODO confirm.
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
pprint(SPACE)

# Available Packages
import pandas as pd
from datetime import datetime 

# Module-level logger used by the helper functions and cells below.
# NOTE(review): no handler or level is configured in the visible cells, so the
# logger.info(...) messages only show up if logging is configured elsewhere — confirm.
logger = logging.getLogger(__name__)
# Path of the 'config_recfldtkn/' directory under SPACE['CODE_FN']; passed to load_cohort_args below.
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')


# Dataset

In [None]:
import datasets
from recfldtkn.loadtools import load_ds_rec_and_info
from recfldtkn.configfn import load_cohort_args, load_record_args

# base_config is whatever load_cohort_args returns for the config directory and SPACE;
# it is handed to the record loaders and case-building helpers in the cells below.
base_config = load_cohort_args(recfldtkn_config_path, SPACE)
print(base_config)

In [None]:
from recfldtkn.loadtools import fetch_trigger_tools
from recfldtkn.loadtools import fetch_casetag_tools, fetch_casefilter_tools


def get_Trigger_Cases(TriggerCaseMethod, 
                      cohort_label_list, 
                      base_config, 
                      SPACE, 
                      RecName_to_dsRec = None,
                      RecName_to_dsRecInfo = None):
    """Build the trigger case set for one trigger method.

    The trigger tools are looked up with ``fetch_trigger_tools``; the trigger
    record dataset is taken from ``RecName_to_dsRec`` when it is cached there,
    otherwise it is loaded with ``load_ds_rec_and_info``. The dataset is then
    converted with the tool's ``convert_TriggerEvent_to_Caseset`` function.

    Args:
        TriggerCaseMethod: name of the trigger method passed to ``fetch_trigger_tools``.
        cohort_label_list: cohort labels forwarded to ``load_ds_rec_and_info``.
        base_config: configuration forwarded to the loader and the converter.
        SPACE: pipeline space forwarded to ``fetch_trigger_tools``.
        RecName_to_dsRec: optional mapping from record name to an already
            loaded record dataset. It is only read, never modified.
        RecName_to_dsRecInfo: accepted for interface compatibility; currently unused.

    Returns:
        Whatever ``convert_TriggerEvent_to_Caseset`` returns (the case table).
    """
    # None instead of a shared mutable `{}` default.
    if RecName_to_dsRec is None:
        RecName_to_dsRec = {}

    Trigger_Tools = fetch_trigger_tools(TriggerCaseMethod, SPACE)
    case_id_columns = Trigger_Tools['case_id_columns']
    special_columns = Trigger_Tools['special_columns']
    TriggerRecName = Trigger_Tools['TriggerRecName']
    convert_TriggerEvent_to_Caseset = Trigger_Tools['convert_TriggerEvent_to_Caseset']

    if TriggerRecName in RecName_to_dsRec:
        # Reuse the dataset supplied by the caller.
        ds_rec = RecName_to_dsRec[TriggerRecName]
    else:
        # The second element returned by the loader is not needed here.
        ds_rec, _ = load_ds_rec_and_info(TriggerRecName, base_config, cohort_label_list)

    return convert_TriggerEvent_to_Caseset(ds_rec, case_id_columns, special_columns, base_config)

In [None]:
def convert_TriggerCases_to_LearningCases(df_case, 
                                          cohort_label_list,
                                          Trigger2LearningMethods, 
                                          base_config, 
                                          use_inference,
                                          SPACE = None):
    """Apply the configured Tag / Filter steps to the trigger cases, in order.

    Args:
        df_case: case table to process; it must expose ``.shape`` when a
            'Filter' step runs (the shape is logged before and after).
        cohort_label_list: cohort labels forwarded to ``load_ds_rec_and_info``
            when a tag tool declares an ``InfoRecName``.
        Trigger2LearningMethods: list of dicts with the keys ``'op'``
            ('Tag' or 'Filter'), ``'Name'`` (tool name) and ``'type'``.
            Steps typed 'learning-only' are skipped when ``use_inference`` is
            truthy; steps typed 'inference-only' are skipped otherwise.
        base_config: configuration forwarded to the loader and the tag function.
        use_inference: truthy for inference mode, falsy for learning mode.
        SPACE: pipeline space handed to the tool fetchers. When omitted, the
            notebook-level ``SPACE`` is used, which keeps existing calls working.

    Returns:
        The case table after all applicable steps have been applied.

    Raises:
        ValueError: if a step has an ``'op'`` other than 'Tag' or 'Filter'.
        NameError: if steps must run but ``SPACE`` was neither passed nor
            defined at notebook level.
    """
    # Keep only the steps that apply to the current mode.
    skipped_type = 'learning-only' if use_inference else 'inference-only'
    methods = [m for m in Trigger2LearningMethods if m['type'] != skipped_type]

    if SPACE is None and methods:
        # Backward compatibility: older callers relied on the notebook-global SPACE.
        try:
            SPACE = globals()['SPACE']
        except KeyError:
            raise NameError("SPACE was not passed and no notebook-level SPACE is defined") from None

    for method in methods:
        op = method['op']
        if op == 'Tag':
            name = method['Name']
            logger.info(f'CaseTag: {name}')
            CaseTag_Tools = fetch_casetag_tools(name, SPACE)

            subgroup_columns = CaseTag_Tools['subgroup_columns']
            # Some tag tools need an extra record dataset; load it only when declared.
            if 'InfoRecName' in CaseTag_Tools:
                InfoRecName = CaseTag_Tools['InfoRecName']
                ds_info, _ = load_ds_rec_and_info(InfoRecName, base_config, cohort_label_list)
            else:
                ds_info = None

            fn_case_tagging = CaseTag_Tools['fn_case_tagging']
            df_case = fn_case_tagging(df_case, ds_info, subgroup_columns, base_config)

        elif op == 'Filter':
            name = method['Name']
            logger.info(f'CaseFilter: {name}')
            CaseFilter_Tools = fetch_casefilter_tools(name, SPACE)
            fn_case_filtering = CaseFilter_Tools['fn_case_filtering']

            logger.info(f'Before Filter: {df_case.shape}')
            df_case = fn_case_filtering(df_case)
            logger.info(f'After Filter: {df_case.shape}')

        else:
            raise ValueError(f'Unknown method: {method}')

    return df_case

# TriggerCaseMethod

In [None]:
# Mode switch: use_inference is always the opposite of use_learning.
use_learning = True 
use_inference = not use_learning

######################################
# Trigger method name; passed to get_Trigger_Cases, which resolves it via fetch_trigger_tools.
TriggerCaseMethod = 'TrulicityRx'
# Cohort labels forwarded to load_ds_rec_and_info.
cohort_label_list = [1]
# Ordered steps applied by convert_TriggerCases_to_LearningCases. Each entry has:
#   'op'   - 'Tag' or 'Filter' (any other value raises ValueError there)
#   'Name' - tool name resolved by fetch_casetag_tools / fetch_casefilter_tools
#   'type' - 'learning-only' steps are skipped in inference mode,
#            'inference-only' steps are skipped in learning mode
Trigger2LearningMethods = [
    {'op':'Tag',    'Name': 'TagPttBasicInfo', 'type': 'learning-only'},
    {'op':'Filter', 'Name': 'FilterBasicPRx',  'type': 'learning-only'},
]
######################################
######################################

In [None]:
# Display the derived flag to confirm which mode the cells below run in.
use_inference

In [None]:
# Both caches start empty, so get_Trigger_Cases loads the trigger record itself.
RecName_to_dsRec, RecName_to_dsRecInfo = {}, {}

# Step 1: build the trigger cases.
df_case = get_Trigger_Cases(
    TriggerCaseMethod=TriggerCaseMethod,
    cohort_label_list=cohort_label_list,
    base_config=base_config,
    SPACE=SPACE,
    RecName_to_dsRec=RecName_to_dsRec,
    RecName_to_dsRecInfo=RecName_to_dsRecInfo,
)

# Step 2: apply the configured Tag / Filter steps, logging the shape on both sides.
logger.info(f'Before: {df_case.shape}')
df_case = convert_TriggerCases_to_LearningCases(
    df_case=df_case,
    cohort_label_list=cohort_label_list,
    Trigger2LearningMethods=Trigger2LearningMethods,
    base_config=base_config,
    use_inference=use_inference,
)
logger.info(f'After: {df_case.shape}')

# Split

In [None]:
df_case # df_case_learning: the case table after the Tag / Filter steps, shown before split tags are assigned

In [None]:
from recfldtkn.pipeline_model import generate_random_tags, assign_caseSplitTag_to_dsCaseLearning

##############
# Split settings; all five are passed to assign_caseSplitTag_to_dsCaseLearning in the next cell.
RANDOM_SEED = 42
downsample_ratio = 1 # 1 (don't drop any case), 0.1 (drop 90% of cases of one patient).
out_ratio = 0 # hold-out patients
# NOTE(review): despite the name, test_ratio holds a date-like string here; the trailing
# comment lists other forms the author used. How each form is interpreted is decided
# inside assign_caseSplitTag_to_dsCaseLearning, which is not visible here — confirm.
test_ratio = '2023.10.15'#  'tail0.1' # '0.1'
valid_ratio = 0.1 
##############

# Identifier string that encodes every split setting above.
# NOTE(review): the separators are uneven ('-' before 'ds' and 'out', none before 'ts'
# and 'vd'); left unchanged in case the string is used as a key or path elsewhere.
SplitMethod = f'rs{RANDOM_SEED}-ds{downsample_ratio}-out{out_ratio}ts{test_ratio}vd{valid_ratio}' 
print(SplitMethod)

In [None]:
# Run the split assignment on the case table and log its shape before and after.
# NOTE(review): df_case is overwritten with the result, so re-running only this cell
# feeds an already-processed table back into the split function; the effect of that
# is not visible here — re-run from the trigger-case cell to be safe.
logger.info('Generate Random Tags for Downsample, In/Out, and Train/Test/Valid Split')
logger.info(f'SplitMethod: {SplitMethod}')
logger.info(f'Before Split: {df_case.shape}')
df_case = assign_caseSplitTag_to_dsCaseLearning(df_case, 
                                                RANDOM_SEED, 
                                                downsample_ratio, out_ratio, 
                                                test_ratio, valid_ratio)
logger.info(f'After Split: {df_case.shape}')
# Preview the first rows of the result.
df_case.head()

In [None]:
# Split tag columns summarised in the report below.
SPLIT_TAGS = ['In', 'Out', 'Train', 'Valid', 'Test']


def _print_split_summary(df, title):
    """Print the record count, the mean of each split tag column, and the validation share.

    Args:
        df: table containing the columns listed in SPLIT_TAGS.
        title: label printed in front of the record count.
    """
    print(f'\n{title}---> recnum {len(df)}')
    for tag in SPLIT_TAGS:
        print(tag, df[tag].mean())
    n_valid = df['Valid'].sum()
    n_train = df['Train'].sum()
    # The label now matches the quantity that is computed: Valid / (Train + Valid).
    # The previous label said 'Valid/(Train+Test)' although 'Test' was never used.
    print('Valid/(Train+Valid):', n_valid / (n_train + n_valid))


# Whole case table first.
_print_split_summary(df_case, 'total')

# Then one report per subgroup value. An earlier, overwritten version of this list
# also contained 'patient_zipcode_3'.
subtype_list = ['patient_gender', 'patient_age_bucket']
for subtype in subtype_list:
    for subname, df_sub in df_case.groupby(subtype):
        _print_split_summary(df_sub, subname)