
# Space

In [None]:
import os
import sys 
import logging
import pandas as pd 
from pprint import pprint 

# WorkSpace
# The workspace root is the part of the current working directory up to and
# including the first occurrence of KEY.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this check the path would silently become "<cwd>WorkSpace" and
    # os.chdir would fail with a message that does not explain the real cause.
    raise FileNotFoundError(
        f"Cannot locate the workspace root: {KEY!r} is not part of the current directory {_cwd!r}"
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
print(WORKSPACE_PATH)
os.chdir(WORKSPACE_PATH)
# Re-running this cell must not keep growing sys.path with duplicates.
if WORKSPACE_PATH not in sys.path:
    sys.path.append(WORKSPACE_PATH)

# Pipeline Space
# proj_space is imported only after the workspace root is on sys.path.
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
# pprint(SPACE)

# Available Packages
import pandas as pd
from datetime import datetime 

# Directory of the recfldtkn configuration files, located under SPACE['CODE_FN'].
recfldtkn_config_path = os.path.join(SPACE['CODE_FN'], 'config_recfldtkn/')

# Logger used by the cells of this notebook.
# NOTE(review): no logging handler or level is configured in the visible code, so
# logger.info(...) messages may be dropped unless an imported package sets logging up — confirm.
logger = logging.getLogger(__name__)

import datasets
from recfldtkn.loadtools import load_ds_rec_and_info
from recfldtkn.configfn import load_cohort_args, load_record_args
from config_observer.CKPD import ckpd_to_CkpdObsConfig


# Load the cohort configuration, then attach the settings the later cells read from it.
cohort_config = load_cohort_args(recfldtkn_config_path, SPACE)
# Make the imported ckpd_to_CkpdObsConfig object available through the config.
cohort_config['ckpd_to_CkpdObsConfig'] = ckpd_to_CkpdObsConfig
# Presumably the name of the observation-datetime column — confirm against the pipeline code.
cohort_config['ObsDTName'] = 'ObsDT'
# Pair of [RootID value, ObsDTName value]; depends on the 'ObsDTName' entry set just above.
cohort_config['PID_ObsDT_columns'] = [cohort_config['RootID'], cohort_config['ObsDTName']]
print(cohort_config)

# 1. DsCase

In [None]:
# 0. ************ RFT config ************
# Two separate dicts, both starting empty; they are passed to the pipeline helpers in later cells.
RecName_to_dsRec = {}
RecName_to_dsRecInfo = {}
cohort_label_list = [1]

# 1. ************ Case Trigger config ************
TriggerCaseMethod = 'TrulicityRx'

# 2. ************ InputCaseSetName ************
# Alternative values that were used before (option 1):
# InputCaseSetName = 'C1.2.3-CGM5MinEntry'
# InputCaseSetName = 'sftcgmbf24haf2h-rs42-ds0.1-out0.1tstail0.1vd0.1'
InputCaseSetName = None

In [None]:
from recfldtkn.loadtools import fetch_trigger_tools

# Fetch the tools registered for the chosen trigger method and record its
# case-id columns both as a notebook variable and inside cohort_config.
Trigger_Tools = fetch_trigger_tools(TriggerCaseMethod, SPACE)
cohort_config['case_id_columns'] = case_id_columns = Trigger_Tools['case_id_columns']
case_id_columns

In [None]:
from recfldtkn.pipeline_case import get_ds_case_to_process


# Obtain the case table to process. The helper returns the case-set name
# together with the case DataFrame; both are logged afterwards.
InputCaseSetName, df_case = get_ds_case_to_process(
    InputCaseSetName,
    cohort_label_list,
    TriggerCaseMethod,
    cohort_config,
    SPACE,
    RecName_to_dsRec,
    RecName_to_dsRecInfo,
)

logger.info(f'InputCaseSetName: {InputCaseSetName}')
logger.info(f'df_case shape: {df_case.shape}')

In [None]:
# Work on a random subset of at most 1000 cases.
# - min(...) avoids the ValueError pandas raises when asked for more rows than exist.
# - a fixed random_state makes the subset (and the statistics printed later) reproducible.
df_case = df_case.sample(n=min(1000, len(df_case)), random_state=42)

# 2. Tagging

In [None]:
from recfldtkn.pipeline_case import process_df_tagging_tasks_in_chunks
from config_observer.QCF import cf_to_QueryCaseFeatConfig


# Keyword arguments expanded into process_df_tagging_tasks_in_chunks in the tagging step.
CASE_TAGGING_PROC_CONFIG = dict(
    use_CF_from_disk=False,
    use_CO_from_disk=False,
    start_chunk_id=0,
    end_chunk_id=None,
    chunk_size=500000,
    save_to_pickle=False,
    num_processors=1,
)

######################################
TagMethod_List = ['PttBasicDF', 'EgmBf1Y']
######################################

# Run the tagging tasks named in TagMethod_List over df_case; the case-table
# shape is logged before and after so the effect of tagging is visible.
if TagMethod_List:
    logger.info(f'df_case shape: {df_case.shape}')
    OutputCaseSetName, df_case = process_df_tagging_tasks_in_chunks(
        df_case, cohort_label_list, case_id_columns,
        InputCaseSetName,
        TagMethod_List, cf_to_QueryCaseFeatConfig,
        cohort_config, SPACE,
        RecName_to_dsRec, RecName_to_dsRecInfo,
        **CASE_TAGGING_PROC_CONFIG
    )
    logger.info(f'df_case shape: {df_case.shape}')
else:
    # No tagging requested: df_case is left untouched. Bind OutputCaseSetName
    # anyway so that later code never hits a NameError.
    # NOTE(review): reusing the input name assumes "no tags" means "same case set" — confirm.
    OutputCaseSetName = InputCaseSetName



# 3. Filtering

In [None]:
# Names of the filter methods handed to the filtering step, in this order.
FilterMethod_List = ['fPttBasicDF', 'fTailObsDT']

In [None]:
from recfldtkn.pipeline_case import process_df_filtering_tasks

# Run the filtering tasks named in FilterMethod_List; the step is skipped
# entirely when the list is empty.
logger.info(f'-------------- (4) FilterMethod_List: {FilterMethod_List} --------------')
if FilterMethod_List:
    df_case = process_df_filtering_tasks(df_case, FilterMethod_List, SPACE)
    # Shape of the case table once the filters have run.
    logger.info(f'df_case shape: {df_case.shape}')



# 4. Split

In [None]:
# Bare expression: shows the current case table as the notebook cell output.
df_case # df_case_learning

In [None]:
from recfldtkn.pipeline_case import generate_random_tags, assign_caseSplitTag_to_dsCase

# Split settings, expanded as keyword arguments into assign_caseSplitTag_to_dsCase.
# Notes carried over from earlier experiments:
#   downsample_ratio: 1 (don't drop any case), 0.1 (drop 90% of cases of one patient)
#   out_ratio:        hold-out patients
#   test_ratio:       values tried so far: 0.2, 'tail0.1', '2023.10.15', '0.1'
#   valid_ratio:      values tried so far: 0, 0.1
# NOTE(review): 'ObsDT' repeats the literal stored in cohort_config['ObsDTName'];
# keep the two in sync.
SplitDict = dict(
    RootID=cohort_config['RootID'],
    ObsDT='ObsDT',
    RANDOM_SEED=42,
    downsample_ratio=0.1,
    out_ratio=0.1,
    test_ratio='tail0.1',
    valid_ratio=0.1,
)

# Naming scheme previously derived from these settings (kept for reference):
# SplitMethod = f'rs{RANDOM_SEED}-ds{downsample_ratio}-out{out_ratio}ts{test_ratio}vd{valid_ratio}'
# print(SplitMethod)

In [None]:
# Assign the split tags to the case table using the settings in SplitDict
# (each key is passed as a keyword argument), then display the result.
# The statistics cell that follows reads the columns 'In', 'Out', 'Train', 'Valid' and 'Test'.
df_case = assign_caseSplitTag_to_dsCase(df_case,  **SplitDict)
df_case

In [None]:
_SPLIT_TAGS = ('In', 'Out', 'Train', 'Valid', 'Test')


def _print_split_summary(title, df):
    """Print the row count, the mean of every split-tag column, and Valid / (Train + Valid).

    Args:
        title: Label printed in the header line (e.g. 'total' or a subgroup name).
        df: DataFrame holding the columns listed in ``_SPLIT_TAGS``.
    """
    print(f'\n{title}---> recnum {len(df)}')
    for tag in _SPLIT_TAGS:
        print(tag, df[tag].mean())
    n_valid = df['Valid'].sum()
    n_train_valid = df['Train'].sum() + n_valid
    # A (sub)group without any Train/Valid rows would otherwise divide 0 by 0.
    ratio = n_valid / n_train_valid if n_train_valid else float('nan')
    # The label now matches the computation; it previously said "(Train+Test)".
    print('Valid/(Train+Valid):', ratio)


# Overall statistics first, then the same summary per subgroup.
_print_split_summary('total', df_case)

subtype_list = ['patient_gender', 'patient_age_bucket',
                 # 'patient_zipcode_3'
                 ]
# subtype_list = ['sex', 'a1cV0']
for subtype in subtype_list:
    for subname, df_sub in df_case.groupby(subtype):
        _print_split_summary(subname, df_sub)