# Space

In [None]:
import os
import logging
import pandas as pd 
from pprint import pprint 
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)

# Locate the workspace root: the part of the current working directory up to
# and including the first occurrence of KEY.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard the computed path would be `<cwd>WorkSpace`, and the
    # problem would only surface as an unrelated-looking error from os.chdir.
    raise FileNotFoundError(
        f'Cannot locate the workspace root: {KEY!r} is not part of the current directory {_cwd!r}'
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
os.chdir(WORKSPACE_PATH)

# NOTE(review): proj_space is imported only after the chdir above — presumably
# it is resolved relative to the workspace root; keep this order unless confirmed otherwise.
import sys
from proj_space import SPACE
# Guarded so that re-running this cell does not append the same entry again.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

# Turn off the `datasets` library cache for this session.
from datasets import disable_caching
disable_caching()

# Step 1: Record and Case Base

In [None]:
from config.config_case.GROUP import GROUP_TO_GROUPMethodArgs
from config.config_case.CF import CF_to_CFArgs
from config.config_case.CKPD import Ckpd_to_CkpdObsConfig
from config.config_case.TagRec import TagRec_to_TagRecArgs
from config.config_case.TagCF import TagCF_to_TagCFArgs 
from config.config_case.Flt import FltName_to_FltArgs
from config.config_case.CASE import TriggerCaseBaseName_to_TriggerCaseBaseArgs

from config.config_record.Cohort import CohortName_to_OneCohortArgs
from config.config_case.CKPD import Ckpd_to_CkpdObsConfig

from recfldtkn.record_base import Record_Base
from recfldtkn.case_base.case_base import Case_Base

# Names of all cohorts registered in CohortName_to_OneCohortArgs.
CohortNames = list(CohortName_to_OneCohortArgs)
print(CohortNames)

In [None]:
###################################
# Shared settings for building the record base and the case base.
Inference_Entry = None # this is not inference mode
# The registries imported from config.config_case, bundled under their own names.
# This bundle is passed to update_and_assert_CaseInfo and to Case_Base in later cells.
Case_Args_Settings = {
    'Ckpd_to_CkpdObsConfig': Ckpd_to_CkpdObsConfig,
    'CF_to_CFArgs': CF_to_CFArgs,
    'TagCF_to_TagCFArgs': TagCF_to_TagCFArgs,
    'TagRec_to_TagRecArgs': TagRec_to_TagRecArgs,
    'FltName_to_FltArgs': FltName_to_FltArgs,
    'GROUP_TO_GROUPMethodArgs': GROUP_TO_GROUPMethodArgs,
}

# Options passed to Record_Base as `Record_Proc_Config`.
Record_Proc_Config = {
    'save_data': True, 
    'load_data':True, 
    'via_method': 'ds',
}

# Options passed to update_and_assert_CaseInfo and to Case_Base as `Case_Proc_Config`.
Case_Proc_Config = {
    'max_trigger_case_num': None, 
    'use_task_cache': False, 
    'caseset_chunk_size': 10000, # 200k for CGM, 50k for others.
    'save_data': True, 
    'load_data': True, 
    'load_casecollection': True,
    'via_method': 'ds',
    'n_cpus': 4, 
    'batch_size': 1000,  
}
###################################  

# Cohorts used for both the record base and the case base in this notebook.
CohortName_list = [ 
    'WellDoc2023CVSDeRx',
]

# Definition of the trigger case base used throughout this notebook.
# NOTE(review): the CF_list below is repeated verbatim in the OneEntryArgs of
# Step 2 and Step 3 — keep the three lists in sync when editing.
TriggerCaseBaseName = 'WeightEntry-FutureWeightAndMultiHistoricalEgm'
TriggerCaseBaseArgs = {
    'Trigger': {
        'TriggerName': 'WeightEntry', 
        'TagRec': [
            'TagRec.PDemoFromP',
        ],
        'Filter': 'FltBasicDemo',
        'Group': 'GrpGenderDisease', # <--- get CaseSetName_to_CaseSet 
        'ObsTask': {
            'TagCF_list': [
                'TagCF.FutureWeightInfo', 
            ],
            'CF_list':  [
                'cf.PDemo',
                'cf.Bf1mRecNum',
                'cf.Bf24hCGMFeat',
                'cf.Bf24hMedalFeat',
                'cf.Bf1mMedalFeat',
                'cf.Bf2mMedalFeat',
            ],
        }
    },
}
# Add (or overwrite) this definition in the registry imported from config.config_case.CASE.
TriggerCaseBaseName_to_TriggerCaseBaseArgs[TriggerCaseBaseName] = TriggerCaseBaseArgs
# sort_dicts=False prints the keys in the order they are written above.
pprint(TriggerCaseBaseArgs, sort_dicts=False)

In [None]:
from recfldtkn.check import update_and_assert_CaseInfo
from recfldtkn.check import retrive_pipeline_info
# Pipeline information for this workspace, obtained from SPACE.
PIPELINE_INFO = retrive_pipeline_info(SPACE)


# Build the case setting info for the trigger case base defined above.
# NOTE(review): the helper's name suggests it also validates/updates the
# settings — confirm in recfldtkn.check.
CaseSettingInfo = update_and_assert_CaseInfo(
                                TriggerCaseBaseName,
                                TriggerCaseBaseArgs,
                                Case_Args_Settings,
                                Case_Proc_Config, 
                                PIPELINE_INFO, 
                                )

# The record base is built for the selected cohorts from the
# 'HumanRecordRecfeat_Args' entry of the case setting info.
HumanRecordRecfeat_Args = CaseSettingInfo['HumanRecordRecfeat_Args']
record_base = Record_Base(CohortName_list, 
                            HumanRecordRecfeat_Args,
                            CohortName_to_OneCohortArgs,
                            SPACE = SPACE, 
                            Inference_Entry = Inference_Entry,
                            Record_Proc_Config = Record_Proc_Config,
                            )

In [None]:
# Each trigger case base is built from the cohorts listed for it.
TriggerCaseBaseName_to_CohortNameList = {
    TriggerCaseBaseName: CohortName_list,
}

# (A bare `TriggerCaseBaseName_to_CohortNameList` expression used to sit here;
# in the middle of a cell it displayed nothing, so it has been dropped.)

# Build the case base on top of the record base created in the previous cell.
case_base = Case_Base(
    record_base = record_base, 
    TriggerCaseBaseName_to_CohortNameList = TriggerCaseBaseName_to_CohortNameList, 
    TriggerCaseBaseName_to_TriggerCaseBaseArgs = TriggerCaseBaseName_to_TriggerCaseBaseArgs,
    Case_Proc_Config = Case_Proc_Config,
    Case_Args_Settings = Case_Args_Settings, 
)

In [None]:
# Case sets built for this trigger case base, keyed by case-set name.
CaseSetNameToCaseset = case_base.TriggerCaseBaseName_to_CaseSetNameToCaseset[TriggerCaseBaseName]
CaseSetNameToCaseset

In [None]:
# Use the first case set as the working sample for the rest of the notebook.
# Fail loudly when there is none: the previous `for ...: break` form left
# `caseset` undefined (or stale from an earlier run) in that situation.
if len(CaseSetNameToCaseset) == 0:
    raise ValueError(f'No case set is available for {TriggerCaseBaseName!r}')
name, caseset = next(iter(CaseSetNameToCaseset.items()))

caseset

In [None]:
# Inspect the `ds_case` attribute of the sampled case set; it is the feature source used in Step 2.
caseset.ds_case

In [None]:
# Names of the case features that have a vocabulary for this trigger case base.
list(case_base.TriggerCaseBaseName_to_CFtoCFvocab[TriggerCaseBaseName])

# Step 2: EntryFn - Input_Part

## Input Preparation

In [None]:
# Entry configuration for the input side only; entry_fn_AIInputData reads
# 'Input_Part' -> 'CF_list' from it (through get_INPUT_CFs).
OneEntryArgs = {
    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'SparseMatrixFromMultiCF',
        'CF_list': [
            'cf.PDemo',
            'cf.Bf1mRecNum',
            'cf.Bf24hCGMFeat',
            'cf.Bf24hMedalFeat',
            'cf.Bf1mMedalFeat',
            'cf.Bf2mMedalFeat',
        ],
        # 'BeforePeriods': ['Bf24H'],
        # 'AfterPeriods': ['Af2H'],
        # 'InferenceMode': False, 
    }, 
}

# Also used as the file name of the exported module in the "Save Entry Fn" cell.
EntryInputMethod = OneEntryArgs['Input_Part']['EntryInputMethod']

# caseset
# Working data for the entry functions: both views of the sampled case set.
Data = {'df_case': caseset.df_case, 'ds_case': caseset.ds_case}

# Vocabulary per case feature; entry_fn_AIInputData uses it to size each feature block.
CF_to_CFvocab = case_base.TriggerCaseBaseName_to_CFtoCFvocab[TriggerCaseBaseName]
print([i for i in CF_to_CFvocab])

## Function Development

In [None]:
import inspect
import numpy as np
from scipy.sparse import csr_matrix, hstack
import itertools


## %%%%%%%%%%%%%%%%%%%%% user functions
def tfm_fn_AIInputData(*args, **kwargs):
    """Placeholder transform: accepts any arguments, does nothing and returns None."""
    return None

def get_INPUT_CFs(OneEntryArgs):
    """Return the input case-feature names of an entry configuration, sorted.

    Args:
        OneEntryArgs: entry configuration; OneEntryArgs['Input_Part']['CF_list']
            holds the case-feature names.

    Returns:
        A new list with the names in sorted order, so the result does not depend
        on the order in which CF_list was written. The configured list itself is
        not modified.
    """
    INPUT_CFs = list(OneEntryArgs['Input_Part']['CF_list'])
    INPUT_CFs.sort()
    return INPUT_CFs
    

def entry_fn_AIInputData(Data, 
                         CF_to_CFvocab, 
                         OneEntryArgs,
                         tfm_fn_AIInputData = None):
    """Build one sparse feature matrix X from the configured case features.

    For every case feature (CF) returned by `get_INPUT_CFs(OneEntryArgs)`, a block
    with one row per case and one column per token id of that CF's vocabulary is
    built from the '<CF>--input_ids' / '<CF>--input_wgts' columns of the cases.
    The blocks are concatenated horizontally in the order of that CF list.

    Args:
        Data: dict holding 'ds_case', a container that supports `len()` and
            column access by name; each column yields one sequence per case.
        CF_to_CFvocab: dict mapping a CF name to its vocabulary; the block width
            is `len(CF_to_CFvocab[CF]['input_ids']['tid2tkn'])`.
        OneEntryArgs: entry configuration (see `get_INPUT_CFs`).
        tfm_fn_AIInputData: accepted for interface compatibility; not used here.

    Returns:
        The same `Data` dict, with `Data['ds_tfm'] = {'X': X}` where X is a CSR
        matrix of shape (number of cases, total vocabulary size of all CFs).

    Raises:
        ValueError: if no input CF is configured, or if a case has a different
            number of token ids and weights for some CF.
    """
    ds_case = Data['ds_case']
    num_cases = len(ds_case)

    INPUT_CFs = get_INPUT_CFs(OneEntryArgs)
    if len(INPUT_CFs) == 0:
        raise ValueError('No input CF is configured: Input_Part CF_list must not be empty')

    accumulated_matrices = []  # one sparse block per CF, in INPUT_CFs order
    for INPUT_CF in INPUT_CFs:
        CF_vocab = CF_to_CFvocab[INPUT_CF]
        num_features = len(CF_vocab['input_ids']['tid2tkn'])

        # Read each column exactly once: every column access on the case
        # container may materialize the whole column again.
        ids_per_case  = list(ds_case[f'{INPUT_CF}--input_ids'])
        wgts_per_case = list(ds_case[f'{INPUT_CF}--input_wgts'])

        row_indices, col_indices, data = [], [], []
        for i, (tids, wgts) in enumerate(zip(ids_per_case, wgts_per_case)):
            if len(tids) != len(wgts):
                # A per-case mismatch would otherwise pair weights with the wrong token ids.
                raise ValueError(
                    f'{INPUT_CF}: case {i} has {len(tids)} input_ids but {len(wgts)} input_wgts'
                )
            row_indices.extend([i] * len(tids))
            col_indices.extend(tids)
            data.extend(wgts)

        # Coordinate-style construction: entries sharing the same (row, column) are summed.
        X = csr_matrix((data, (row_indices, col_indices)), shape=(num_cases, num_features))
        accumulated_matrices.append(X)

    # Concatenate the per-CF blocks side by side.
    X = hstack(accumulated_matrices, format='csr')

    ds_tfm = {'X': X}
    Data['ds_tfm'] = ds_tfm
    return Data


# Keep each function's source text on the function object as `fn_string`.
# NOTE(review): presumably read by Base.convert_variables_to_pystirng when the
# functions are written to a module in the "Save Entry Fn" cell — confirm.
get_INPUT_CFs.fn_string = inspect.getsource(get_INPUT_CFs)
tfm_fn_AIInputData.fn_string = inspect.getsource(tfm_fn_AIInputData)
entry_fn_AIInputData.fn_string = inspect.getsource(entry_fn_AIInputData)

## Examine

In [None]:
# Run the input entry function on the sampled case set; it stores its result
# under Data['ds_tfm'] and returns the same Data dict.
Data = entry_fn_AIInputData(Data, 
                            CF_to_CFvocab, 
                            OneEntryArgs,
                            tfm_fn_AIInputData)


# {'X': sparse feature matrix}
ds_tfm = Data['ds_tfm']
ds_tfm

## Save Entry Function

In [None]:
from recfldtkn.aidata_base.entry import AIDATA_ENTRYINPUT_PATH
from recfldtkn.base import Base

# Target module: <CODE_FN>/<AIDATA_ENTRYINPUT_PATH>/<EntryInputMethod>.py
pypath = os.path.join(SPACE['CODE_FN'],  AIDATA_ENTRYINPUT_PATH, f'{EntryInputMethod}.py')
print(pypath) 

# Import lines handed over as `prefix`; they must cover every name the
# exported functions use (itertools, csr_matrix, hstack).
prefix = [
    'import itertools',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets',
    'from scipy.sparse import csr_matrix, hstack',
    ]

# Functions to export. entry_fn_AIInputData calls get_INPUT_CFs, so both go into the module.
fn_variables = [
    get_INPUT_CFs,
    tfm_fn_AIInputData,
    entry_fn_AIInputData,
]

pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)

# exist_ok=True replaces the separate existence check (no gap between check and creation).
os.makedirs(os.path.dirname(pypath), exist_ok=True)
# Explicit UTF-8: Python reads source files as UTF-8 by default, whatever the platform locale is.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)

# Step 3: EntryFn - Output_Part 

## Input

In [None]:
# TaskType = 'MLUniLabel'
# NOTE(review): SeriesName and OneTaskName are not referenced by any other cell visible in this notebook.
SeriesName  = 'weightpred.Af1M'

OneTaskName = 'WeightPred.Af1M.WeightLossPctLarge2'
# Full entry configuration: the input part (same CF_list as in Step 2) plus output and task parts.
OneEntryArgs = {
    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'SparseMatrixFromMultiCF',
        'CF_list': [
            'cf.PDemo',
            'cf.Bf1mRecNum',
            'cf.Bf24hCGMFeat',
            'cf.Bf24hMedalFeat',
            'cf.Bf1mMedalFeat',
            'cf.Bf2mMedalFeat',
        ],
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'MLUniLabel',
        'TagCF_list': [
            'TagCF.FutureWeightInfo', 
        ], 
        # (column of df_case, comparison operator, value); entry_fn_AITaskData
        # turns this into the 0/1 label Y.
        'Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.02), 
    },


    # ----------------- Task Part -----------------
    # NOTE(review): 'Task_Part' is not read by the entry functions defined in
    # this notebook — presumably it is consumed elsewhere; confirm.
    'Task_Part': {
        'Tagging': [],
        'Filtering': [
            ('co.Weight_Af1Minfo:no_future_weight', '!=', 1),
        ], 
    },
}

# EntryInputMethod = OneEntryArgs['EntryInputMethod']
# Also used as the file name of the exported output-entry module in the last cell.
EntryOutputMethod = OneEntryArgs['Output_Part']['EntryOutputMethod']
# caseset
# Fresh working data, so that the Step 2 result ('ds_tfm') is not carried over.
Data = {'df_case': caseset.df_case, 'ds_case': caseset.ds_case}

CF_to_CFvocab = case_base.TriggerCaseBaseName_to_CFtoCFvocab[TriggerCaseBaseName]
print([i for i in CF_to_CFvocab])

## Function

In [None]:
## %%%%%%%%%%%%%%%%%%%%%
# UniLabel
import inspect 
import numpy as np 
# from recfldtkn.loadtools import convert_variables_to_pystirng


def get_OUTPUT_CFs(OneEntryArgs):
    """Return the case-feature names listed for the output part of an entry configuration.

    Args:
        OneEntryArgs: entry configuration; the optional 'Output_Part' entry may
            carry a 'CF_list'.

    Returns:
        OneEntryArgs['Output_Part']['CF_list'] when both keys are present,
        otherwise an empty list.
    """
    if 'Output_Part' in OneEntryArgs:
        return OneEntryArgs['Output_Part'].get('CF_list', [])
    return []
# Keep the function's source text on the function object as `fn_string`.
get_OUTPUT_CFs.fn_string = inspect.getsource(get_OUTPUT_CFs)


def entry_fn_AITaskData(Data, 
                        CF_to_CFvocab, 
                        OneEntryArgs,
                        tfm_fn_AIInputData = None,
                        entry_fn_AIInputData = None,
                        ):
    """Build the input part via `entry_fn_AIInputData`, then add a 0/1 label Y.

    The label is defined by OneEntryArgs['Output_Part']['Labeling'], a triple
    `(column name, operator, value)` evaluated on `Data['df_case']`.
    Supported operators: '>', '>=', '<', '<=', '==', '!=', 'in', 'not in'
    (for 'in' / 'not in' the value is a collection of accepted values).

    Args:
        Data: dict holding 'df_case' (a pandas DataFrame with the label column)
            and whatever `entry_fn_AIInputData` needs.
        CF_to_CFvocab: passed through to `entry_fn_AIInputData`.
        OneEntryArgs: entry configuration with an 'Output_Part' -> 'Labeling' triple.
        tfm_fn_AIInputData: passed through to `entry_fn_AIInputData`.
        entry_fn_AIInputData: callable `(Data, CF_to_CFvocab, OneEntryArgs,
            tfm_fn_AIInputData) -> Data` that must set Data['ds_tfm']. Required,
            despite the `None` default kept for signature compatibility.

    Returns:
        The Data dict returned by `entry_fn_AIInputData`, with the label array
        stored as Data['ds_tfm']['Y'] (integer 0/1, one entry per row of df_case).

    Raises:
        TypeError: if `entry_fn_AIInputData` is not provided.
        ValueError: if the labeling operator is not supported.
    """
    if entry_fn_AIInputData is None:
        raise TypeError('entry_fn_AIInputData must be provided: it builds the input part of Data')

    Data = entry_fn_AIInputData(Data, CF_to_CFvocab, OneEntryArgs, tfm_fn_AIInputData) 

    Output_Part = OneEntryArgs['Output_Part']
    Labeling = Output_Part['Labeling']
    assert len(Labeling) == 3, f'Labeling must have 3 elements, but got {len(Labeling)}'
    label_name, label_op, label_value = Labeling
    df_case = Data['df_case']

    # Operator table; plain lambdas keep the exported module free of extra imports.
    label_ops = {
        '>':      lambda column, value: column > value,
        '>=':     lambda column, value: column >= value,
        '<':      lambda column, value: column < value,
        '<=':     lambda column, value: column <= value,
        '==':     lambda column, value: column == value,
        '!=':     lambda column, value: column != value,
        'in':     lambda column, value: column.isin(value),
        'not in': lambda column, value: ~column.isin(value),
    }
    if label_op not in label_ops:
        raise ValueError(f'Invalid label_op: {label_op}')

    # NOTE(review): rows with a missing label value compare as False for the
    # ordering/equality operators and therefore get label 0 — confirm such rows
    # are filtered out before training.
    Y = label_ops[label_op](df_case[label_name], label_value)

    Y = Y.astype(int).values
    ds_tfm = Data['ds_tfm']
    ds_tfm['Y'] = Y
    Data['ds_tfm'] = ds_tfm
    return Data

# Keep the function's source text on the function object as `fn_string`.
entry_fn_AITaskData.fn_string = inspect.getsource(entry_fn_AITaskData)

In [None]:
# Run the task entry function: it first calls entry_fn_AIInputData (passed in
# as the last argument) and then adds the label Y to Data['ds_tfm'].
Data = entry_fn_AITaskData(Data, 
                           CF_to_CFvocab, 
                           OneEntryArgs,
                           tfm_fn_AIInputData,
                           entry_fn_AIInputData)

# {'X': sparse feature matrix, 'Y': 0/1 label array}
ds_tfm = Data['ds_tfm']
ds_tfm

In [None]:
# Y holds 0/1 integers, so its mean is the share of cases labelled 1.
ds_tfm['Y'].mean()

In [None]:
from recfldtkn.base import Base
from recfldtkn.aidata_base.entry import AIDATA_ENTRYOUTPUT_PATH

# Import lines handed over as `prefix` for the exported module.
prefix = [
    'import torch',
    'import pandas as pd', 
    'import numpy as np', 
    'import datasets'
    ]
# Functions to export into the output-entry module.
fn_variables = [
    get_OUTPUT_CFs,
    entry_fn_AITaskData,
]
pycode = Base.convert_variables_to_pystirng(fn_variables = fn_variables, prefix = prefix)
# Target module: <CODE_FN>/<AIDATA_ENTRYOUTPUT_PATH>/<EntryOutputMethod>.py
pypath = os.path.join(SPACE['CODE_FN'], AIDATA_ENTRYOUTPUT_PATH, f'{EntryOutputMethod}.py')
print(pypath)
# exist_ok=True replaces the separate existence check (no gap between check and creation).
os.makedirs(os.path.dirname(pypath), exist_ok=True)
# Explicit UTF-8: Python reads source files as UTF-8 by default, whatever the platform locale is.
with open(pypath, 'w', encoding='utf-8') as file:
    file.write(pycode)