# Space

In [None]:
# Notebook bootstrap: locate the workspace root, switch into it, make the
# project code importable, and configure logging / dataset caching.
import os
import sys
import logging

import pandas as pd
from pprint import pprint
from IPython.display import display, HTML

pd.set_option('display.max_columns', None)

# The workspace root is the prefix of the current directory up to and including
# the first occurrence of KEY.
KEY = 'WorkSpace'
if KEY not in os.getcwd():
    # Without this guard the expression below would silently produce
    # "<cwd>WorkSpace" and os.chdir would fail with a confusing path.
    raise FileNotFoundError(
        f"Cannot locate the workspace root: {KEY!r} does not occur in the "
        f"current directory {os.getcwd()!r}. Start the notebook from inside the workspace."
    )
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
os.chdir(WORKSPACE_PATH)

# Imported only after the chdir above; presumably proj_space is resolved
# relative to the workspace root -- verify before moving this import.
from proj_space import SPACE

# Guard against duplicate entries when this cell is executed more than once.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

from datasets import disable_caching
disable_caching()

SPACE['MODEL_ENDPOINT'] = 'vTest'

# Step 1: Record and CaseSet

In [None]:
from config.config_record.Cohort import CohortName_to_OneCohortArgs
from config.config_case.CKPD import Ckpd_to_CkpdObsConfig
from recfldtkn.record_base import Record_Base

###################################
# Record selection passed to Record_Base below.
# Structure: {top-level key: {record name: [record-feature names]}}.
# An empty list selects no features for that record (presumably the record
# itself is still loaded -- TODO confirm against Record_Base).
HumanRecordRecfeat_Args = {
    'P': {
        'BP': [],
        'CGM5Min': ['CGM5Min-N2Cin1'],
        'Carb': ['Carb-N2Cin20'],
        'Exercise': ['Exercise-Nume'],
        'Food': ['Food-NutriNume'],
        'P': ['P-DemoCate'],
        'Sleep': ['Sleep-Nume'],
        'Step': ['Step-Nume'],
        'Weight': ['Weight-Nume'],
        'PHeight': [],
    }
}

# Cohorts to process; alternatives are kept commented out for quick switching.
# CohortName = '20240701_Spiriva'
CohortName_list = [
    # 'WellDoc2022CGM',
    # 'WellDoc2023CVSTDC',
    'WellDoc2023CVSDeRx'
]
Record_Proc_Config = {'save_data': True, 'load_data': True, 'via_method': 'ds'}
Inference_Entry = None # this is not inference mode
###################################


# Build the record base for the cohorts in CohortName_list.
# The first three arguments are positional; the remaining ones are keywords.
record_base = Record_Base(CohortName_list, 
                        HumanRecordRecfeat_Args,
                        CohortName_to_OneCohortArgs,
                        SPACE = SPACE, 
                        Inference_Entry = Inference_Entry,
                        Record_Proc_Config = Record_Proc_Config,
                        )
# Bare last expression: shows the object's repr as the cell output.
record_base

In [None]:
from config.config_case.GROUP import GROUP_TO_GROUPMethodArgs
from config.config_case.CF import CF_to_CFArgs
from config.config_case.CKPD import Ckpd_to_CkpdObsConfig
from config.config_case.TagRec import TagRec_to_TagRecArgs
from config.config_case.TagCF import TagCF_to_TagCFArgs 
from config.config_case.Flt import FltName_to_FltArgs
from config.config_case.CASE import TriggerCaseBaseName_to_TriggerCaseBaseArgs

from recfldtkn.case_base.case_base import OneCohortTrigger_CaseBase
from recfldtkn.case_base.case_base import CaseSetManager, Case_Base

# Bundle the imported case-configuration registries under their own names;
# this mapping is handed to Case_Base as Case_Args_Settings.
Case_Args_Settings = dict(
    Ckpd_to_CkpdObsConfig=Ckpd_to_CkpdObsConfig,
    CF_to_CFArgs=CF_to_CFArgs,
    TagCF_to_TagCFArgs=TagCF_to_TagCFArgs,
    TagRec_to_TagRecArgs=TagRec_to_TagRecArgs,
    FltName_to_FltArgs=FltName_to_FltArgs,
    GROUP_TO_GROUPMethodArgs=GROUP_TO_GROUPMethodArgs,
)

# Processing options handed to Case_Base as Case_Proc_Config.
Case_Proc_Config = dict(
    max_trigger_case_num=None,
    use_task_cache=False,
    caseset_chunk_size=10000,  # 200k for CGM, 50k for others.
    save_data=True,
    load_data=True,
    load_casecollection=True,
    via_method='ds',
    n_cpus=1,
    batch_size=1000,
)

In [None]:
# --------------------------------------------------
# Definition of the trigger case base used throughout this notebook.
TriggerCaseBaseName = 'WeightEntry-FutureWeightAndMultiHistoricalEgm'

TriggerCaseBaseArgs = dict(
    Trigger=dict(
        TriggerName='WeightEntry',
        TagRec=['TagRec.PDemoFromP'],
        Filter='FltBasicDemo',
        Group='GrpGenderDisease',  # <--- get CaseSetName_to_CaseSet
        ObsTask=dict(
            TagCF_list=['TagCF.FutureWeightInfo'],
            CF_list=[
                'cf.PDemo',
                'cf.Bf1mRecNum',
                'cf.Bf24hCGMFeat',
                'cf.Bf24hMedalFeat',
                'cf.Bf1mMedalFeat',
                'cf.Bf2mMedalFeat',
            ],
        ),
    ),
)

# CohortTriggerCaseBaseArgs = Name_to_CohortTriggerCaseBaseArgs[TriggerCaseBaseName]
# Register the definition above under its name in the imported registry.
# This mutates the imported object in place, so the entry is visible to any
# later code that reads the same registry in this session.
TriggerCaseBaseName_to_TriggerCaseBaseArgs[TriggerCaseBaseName] = TriggerCaseBaseArgs
# sort_dicts=False keeps the keys in their declared order.
pprint(TriggerCaseBaseArgs, sort_dicts=False)

In [None]:
# Associate the trigger case base with the cohorts it should be built from.
TriggerCaseBaseName_to_CohortNameList = {}
TriggerCaseBaseName_to_CohortNameList[TriggerCaseBaseName] = CohortName_list

TriggerCaseBaseName_to_CohortNameList

In [None]:
# Build the case base on top of record_base, using the trigger definitions,
# cohort mapping and processing options assembled in the cells above.
case_base = Case_Base(
    record_base = record_base, 
    TriggerCaseBaseName_to_CohortNameList = TriggerCaseBaseName_to_CohortNameList, 
    TriggerCaseBaseName_to_TriggerCaseBaseArgs = TriggerCaseBaseName_to_TriggerCaseBaseArgs,
    Case_Proc_Config = Case_Proc_Config,
    Case_Args_Settings = Case_Args_Settings, 
)

In [None]:
# Inspect what Case_Base produced. Judging by the attribute name this maps
# TriggerCaseBaseName -> CaseSetName -> case set -- confirm against Case_Base.
case_base.TriggerCaseBaseName_to_CaseSetNameToCaseset

# Step 2: AIData

In [None]:
# Name of the AIData series; passed as the first argument to
# get_OneAIDataName_to_OneAIDataArgs in the next cell.
OneDataName = 'UnilabelWeightpredAf1M'


# Template shared by every data variant. 'Labeling' is left as None here and
# supplied per variant through DataVariantName_to_Args below.
OneEntryArgsTemplate = {
    # ----------------- Task Part -----------------
    'Task_Part': {
        'Tagging': {
            'TagName_to_TaggingMethod': {
                # TagName: TaggingMethod {Rules: [(x,x,x)], Op: and or}
            },
            'ColumnsAddToDsCase': [],
            'TagFilter': True,
            'TagSplit': True, 
        },

        'Filtering': {
            'FilterTagging': {
                # Each rule is a (column, operator, value) triple.
                'Rules': [
                    ('co.Weight_Af1Minfo:no_future_weight', '!=', 1),
                ],
                'Op': 'and',
            }
        }, 
        
        'Splitting': {
            'SplitTagging': {
                'RANDOM_SEED': 42,
                # 'downsample_ratio': 1,
                'out_ratio': 0.0, # hold-out patients. 
                'test_ratio': 0.2,
                'valid_ratio': 0.0
            },
            # NOTE(review): with valid_ratio 0.0 the only evaluation set is 'Test'.
            # TrainingArgs later sets early_stopping_rounds; if the model uses
            # EvalSetNames for early stopping, the test split influences training
            # -- confirm in the model instance code.
            'TrainEvals': {
                'TrainSetName': 'Train',
                'EvalSetNames': ['Test'],
            },
        }
    },

    # ----------------- Input Part -----------------
    'Input_Part': {
        'EntryInputMethod': 'SparseMatrixFromMultiCF',
        # Same six entries as ObsTask 'CF_list' in TriggerCaseBaseArgs; keep the
        # two lists in sync (presumably inputs must be among the computed CFs).
        'InputCFs_Args': [
            'cf.PDemo',
            'cf.Bf1mRecNum',
            'cf.Bf24hCGMFeat',
            'cf.Bf24hMedalFeat',
            'cf.Bf1mMedalFeat',
            'cf.Bf2mMedalFeat',
        ],
    }, 

    # ----------------- Output Part -----------------
    'Output_Part': {
        'EntryOutputMethod': 'MLUniLabel',
        
        # -----------
        # 'Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.02), 
        'Labeling': None, 
        # -----------
    },
}

# Per-variant overrides. The 'Output_Part:Labeling' key presumably addresses
# OneEntryArgsTemplate['Output_Part']['Labeling'] -- confirm against
# get_OneAIDataName_to_OneAIDataArgs. Only the 2% variant is active.
DataVariantName_to_Args = {
    'WeightLossPctLarge2': {
        'Output_Part:Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.02),
    },
    # 'WeightLossPctLarge4': {
    #     'Output_Part:Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.04),
    # },
    # 'WeightLossPctLarge6': {
    #     'Output_Part:Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.06),
    # },
    # 'WeightLossPctLarge8': {
    #     'Output_Part:Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.08),
    # },
    # 'WeightLossPctLarge10': {
    #     'Output_Part:Labeling': ('co.Weight_Af1Minfo:weight_loss_pct', '>', 0.10),
    # },
}

In [None]:
from recfldtkn.aidata_base.aidata_base import get_OneAIDataName_to_OneAIDataArgs

# SeriesName = 'UnilabelWeightpredAf1M' 
# OneEntryArgsTemplate = SeriesName_to_OneEntryArgsTemplate[SeriesName]
####################
# Combine the series name, cohorts, trigger definition, entry template and
# variant overrides into a mapping of AIData name -> AIData args.
OneAIDataName_to_OneAIDataArgs = get_OneAIDataName_to_OneAIDataArgs(OneDataName, 
                                                                    CohortName_list, 
                                                                    TriggerCaseBaseName,
                                                                    TriggerCaseBaseArgs, 
                                                                    OneEntryArgsTemplate, 
                                                                    DataVariantName_to_Args)
####################


# sort_dicts=False keeps the keys in their declared order.
pprint(OneAIDataName_to_OneAIDataArgs, sort_dicts=False)

In [None]:
from recfldtkn.aidata_base.aidata_base import AIData_Base

############## inference mode ####################
# AIDataArgs_items_for_inference = ['TriggerCaseBaseName', 'Input_Args']
# CohortName_list_for_inference = ['Inference']

############## training mode ####################
# Build the AIData base from the case base and the AIData args assembled above.
aidata_base = AIData_Base(
    case_base = case_base, 
    OneAIDataName_to_OneAIDataArgs = OneAIDataName_to_OneAIDataArgs,
    SPACE = SPACE, 
)   

# Uncomment to inspect the resolved arguments.
# pprint(aidata_base.OneAIDataName_to_OneAIDataArgs, sort_dicts=False)
# pprint(aidata_base.AIDataHashName_to_AIDataArgs, sort_dicts=False)

In [None]:
# Select the first AIData known to aidata_base. With a single active variant
# in DataVariantName_to_Args this is presumably the only entry -- verify.
AIDataName_list = aidata_base.get_AIDataName_list()

OneAIDataName = AIDataName_list[0] 

OneAIData_Args = aidata_base.get_OneAIDataArgs_from_OneAIDataName(OneAIDataName)
# pprint(OneAIData_Args, sort_dicts=False)

# `aidata` is the object consumed by the model cells below.
aidata = aidata_base.get_aidata_from_OneAIDataName(OneAIDataName)
aidata

In [None]:
# For every named split, list what it contains and show its 'ds_case' entry.
Name_to_Data = aidata.Name_to_Data
for Name, Data in Name_to_Data.items():
    print(Name, list(Data))
    print(Name, ':', Data['ds_case'])

# Step 3: Model Instance Parameters

In [None]:
import os
import inspect 

# %%%%%%%%%%%%%%%%%%%%%%%% user generation
# Hyper-parameters and evaluation settings passed to the model instance below.
ModelArgs = dict(
    model_type='XGBClassifierV1',
    random_state=42,
    max_depth=10,
)

TrainingArgs = dict(
    n_estimators=1000,  # num_boost_round
    learning_rate=0.01,  # eta
    objective='binary:logistic',
    early_stopping_rounds=10,
    eval_metric='logloss',
)

InferenceArgs = dict()

EvaluationArgs = dict(
    subgroup_config_list=[
        ['EvalName'],
        # ['EvalName', 'ageBucketGroup'],
    ],
    y_real_label_name='y_real_label',
    y_pred_score_name='y_pred_score',
    EachThreshold_step=100,
    PredScoreGroup_step=100,
    GroupNum=100,
)
# %%%%%%%%%%%%%%%%%%%%%%%% user generation
# %%%%%%%%%%%%%%%%%%%%%%%% user generation

# Step 4: Model Instance Design

In [None]:
from nn.xgboost.instance_xgboost import XGBClassifierInstance

In [None]:
# Show the imported model-instance class as the cell output.
XGBClassifierInstance

# Step 5: Model Init

In [None]:
# Instantiate the model artifact from the AIData and the argument dicts
# defined in Step 3.
ModelInstance = XGBClassifierInstance
model_artifact = ModelInstance(
    aidata = aidata, 
    ModelArgs = ModelArgs, 
    TrainingArgs = TrainingArgs, 
    InferenceArgs = InferenceArgs, 
    EvaluationArgs = EvaluationArgs,
    SPACE = SPACE,
)

# Name and path under which this artifact is identified and stored.
print(model_artifact.model_artifact_name)
print(model_artifact.model_artifact_path)

# Create the underlying model object and show it.
model_artifact.init_model()
model_artifact.model 

# Step 6: Model Fit

In [None]:
# Train the model.
# NOTE(review): the later cell that loads model.json when it exists calls
# fit() again when the file is absent, so a fresh run trains twice.
model_artifact.fit()

In [None]:
# model_instance.model.save_model(model_path)
# Resolve where this artifact lives and make sure the directory exists.
model_artifact_path = model_artifact.model_artifact_path

# exist_ok=True replaces the separate existence check: it is a no-op when the
# directory is already there and avoids the check-then-create race.
os.makedirs(model_artifact_path, exist_ok=True)

# File used by the load-or-fit cell below.
model_path = os.path.join(model_artifact_path, 'model.json')
model_path

In [None]:
# Length of the artifact name (presumably a check against path/filename
# length limits -- TODO confirm why this matters).
len(model_artifact.model_artifact_name)

In [None]:
# len(model_path)

In [None]:
# model_instance.fit()
# model_instance.model

In [None]:
# Show the model file path computed from model_artifact_path above.
model_path

In [None]:
import xgboost 


# Train and persist the model when no saved file exists yet; otherwise load
# the saved file into a fresh classifier and attach it to the artifact.
if not os.path.exists(model_path):
    model_artifact.fit()
    model_artifact.model.save_model(model_path)
    logger.info(f'Saved model to {model_path}')
else:
    logger.info(f'Loading model from {model_path}')
    model = xgboost.XGBClassifier()
    model.load_model(model_path)
    model_artifact.model = model


model_artifact.model

# Step 7: Model Inference

In [None]:
# Show the artifact's repr before running inference.
model_artifact

In [None]:
# Names of the data splits available on this AIData.
list(aidata.Name_to_Data)

In [None]:
# Run inference on every evaluation split and collect one frame per split.
df_case_list = []

for SetName in aidata.TrainEvals['EvalSetNames']:
    Data = aidata.Name_to_Data[SetName]
    # dataset = aidata.Name_to_DsAIData[SetName]
    # Work on a copy so the frame stored on aidata is not modified.
    # NOTE(review): this reads 'df_case' while the summary cell in Step 2
    # prints 'ds_case' -- confirm both keys exist on each split.
    df_case = Data['df_case'].copy()
    df_case['EvalName'] = SetName   
    inference_results = model_artifact.inference(Data)
    for k, v in inference_results.items():
        # Printed so a length mismatch between results and cases is visible.
        print(k, len(v), len(df_case))
        df_case[k] = v
    df_case_list.append(df_case)

# pd.concat keeps each frame's own index by default (ignore_index=False), so
# index values can repeat across splits.
df_case_eval = pd.concat(df_case_list)
df_case_eval

In [None]:
# Inspect the 'y_real_label' column; it is only present if inference()
# returned a key with that name in the loop above.
df_case_eval['y_real_label']

In [None]:
# Run inference once more on the 'Test' split to inspect the raw result dict.
# This rebinds SetName, Data and inference_results left over from the loop
# in the previous cell.
SetName = 'Test'
Data = aidata.Name_to_Data[SetName]
# Data = aidata.Name_to_DsAIData[SetName]

inference_results = model_artifact.inference(Data)
inference_results

# Step 8: Model Evaluation

In [None]:
# Evaluation settings as stored on the artifact.
model_artifact.EvaluationArgs

In [None]:
# Run the evaluation, then show the full report; df_report_full is read
# right after evaluate(), so it is presumably populated by that call.
model_artifact.evaluate()
model_artifact.df_report_full

# Step 9: Save Model

In [None]:
# Persist the artifact (presumably under model_artifact.model_artifact_path,
# which Step 10 passes to load_model_artifact -- confirm in save_model).
model_artifact.save_model()

In [None]:
# Keep the artifact path in a plain variable; Step 10 reads it when loading.
model_artifact_path = model_artifact.model_artifact_path
model_artifact_path

# Step 10: Load Model

In [None]:
# Same bootstrap as the first cell of the notebook, repeated so the loading
# steps below can be run on their own: locate the workspace root, switch into
# it, make the project code importable, and configure logging / caching.
import os
import sys
import logging

import pandas as pd
from pprint import pprint
from IPython.display import display, HTML

pd.set_option('display.max_columns', None)

# The workspace root is the prefix of the current directory up to and including
# the first occurrence of KEY.
KEY = 'WorkSpace'
if KEY not in os.getcwd():
    # Without this guard the expression below would silently produce
    # "<cwd>WorkSpace" and os.chdir would fail with a confusing path.
    raise FileNotFoundError(
        f"Cannot locate the workspace root: {KEY!r} does not occur in the "
        f"current directory {os.getcwd()!r}. Start the notebook from inside the workspace."
    )
WORKSPACE_PATH = os.getcwd().split(KEY)[0] + KEY
os.chdir(WORKSPACE_PATH)

# Imported only after the chdir above; presumably proj_space is resolved
# relative to the workspace root -- verify before moving this import.
from proj_space import SPACE

# This bootstrap also runs at the top of the notebook; skip the append when
# the entry is already on sys.path.
if SPACE['CODE_FN'] not in sys.path:
    sys.path.append(SPACE['CODE_FN'])
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')

from datasets import disable_caching
disable_caching()

SPACE['MODEL_ENDPOINT'] = 'vTest'

In [None]:
# model_checkpoint_path = model_instance.model_checkpoint_path
# model_artifact_path = '../_Model/Timely-Model/vTest/UniLabelPred-InvAf1w.AllBrand-Inv.Link-XGBClassifierV0.6-2024.08.31-f593c453f40068a0'
# NOTE(review): model_artifact_path is only defined if the Step 9 cell ran in
# this kernel. After a kernel restart, set it by hand (see the example above),
# otherwise this cell raises NameError.
model_artifact_path 

In [None]:
import json 
from recfldtkn.aidata_base.aidata import AIData 
from recfldtkn.model_base.model_base import load_model_artifact
from nn import load_model_instance_from_nn

In [None]:
# Load the saved artifact from disk; this rebinds `model_artifact`, replacing
# the instance trained earlier in the notebook.
model_artifact = load_model_artifact(model_artifact_path, load_model_instance_from_nn, SPACE)
model_artifact.model 

In [None]:
# Evaluate the loaded artifact; the next cells read df_case_eval and
# df_report_neat from it.
model_artifact.evaluate()

In [None]:
# Evaluation cases held on the loaded artifact (presumably filled by
# evaluate() above -- confirm).
model_artifact.df_case_eval

In [None]:
# Report table held on the loaded artifact (presumably the condensed
# counterpart of df_report_full -- confirm).
model_artifact.df_report_neat