# Space

In [None]:
import os
import sys 
import logging
from pprint import pprint 
 
# WorkSpace
# The workspace root is the part of the current working directory up to and
# including the first occurrence of KEY.
KEY = 'WorkSpace'
_cwd = os.getcwd()
if KEY not in _cwd:
    # Without this guard, split() returns the whole cwd and KEY gets glued onto
    # its end, yielding a directory that does not exist.
    raise FileNotFoundError(
        f"Cannot locate the workspace root: {KEY!r} does not appear in the "
        f"current working directory {_cwd!r}."
    )
WORKSPACE_PATH = _cwd.split(KEY)[0] + KEY
print(WORKSPACE_PATH)

# Run everything relative to the workspace root and make it importable.
os.chdir(WORKSPACE_PATH)
if WORKSPACE_PATH not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(WORKSPACE_PATH)

# Pipeline Space
# proj_space is imported only after the workspace root has been put on sys.path
# (presumably it is resolved from there -- verify if the import order is changed).
from proj_space import SPACE
SPACE['WORKSPACE_PATH'] = WORKSPACE_PATH
if SPACE['CODE_FN'] not in sys.path:  # same duplicate guard for the code folder
    sys.path.append(SPACE['CODE_FN'])

from recfldtkn.configfn import load_cohort_args
from config_observer.CF import cf_to_CaseFeatConfig
from config_observer.QCF import cf_to_QueryCaseFeatConfig
from config_observer.CKPD import ckpd_to_CkpdObsConfig
from recfldtkn.pipeline_dataset import pipeline_to_generate_dfcase_and_dataset


# Step 1: CaseSet

In [None]:
# Logger named after this notebook's module.
logger = logging.getLogger(__name__)

# Cohort arguments are loaded from the 'config_recfldtkn/' folder under CODE_FN.
recfldtkn_config_path = os.path.join(
    SPACE['CODE_FN'],
    'config_recfldtkn/',
)
cohort_args = load_cohort_args(recfldtkn_config_path, SPACE)

# Extend the loaded arguments with the mapping imported from config_observer.CKPD
# and with the name of the observation-datetime column.
cohort_args['ckpd_to_CkpdObsConfig'] = ckpd_to_CkpdObsConfig
cohort_args['ObsDTName'] = 'ObsDT'

# Two-element column list: [value of 'RootID', value of 'ObsDTName'].
cohort_args['PID_ObsDT_columns'] = [
    cohort_args[name] for name in ('RootID', 'ObsDTName')
]

# Options that are identical in the two processing configs below.
_SHARED_CHUNK_OPTIONS = {
    'use_CF_from_disk': False,
    'use_CO_from_disk': False,
    'start_chunk_id': 0,
    'end_chunk_id': None,
    'chunk_size': 10000,  # alternative value: 100000
}

# Passed to CaseSet under the 'CASE_TAGGING_PROC_CONFIG' key.
CASE_TAGGING_PROC_CONFIG = {
    **_SHARED_CHUNK_OPTIONS,
    'save_to_pickle': True,
    'num_processors': 8,  # alternative values: 1, 12
}

# Passed to CaseSet under the 'CASE_FIEDLING_PROC_CONFIG' key (no 'save_to_pickle' here).
CASE_FIEDLING_PROC_CONFIG = {
    **_SHARED_CHUNK_OPTIONS,
    'num_processors': 8,
}

# Load / save / sampling switches passed to CaseSet under the 'PROC_SETTINGS' key.
PROC_SETTINGS = {
    # loading
    'LOAD_DF_CASE': True,
    'LOAD_DS_DATA': True,
    # saving
    'SAVE_DF_CASE': False,
    'SAVE_DS_DATA': False,
    # sampling / trigger frame
    'RANDOM_SAMPLE': False,
    'SAVE_TRIGGER_DF': False,
}

In [None]:
from recfldtkn.pipeline_dataset import pipeline_to_generate_dfcase_and_dataset
from DsConst import DsConst_Config
from recfldtkn.aidata_base.caseset import CaseSet

# ---- dataset selection (edit here) ----
dataset_name = 'SurveyTagCrntFeat'  # alternative: 'InvTagCrntFeat'
# ---------------------------------------

# Single argument bundle handed to CaseSet, assembled group by group.
caseset_args = {
    # which dataset, and the configs describing it
    'dataset_name': dataset_name,
    'DsConst_Config': DsConst_Config,
    'cohort_args': cohort_args,
}
caseset_args.update({
    # case-feature config lookups imported from config_observer
    'cf_to_QueryCaseFeatConfig': cf_to_QueryCaseFeatConfig,
    'cf_to_CaseFeatConfig': cf_to_CaseFeatConfig,
    'SPACE': SPACE,
})
caseset_args.update({
    # processing configs and switches defined in the previous cell
    'CASE_TAGGING_PROC_CONFIG': CASE_TAGGING_PROC_CONFIG,
    'CASE_FIEDLING_PROC_CONFIG': CASE_FIEDLING_PROC_CONFIG,
    'PROC_SETTINGS': PROC_SETTINGS,
    # the pipeline function itself is passed in as an argument
    'pipeline_to_generate_dfcase_and_dataset': pipeline_to_generate_dfcase_and_dataset,
})

caseset = CaseSet(caseset_args)

# Step 2: AIData

In [None]:
from ConfigInput import InputConfig_Settings
from ConfigTask import TaskSeries_Settings
from ConfigAIDev import AIDevConfig_Settings

from recfldtkn.aidata_base.entry import EntryAIData_Builder

## %%%%%%%%%%%%%%%%%%%%%%%% user generation 
# The three dicts below are each paired with a *_Settings object when the
# entry builder is constructed further down in this cell.

# Paired with InputConfig_Settings.
# (Filtering for the input data might be added here as well.)
InputData_Args = {'INPUT_CFs_Args': 'InvCrntFeat', 'EntryInputMethod': 'SparseMatrixFromOneCF'}

# Paired with TaskSeries_Settings.
OneTask_Args = {'OneTaskSeries': 'SurveyPred', 'OneTaskName': 'Mars.MedAdhere', 'EntryOutputMethod': 'UniLabel'}

# Paired with AIDevConfig_Settings.
AIDevData_Args = {
    'NewName_to_OldNames': 'BaseC2',  # alternative: 'BaseC1'
    'TrainEvals': 'BaseTrTe',
    'SplitTagging': 'Rs42t2',
    'Filtering': 'FltNone',  # alternative: 'FltBaseSMS'
}
## %%%%%%%%%%%%%%%%%%%%%%%% user generation 


# Build the entry from three (user args, settings) pairs plus the shared SPACE.
entry_builder = EntryAIData_Builder(
    InputData_Args, InputConfig_Settings,   # input side
    OneTask_Args, TaskSeries_Settings,      # task side
    AIDevData_Args, AIDevConfig_Settings,   # AI-dev side
    SPACE=SPACE,
)
entry_builder.load_pypath()

# Keep the resolved entry arguments and show them as the cell output.
EntryArgs = entry_builder.EntryArgs
EntryArgs

In [None]:
from recfldtkn.aidata_base.aidata import AIData
# Build the AIData object from the case set (Step 1) and the entry builder above.
aidata = AIData(caseset, entry_builder)
# Bare last expression: show the object as the cell output.
aidata

In [None]:
# Mapping read from aidata; Step 3 passes it to fn_model_training and indexes it
# by eval-set name to get the dataset given to fn_model_inference.
Name_to_DsAIData = aidata.Name_to_DsAIData
Name_to_DsAIData

In [None]:
# Mapping read from aidata; the evaluation cell indexes it by name and selects
# the case-id columns from each entry.
Name_to_DsCaseFields = aidata.Name_to_DsCaseFields
Name_to_DsCaseFields

In [None]:
# Case frame read from aidata; the evaluation cell merges it with each eval
# set's case-id columns.
df_case = aidata.df_case
df_case

# Step 3: Model training and inference results

In [None]:
import os
import inspect 
from recfldtkn.loadtools import convert_variables_to_pystirng
from recfldtkn.loadtools import load_module_variables


# Base name (without '.py') of the file under <CODE_FN>/fn_io that provides the
# model functions read from `module` below.
LoadFnMethod = 'XGBClassifier'

# Resolve the file path and load it; later code reads fn_model_structure,
# fn_model_training and fn_model_inference as attributes of the result.
pypath = os.path.join(
    SPACE['CODE_FN'],
    'fn_io',
    f'{LoadFnMethod}.py',
)
module = load_module_variables(pypath)

# %%%%%%%%%%%%%%%%%%%%%%%% user generation
# Both dicts are passed to fn_model_structure and fn_model_training below.
ModelArgs = {'algorithm': 'XGBClassifier', 'random_state': 42, 'max_depth': 10}

TrainingArgs = {
    'n_estimators': 2000,            # num_boost_round
    'learning_rate': 0.1,            # eta
    'objective': 'binary:logistic',
    'early_stopping_rounds': 10,
    'eval_metric': 'logloss',
}
# %%%%%%%%%%%%%%%%%%%%%%%% user generation

# Take the two entry points from the loaded module.
fn_model_structure, fn_model_training = (
    module.fn_model_structure,
    module.fn_model_training,
)

# First build the model object from the two argument dicts ...
model = fn_model_structure(ModelArgs, TrainingArgs)

# ... then hand it to the training function together with the datasets and the
# TrainEvals configuration held by the entry builder.
TrainEvals = entry_builder.TrainEvals
model = fn_model_training(
    model,
    Name_to_DsAIData,
    TrainEvals,
    ModelArgs,
    TrainingArgs,
)

In [None]:
# The eval-set names come from the entry builder's TrainEvals configuration.
TrainEvals = entry_builder.TrainEvals
EvalSetNames = TrainEvals['EvalSetNames']

# Run the module's inference function once per eval set and key the result by
# the eval-set name.
Name_to_Inference = {}
fn_model_inference = module.fn_model_inference
for Name in EvalSetNames:
    Name_to_Inference[Name] = fn_model_inference(model, Name_to_DsAIData[Name])

Name_to_Inference

# Step 4: Evaluation

In [None]:
# Show the mapping that the next cell indexes by eval-set name.
aidata.Name_to_DsCaseFields

In [None]:
import pandas as pd 

# Attach each eval set's inference outputs to the case rows of that eval set.
case_id_columns = aidata.case_id_columns
df_case = aidata.df_case
# Read the mapping from aidata here as well (like df_case), so this cell does
# not depend on the earlier display cell having been executed.
Name_to_DsCaseFields = aidata.Name_to_DsCaseFields

Name_to_dfInference = {}
for Name in Name_to_Inference:
    dsCF = Name_to_DsCaseFields[Name]
    df = dsCF.select_columns(case_id_columns).to_pandas()

    # Right join keeps the keys of `df` (the eval set's case ids) and pulls in
    # the matching df_case columns.
    df_case_eval = pd.merge(df_case, df, on=case_id_columns, how='right')

    # The inference columns are written onto the merged frame as-is, so the
    # merge must yield exactly one row per eval-set case. Duplicate case ids in
    # df_case would multiply rows and misalign (or break) the assignment below.
    if len(df_case_eval) != len(df):
        raise ValueError(
            f"Eval set {Name!r}: merging df_case on {case_id_columns} produced "
            f"{len(df_case_eval)} rows for {len(df)} cases; "
            "case ids in df_case are not unique."
        )

    # Presumably every value in Inference is row-aligned with the eval dataset
    # used for inference -- TODO confirm against fn_model_inference.
    Inference = Name_to_Inference[Name]
    for key in Inference:
        df_case_eval[key] = Inference[key]
    Name_to_dfInference[Name] = df_case_eval

# Shows the frame of the last eval set processed by the loop; all frames are
# kept in Name_to_dfInference.
df_case_eval