# Space Settings

In [None]:
# This notebook packages the inference pipeline as an MLflow pyfunc model, saves it locally and smoke-tests it.
# It's the file that you will modify to implement the scoring for your own algorithm.

from __future__ import print_function
import io
import os
import sys
import json
# import flask
import logging
import datasets
import traceback
import pandas as pd
from pprint import pprint
from datetime import datetime 
# from flask import Flask, request, jsonify, Response

logging.basicConfig(level=logging.INFO, format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s')
logger = logging.getLogger(__name__)

In [None]:
# Directory against which every relative path in this notebook is resolved
# (the `model` folder, the JSON payload, the saved MLflow model).
# Alternative location that was used previously:
# parent_directory = '/dbfs/FileStore/Sid_Files/Deployment-v1119'
parent_directory = '.'

# Switch the process working directory so the relative paths below resolve.
os.chdir(parent_directory)

In [None]:

def process_inference_SPACE(SPACE, MODEL_ENDPOINT):
    """Add the folder layout of one model endpoint to SPACE and validate it.

    Args:
        SPACE: dict that must already contain 'MODEL_ROOT'. It is updated
            in place.
        MODEL_ENDPOINT: name of the endpoint folder under SPACE['MODEL_ROOT'].

    Returns:
        The same SPACE dict, with 'CODE_FN', 'DATA_EXTERNAL', 'DATA_RAW',
        'DATA_INFERENCE' and 'MODEL_ENDPOINT' filled in.

    Raises:
        AssertionError: if 'MODEL_ROOT' is missing or one of the expected
            folders does not exist. The message names the offending key.
    """
    # Raised explicitly (instead of `assert`) so the validation still runs
    # when Python is started with -O; the exception type is unchanged.
    if 'MODEL_ROOT' not in SPACE:
        raise AssertionError("Invalid SPACE: missing MODEL_ROOT")

    endpoint_dir = os.path.join(SPACE['MODEL_ROOT'], MODEL_ENDPOINT)

    # (SPACE key, folder) pairs, checked in the same order as before.
    layout = (
        ('CODE_FN', os.path.join(endpoint_dir, 'pipeline')),        # pipeline from ModelVersion/pipeline
        ('DATA_EXTERNAL', os.path.join(endpoint_dir, 'external')),  # external from ModelVersion/external
        ('DATA_RAW', endpoint_dir),
        ('DATA_INFERENCE', os.path.join(endpoint_dir, 'inference')),
    )
    for key, folder in layout:
        SPACE[key] = folder
        if not os.path.exists(folder):
            # Report the key that actually failed (DATA_RAW and DATA_INFERENCE
            # used to be reported as DATA_EXTERNAL).
            raise AssertionError(f"Invalid {key}: {folder}")

    SPACE['MODEL_ENDPOINT'] = MODEL_ENDPOINT
    return SPACE

In [None]:
# #####################
# # Save the model with artifacts
# MODEL_NAME = 'weight_af1m_prediction'
# json_payload_path = 'data_weight.json'
# #####################



# ############################
# # ----------- environment for Estimator.deploy() -----------
# MODEL_ROOT          = 'model'           # '/opt/ml/model' in sagemaker
# MODEL_ENDPOINT      = 'vTestWeight' # 'vTestCGMFull'
# INF_CohortName      = '20241013_InferencePttSampleV0'
# INF_OneCohortArgs   = {'CohortLabel': 9,
#                        'CohortName': '20241013_InferencePttSampleV0',
#                        'FolderPath': '$DATA_RAW$/inference/',
#                        'SourcePath': 'patient_sample',
#                        'Source2CohortName': 'InferencePttSampleV0'}
# INF_CFArgs          = None 
# INF_Args            = None 

# PostFnName = "PostFn_NaiveForUniLabelPred" # "EngagementPredToLabel"
# TrigFnName = 'TriggerFn_WeightEntry_v1211' 
# MetaFnName = 'MetaFn_None'

# POST_PROCESS_SCRIPT = None # 'pipeline/inference/post_process.py' # by default, use this script
# LoggerLevel         = "INFO"
# ############################


In [None]:
#####################
# Folder name the MLflow model is saved under, and the sample request
# payload used to test it.
MODEL_NAME = 'cgmlsm_naive_2h_predict'
json_payload_path = 'data_cgm.json'
#####################

###########################
# Default inference settings. Several of these can be overridden through
# environment variables in a later cell.
MODEL_ROOT = 'model'  # '/opt/ml/model' in sagemaker; previously '../../../_Model'
MODEL_ENDPOINT = 'vTestCGMFull'  # alternative: 'vTestWeight'

INF_CohortName = '20241013_InferencePttSampleV0'
INF_OneCohortArgs = dict(
    CohortLabel=9,
    CohortName=INF_CohortName,
    FolderPath='$DATA_RAW$/inference/',
    SourcePath='patient_sample',
    Source2CohortName='InferencePttSampleV0',
)

INF_CFArgs = ['cf.TargetCGM_Bf24H']
INF_Args = {
    'GEN_Args': {
        'num_first_tokens_for_gen': 289,
        'max_new_tokens': 24,
        'do_sample': False,
        'items_list': ['hist', 'pred', 'logit_scores'],
    },
}

# Keys looked up in the pipeline's NAME_TO_FUNCTION table when the model loads.
MetaFnName = 'MetaFn_None'
TrigFnName = 'TriggerFn_CGM5MinEntry_v1211'
PostFnName = "PostFn_WithCGMPred_v1210"  # "EngagementPredToLabel"

POST_PROCESS_SCRIPT = None  # 'pipeline/inference/post_process.py' # by default, use this script
LoggerLevel = "INFO"
###########################


In [None]:
############################# # imagine you are in the sagemaker container
def _env_json(name, default):
    """Return the JSON-decoded value of environment variable *name*.

    Environment variables are always strings, while the defaults for the
    structured settings below are dicts/lists. Decoding keeps the override
    the same type as the default.

    Args:
        name: environment variable to read.
        default: value returned unchanged when the variable is not set.

    Returns:
        *default* if the variable is unset; the decoded JSON value if it
        parses; otherwise the raw string (previous behaviour), after
        logging a warning.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        logger.warning("Environment variable %s is not valid JSON; using the raw string", name)
        return raw


# Plain string settings: the environment value can be used as-is.
MODEL_ROOT        = os.environ.get('MODEL_ROOT', MODEL_ROOT)
MODEL_ENDPOINT    = os.environ.get('MODEL_ENDPOINT', MODEL_ENDPOINT)
INF_CohortName    = os.environ.get('INF_COHORT_NAME', INF_CohortName)

# Structured settings: must be passed as JSON text in the environment.
# NOTE(review): PredictionModel.load_context reads INF_OneCohortArgs, not
# INF_CohortArgs, so this override is not picked up there — confirm intent.
INF_CohortArgs    = _env_json('INF_COHORT_ARGS', INF_OneCohortArgs)
InputCFArgs_ForInference = _env_json('INF_CFArgs', INF_CFArgs)
InferenceArgs     = _env_json('INF_Args', INF_Args)

PostFnName = os.environ.get('PostFnName', PostFnName)
TrigFnName = os.environ.get('TrigFnName', TrigFnName)
MetaFnName = os.environ.get('MetaFnName', MetaFnName)

LoggerLevel       = os.environ.get('LOGGER_LEVEL', LoggerLevel)
#############################


In [None]:
# Resolve and validate the endpoint folders under MODEL_ROOT; the helper
# fills the dict it is given and returns it.
SPACE = process_inference_SPACE({'MODEL_ROOT': MODEL_ROOT}, MODEL_ENDPOINT)
# TODO: update POST_PROCESS_SCRIPT if it is an s3 path

pprint(SPACE)

# MLflow Databricks

In [None]:
import mlflow.pyfunc
import os
import sys
import logging
import shutil
from datetime import datetime

In [None]:
class PredictionModel(mlflow.pyfunc.PythonModel):
    """
    MLflow PythonModel that runs the packaged inference pipeline for one
    model endpoint.

    The endpoint folder is expected as an MLflow artifact keyed by the
    endpoint name. `load_context` loads the pipeline code and model objects
    from it; `predict` serves the first row of each input DataFrame.
    """
    def __init__(self):
        # Settings captured from the notebook globals at construction time.
        self.MODEL_ENDPOINT = MODEL_ENDPOINT
        self.InputCFArgs_ForInference = InputCFArgs_ForInference
        self.InferenceArgs = InferenceArgs

        # Everything below is populated by load_context().
        self.logger = None
        self.pipeline_inference_for_modelbase = None
        self.aidata_base = None
        self.model_base = None
        self.InfoSettings = None
        self.Inference_Entry_Example = None
        self.Record_Base = None
        self.Case_Base = None
        self.MetaFn = None
        self.TrigFn = None
        self.PostFn = None
        self.SPACE = None

        # Legacy attribute names, kept so existing readers do not break;
        # nothing in this class assigns or reads them afterwards.
        self.info_settings = None
        self.Inference_Entry = None

    def load_context(self, context):
        """
        Load model context including external features, inference examples,
        models, and pipeline code.

        Endpoint folder structure (validated by process_inference_SPACE):
        - external/: External features
        - inference/: Inference examples
        - pipeline/: Python package, appended to sys.path

        Args:
            context: MLflow context; `context.artifacts[self.MODEL_ENDPOINT]`
                must be the path of the endpoint folder.

        Raises:
            Exception: any error raised while loading is logged with its
                traceback and re-raised unchanged.
        """

        for key, path in context.artifacts.items():
            print(f"{key}: {path}")

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        MODEL_ENDPOINT = self.MODEL_ENDPOINT

        # The artifact path is <MODEL_ROOT>/<MODEL_ENDPOINT>, so its parent
        # directory is the model root.
        MODEL_ENDPOINT_FOLDER = context.artifacts[MODEL_ENDPOINT]
        MODEL_ROOT = os.path.dirname(MODEL_ENDPOINT_FOLDER)

        SPACE = {
            'MODEL_ROOT': MODEL_ROOT,
        }

        SPACE = process_inference_SPACE(SPACE, MODEL_ENDPOINT)

        self.logger.info(f"SPACE information: {SPACE}")

        # Make the packaged pipeline importable. Only append: sys.path order
        # defines import precedence, so it must not be rebuilt from a set.
        if SPACE['CODE_FN'] not in sys.path:
            sys.path.append(SPACE['CODE_FN'])

        # ----------------------------------------------------
        # These imports resolve only after CODE_FN is on sys.path.
        from recfldtkn.record_base.cohort import CohortFn, Cohort
        from recfldtkn.case_base.caseutils import get_ROCOGammePhiInfo_from_CFList
        from recfldtkn.aidata_base.aidata_base import AIData_Base 
        from recfldtkn.record_base.record_base import Record_Base
        from recfldtkn.case_base.case_base import Case_Base
        from recfldtkn.model_base.model_base import Model_Base
        from recfldtkn.base import fill_missing_keys
        from nn import load_model_instance_from_nn
        from inference.utils_inference import (
            load_AIData_Model_InfoSettings,
            load_Inference_Entry_Example,
            pipeline_inference_for_modelbase,
            Record_Proc_Config,
            Case_Proc_Config,
            OneEntryArgs_items_for_inference,
        )
        from inference.post_process import NAME_TO_FUNCTION
        # ----------------------------------------------------

        try:
            self.pipeline_inference_for_modelbase = pipeline_inference_for_modelbase

            # Resolve the configured function names to callables.
            self.MetaFn = NAME_TO_FUNCTION[MetaFnName]
            self.TrigFn = NAME_TO_FUNCTION[TrigFnName]
            self.PostFn = NAME_TO_FUNCTION[PostFnName]

            InputCFArgs_ForInference = self.InputCFArgs_ForInference
            InferenceArgs = self.InferenceArgs
            CohortName_to_OneCohortArgs = {INF_CohortName: INF_OneCohortArgs}

            ModelEndpoint_Path = os.path.join(SPACE['MODEL_ROOT'], SPACE['MODEL_ENDPOINT'])
            assert os.path.exists(ModelEndpoint_Path), f"Invalid ModelEndpoint_Path: {ModelEndpoint_Path}"

            Package_Settings = {
                'INF_CohortName': INF_CohortName,
                'INF_OneCohortArgs': INF_OneCohortArgs,
                'Record_Proc_Config': Record_Proc_Config,
                'Case_Proc_Config': Case_Proc_Config,
                'OneEntryArgs_items_for_inference': OneEntryArgs_items_for_inference,
                'get_ROCOGammePhiInfo_from_CFList': get_ROCOGammePhiInfo_from_CFList,
                'load_model_instance_from_nn': load_model_instance_from_nn,
                'Model_Base': Model_Base,
                'AIData_Base': AIData_Base,
            }

            Context = load_AIData_Model_InfoSettings(
                ModelEndpoint_Path = ModelEndpoint_Path,
                InputCFArgs_ForInference = InputCFArgs_ForInference, 
                InferenceArgs = InferenceArgs, 
                SPACE = SPACE,
                **Package_Settings,
            )

            self.model_base = Context['model_base']
            self.aidata_base = Context['aidata_base']
            self.InfoSettings = Context['InfoSettings']
            self.SPACE = SPACE

            # predict() reads 'template_form' from this example entry.
            Inference_Entry_Example = load_Inference_Entry_Example(INF_CohortName, 
                                                                    CohortName_to_OneCohortArgs,
                                                                    Cohort,
                                                                    CohortFn,
                                                                    SPACE)
            self.Inference_Entry_Example = Inference_Entry_Example

            # Keep the pipeline classes; predict() passes them to the pipeline.
            self.Record_Base = Record_Base
            self.Case_Base = Case_Base

            self.logger.info("Successfully loaded model context and components")

        except Exception as e:
            # logger.exception keeps the traceback in the log.
            self.logger.exception(f"Failed to load model context: {str(e)}")
            raise

    def predict(self, context, model_input):
        """Run prediction using loaded context.

        Args:
            context: MLflow context (not used here).
            model_input: DataFrame built from the request's
                `dataframe_records`. Only the first row is used. It must
                have an 'inference_form' column and may have a
                'TriggerName_to_CaseTriggerList' column.

        Returns:
            Whatever the configured post-processing function returns.

        Raises:
            Exception: any error is logged with its traceback and re-raised.
        """
        try:
            df_model_input = model_input
            # Request shape: {dataframe_records: [{k1: v1, k2: v2}]}, which
            # arrives here as a DataFrame with columns k1, k2.
            self.logger.warning(model_input) # json_payload = {'xxx': model_input}
            self.logger.warning(type(model_input))

            # First row only -> plain dict {k1: v1, k2: v2}
            model_input = df_model_input.iloc[0].to_dict()

            self.logger.warning(type(model_input))

            # ------------- TriggerName_to_dfCaseTrigger -------------
            # Use the caller's trigger list if given, otherwise derive it
            # from the inference form with the trigger function.
            if 'TriggerName_to_CaseTriggerList' not in model_input:
                inference_form = model_input['inference_form']
                TriggerName_to_CaseTriggerList = self.TrigFn(inference_form)
            else:
                TriggerName_to_CaseTriggerList = model_input['TriggerName_to_CaseTriggerList']

            TriggerName_to_dfCaseTrigger = {k: pd.DataFrame(v) for k, v in TriggerName_to_CaseTriggerList.items()}

            for TriggerName, df in TriggerName_to_dfCaseTrigger.items():
                if 'ObsDT' not in df.columns:
                    # ObsDT = ObsDT_UTC shifted by TimezoneOffset, taken in minutes.
                    df['ObsDT'] = pd.to_datetime(df['ObsDT_UTC']) + pd.to_timedelta(df['TimezoneOffset'], 'm')
                TriggerName_to_dfCaseTrigger[TriggerName] = df

            Inference_Entry = {}
            Inference_Entry['TriggerName_to_dfCaseTrigger'] = TriggerName_to_dfCaseTrigger
            # NOTE(review): 'inference_form' is required even when the trigger
            # list was supplied by the caller — confirm this is intended.
            Inference_Entry['inference_form'] = model_input['inference_form']
            Inference_Entry['template_form'] = self.Inference_Entry_Example['template_form']
            Inference_Entry['ModelArtifacts_to_call'] = None 

            pipeline_inference_for_modelbase = self.pipeline_inference_for_modelbase

            inference_results = pipeline_inference_for_modelbase(
                Inference_Entry = Inference_Entry,
                Record_Base = self.Record_Base, 
                Case_Base = self.Case_Base,
                aidata_base = self.aidata_base, 
                model_base = self.model_base,
                InfoSettings = self.InfoSettings, 
                SPACE = self.SPACE
            )

            # ----------------------------------------------------
            # Stage values reported by the pipeline, logged under these labels.
            du1 = inference_results['du1']
            du2 = inference_results['du2']
            du3 = inference_results['du3']
            du4 = inference_results['du4']
            total_time = inference_results['total_time']

            self.logger.info(f"record_base: {du1}")
            self.logger.info(f"case_base: {du2}")
            self.logger.info(f"aidata_base and model_base update: {du3}")
            self.logger.info(f"model_infernece: {du4}")
            self.logger.info(f"total_time: {total_time}")

            print(inference_results)

            ModelArtifactName_to_Inference = inference_results['ModelArtifactName_to_Inference']
            results = self.PostFn(ModelArtifactName_to_Inference, self.SPACE)

            self.logger.info("Successfully ran prediction")
            return results

        except Exception as e:
            # logger.exception keeps the traceback in the log.
            self.logger.exception(f"Prediction failed: {str(e)}")
            raise


In [None]:
# Artifacts shipped with the MLflow model: the whole endpoint folder, keyed
# by the endpoint name (PredictionModel.load_context looks it up by that key).
artifacts = {
    SPACE['MODEL_ENDPOINT']: os.path.join(SPACE['MODEL_ROOT'], SPACE['MODEL_ENDPOINT']),
}

# Notebook cell output: show the mapping.
artifacts

In [None]:
# Conda environment specification passed to mlflow.pyfunc.save_model.
# NOTE(review): `mlflow` itself is not listed — confirm whether the serving
# environment needs it declared here.
conda_env = {
    "channels": [
        "pytorch",
        "nvidia",
        "defaults",
        "conda-forge",
    ],
    "dependencies": [
        # Pin the interpreter to the version running this notebook.
        f"python={sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
        "pip",

        # pay attention to this pytorch part.
        # ("cudatoolkit" was listed twice; one entry is enough.)
        "cudatoolkit",
        "pytorch",
        "torchvision",
        "torchaudio",

        "ipykernel",

        "datasets",
        "pandas==2.2.0",
        "requests==2.31.0",
        "scikit-learn==1.4.0",
        "scipy==1.12.0",
        "tokenizers==0.15.1",
        "xgboost==2.0.3",
        "Werkzeug==2.0.2",
        "Pympler==1.1",

        "numpy",
        "gunicorn",
        "matplotlib",
    ],
    "name": "weight_prediction_env"
}

In [None]:
def underscore_to_hyphen(parent_dir):
    """Rename the `.arrow` files in `<parent_dir>/ds_case`, replacing '_' with '-'.

    Works on explicit paths and does not change the process working
    directory. The previous version used chdir and returned with a
    hard-coded '../../', which left the process in the wrong directory for
    absolute or nested `parent_dir` values and after any exception.

    Args:
        parent_dir: directory that contains the `ds_case` folder.

    Raises:
        OSError: if the `ds_case` folder is missing or a rename fails.
    """
    case_dir = os.path.join(parent_dir, 'ds_case')
    print(os.path.abspath(case_dir))
    for file in os.listdir(case_dir):
        print(file)
        if not file.endswith('.arrow'):
            continue
        new_file = file.replace('_', '-')
        if new_file != file:  # nothing to do when the name has no underscore
            os.rename(os.path.join(case_dir, file), os.path.join(case_dir, new_file))

In [None]:
# Show the working directory that the relative paths below resolve against.
print(os.getcwd())

In [None]:
# model_folder_path = 'weight_prediction_model'

# Register in Local

In [None]:
# Remove any previous export so the model is written to a fresh folder.
if os.path.exists(MODEL_NAME):
    shutil.rmtree(MODEL_NAME)

# Save the pyfunc model locally under MODEL_NAME, with the endpoint folder
# as its artifact and the conda environment defined above.
mlflow.pyfunc.save_model(
    path = MODEL_NAME,
    python_model=PredictionModel(),
    artifacts=artifacts,
    conda_env=conda_env
)

In [None]:
# Load the model back from the local folder it was just saved to.
loaded_model = mlflow.pyfunc.load_model(MODEL_NAME)

# Register in Databricks

In [None]:
# ################
# MODEL_NAME = MODEL_NAME
# #Update this piece of code before
# i = 0
# ################


# with mlflow.start_run() as run:
#     mlflow.pyfunc.log_model(
#         MODEL_NAME,
#         python_model=PredictionModel(),
#         signature=None,
#         artifacts=artifacts,
#         conda_env = conda_env
#     )
#     run_id = run.info.run_id

#     # Register the model
#     model_uri = f"runs:/{run_id}/{MODEL_NAME}"
#     mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)

In [None]:
# MODEL_ENDPOINT = i + 1
# model_uri = f"models:/{MODEL_NAME}/{MODEL_ENDPOINT}"
# loaded_model = mlflow.pyfunc.load_model(model_uri = model_uri)

# Test Model

In [None]:
# Read the sample request payload from disk.
with open(json_payload_path, 'r') as f:
    json_payload = json.load(f)

# One DataFrame row per entry of the payload's 'dataframe_records' list.
model_input = pd.DataFrame(json_payload['dataframe_records'])
display(model_input)


In [None]:
# Run the loaded model on the sample payload.
result = loaded_model.predict(model_input)
# The result is shown twice: once inline, once pretty-printed.
print("Prediction result:", result)
pprint(result)

# Compress Folder

In [None]:
import os
import zipfile
import shutil

def remove_unwanted_files(folder_path):
    """Remove macOS trash files and Python rubbish files from the specified folder.

    Deletes files named `.DS_Store` or ending in `.pyc`, `.pyo` or `~`, and
    directories named `__MACOSX` or `__pycache__`, anywhere under
    `folder_path`. Each removed path is printed.

    Args:
        folder_path: root folder to clean, walked recursively.

    Raises:
        OSError: if a file cannot be removed. Directory removal errors are
            ignored.
    """
    junk_file_names = {".DS_Store"}
    junk_suffixes = (".pyc", ".pyo", "~")
    junk_dir_names = {"__MACOSX", "__pycache__"}

    for root, dirs, files in os.walk(folder_path):
        # Remove specific files
        for name in files:
            if name in junk_file_names or name.endswith(junk_suffixes):
                file_path = os.path.join(root, name)
                print(f"Removing {file_path}")
                os.remove(file_path)

        # Remove specific directories
        kept_dirs = []
        for name in dirs:
            if name in junk_dir_names:
                dir_path = os.path.join(root, name)
                print(f"Removing {dir_path}")
                shutil.rmtree(dir_path, ignore_errors=True)
            else:
                kept_dirs.append(name)
        # Prune in place so os.walk does not try to descend into the
        # directories that were just deleted.
        dirs[:] = kept_dirs

def compress_to_zip(folder_path, output_file):
    """Compress the folder into a zip file after cleaning.

    The folder is first cleaned with `remove_unwanted_files`, which deletes
    files from it. Every remaining file is then stored under its path
    relative to `folder_path`. Empty directories are not stored.

    Args:
        folder_path: folder whose contents are archived.
        output_file: path of the zip file to create; an existing file is
            overwritten.
    """
    remove_unwanted_files(folder_path)  # Clean up trash files
    # If the archive is written inside folder_path, the walk would find the
    # half-written zip and add it to itself, so it is skipped below.
    archive_abs = os.path.abspath(output_file)
    with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == archive_abs:
                    continue
                arcname = os.path.relpath(file_path, folder_path)  # Maintain folder structure
                zipf.write(file_path, arcname)
    print(f"Compressed {folder_path} to {output_file}")


In [None]:
# Delete the locally saved model folder and the MLflow run folder, if they
# exist, before the working directory is archived.
RUN = './mlruns'
for stale_dir in (MODEL_NAME, RUN):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

In [None]:

# Archive target: a zip file next to the current working directory, named
# after it (the variable name says tar.gz, but the archive is a zip).
folder_to_compress = os.getcwd()
output_tar_gz = folder_to_compress + ".zip"
print(output_tar_gz)

# Drop an archive left over from an earlier run.
if os.path.exists(output_tar_gz):
    os.remove(output_tar_gz)

In [None]:
# Clean the working directory and write it to the zip file named above.
compress_to_zip(folder_to_compress, output_tar_gz)