In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import json
import logging
import pickle
import toml
import os
from typing import List, Union, Tuple
import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    ConfusionMatrixDisplay,
)

import snowflake.connector
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import call_udf, array_construct, pandas_udf, col, udf

from dateutil.relativedelta import relativedelta
from snowflake.snowpark import types as T

# 1. Connect to Snowflake


In [27]:
# Read the connection settings from the TOML file one directory up; the
# "snowflake_connection" table is handed as-is to the Snowpark session builder.
config = toml.load("../config.toml")
connection_parameters = config["snowflake_connection"]
session = Session.builder.configs(connection_parameters).create()

# Check if the connection and database are correct:
session.sql(
    "select current_warehouse(), current_database(), current_schema(), current_user(), current_role()"
).collect()

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='HEART_DB', CURRENT_SCHEMA()='PUBLIC', CURRENT_USER()='JOANABAIAO', CURRENT_ROLE()='ACCOUNTADMIN')]

# 2. Training function


In [34]:
def upload_model_to_stage(session, obj, model_stage, model_id):
    """Serialize ``obj`` with joblib and upload it to a Snowflake stage.

    Args:
        session: Snowpark session used for the file upload.
        obj: Any joblib-serializable object (model, scaler, ...).
        model_stage: Target stage name, with or without the leading ``@``.
        model_id: File name the object is stored under in the stage.

    The object is written to a private scratch directory that is removed
    even when serialization or the upload fails, so no stale artifacts are
    left behind on the local file system.
    """
    import tempfile

    if not model_stage.startswith("@"):
        model_stage = f"@{model_stage}"

    # The local file is named exactly ``model_id`` so the upload matches the
    # "File '<model_id>' uploaded" message below.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_file_path = os.path.join(scratch_dir, model_id)
        joblib.dump(obj, temp_file_path)
        session.file.put(
            temp_file_path, model_stage, overwrite=True, auto_compress=False
        )

    print(f"File '{model_id}' uploaded successfully to stage '{model_stage}'.")


def load_model_from_stage(session, model_stage, model_id):
    """Download a staged artifact into /tmp and deserialize it with joblib.

    ``model_stage`` may be given with or without the leading ``@``;
    ``model_id`` is the file name inside the stage. Returns the loaded object.
    """
    stage_ref = model_stage if model_stage.startswith("@") else f"@{model_stage}"

    # Fetch <stage>/<model_id> into the local /tmp/ directory
    session.file.get(f"{stage_ref}/{model_id}", "/tmp/")

    return joblib.load(f"/tmp/{model_id}")


def deploy_model_as_udf(
    session: Session,
    model,
    scaler,
    model_id: str,
    model_stage: str,
    function_stage: str,
    required_packages: List[str],
):
    """
    Deploy trained model and scaler as a permanent UDF

    Args:
        session: Snowpark session used for uploads and UDF registration.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.
        model_id: Identifier of the model; the UDF is named ``PREDICT_<model_id>``
            and the scaler is staged as ``<model_id>_scaler``.
        model_stage: Stage holding the serialized artifacts, with or without
            the leading ``@``.
        function_stage: Stage holding the UDF code, with or without the
            leading ``@``.
        required_packages: Package specs made available to the UDF.

    Returns:
        The name of the registered UDF.
    """

    # Accept stage names with or without "@" (as upload_model_to_stage does)
    # instead of blindly prepending one and producing "@@STAGE".
    model_stage_ref = model_stage if model_stage.startswith("@") else f"@{model_stage}"
    function_stage_ref = (
        function_stage if function_stage.startswith("@") else f"@{function_stage}"
    )

    scaler_id = f"{model_id}_scaler"
    upload_model_to_stage(session, scaler, model_stage_ref, scaler_id)
    upload_model_to_stage(session, model, model_stage_ref, model_id)

    # Create the prediction function; it reads ``scaler`` and ``model`` from
    # the enclosing scope rather than loading them from the stage.
    def predict(features: list) -> float:
        features_array = np.array(features).reshape(1, -1)
        scaled_features = scaler.transform(features_array)
        return float(model.predict(scaled_features)[0])

    # Register as permanent UDF
    udf_name = f"PREDICT_{model_id}"
    session.udf.register(
        func=predict,
        name=udf_name,
        stage_location=function_stage_ref,
        is_permanent=True,
        packages=required_packages,
        imports=[
            f"{model_stage_ref}/{model_id}",
            f"{model_stage_ref}/{scaler_id}",
        ],
    )

    print(f"Successfully deployed model as UDF: {udf_name}")
    return udf_name


def evaluate_model(y_test, y_pred):
    """Compute binary-classification metrics from true and predicted labels.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels (hard class predictions, not probabilities;
            ``auc_roc`` is therefore computed from the labels as well).

    Returns:
        dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc_roc``
        as Python floats and the confusion-matrix cells ``TN``, ``FP``, ``FN``,
        ``TP`` as Python ints, so the result can be serialized (JSON, SQL
        text) without NumPy scalar types leaking through.

    Raises:
        ValueError: If the confusion matrix is not 2x2, i.e. the labels do not
            describe a two-class problem with both classes present.
    """

    conf_matrix = confusion_matrix(y_test, y_pred)
    if conf_matrix.shape != (2, 2):
        raise ValueError(
            "evaluate_model expects a binary problem with both classes present; "
            f"got a confusion matrix of shape {conf_matrix.shape}."
        )
    # ravel() flattens row by row, hence the TN, FP, FN, TP order
    TN, FP, FN, TP = (int(cell) for cell in conf_matrix.ravel())

    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred)),
        "recall": float(recall_score(y_test, y_pred)),
        "f1": float(f1_score(y_test, y_pred)),
        "auc_roc": float(roc_auc_score(y_test, y_pred)),
        "TN": TN,
        "FP": FP,
        "FN": FN,
        "TP": TP,
    }

    return metrics


def save_training_info(
    session,
    model_id,
    model_name,
    optimization,
    training_table,
    feature_columns,
    metrics,
    table_name="MODEL_TRAINING_INFO",
):
    """Log one training run into the training-info table.

    Args:
        session: Object exposing ``sql(query).collect()`` (a Snowpark session).
        model_id: Identifier of the trained model.
        model_name: Human-readable model family name.
        optimization: Whether hyperparameter optimization was used; it is
            interpolated as-is into the statement.
        training_table: Name of the table the model was trained on.
        feature_columns: Iterable of feature column names, stored as an array.
        metrics: dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
            ``auc_roc``, ``TN``, ``FP``, ``FN``, ``TP``.
        table_name: Destination table (used as an identifier, not escaped).

    Logging is best-effort: any failure is printed and swallowed so that a
    logging problem does not abort the surrounding training pipeline.
    """
    from datetime import datetime

    def _sql_literal(value):
        """Render ``value`` as a single-quoted SQL string with escapes applied."""
        # Backslashes first, then quotes, so the added escapes are not doubled
        escaped = str(value).replace("\\", "\\\\").replace("'", "''")
        return f"'{escaped}'"

    try:
        training_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        feature_columns_str = ",".join(
            _sql_literal(feature) for feature in feature_columns
        )

        insert_query = f"""
                INSERT INTO {table_name} (
                training_date,
                model_id,
                model_name,
                optimization,
                training_table,
                feature_columns,
                accuracy, precision, recall, f1_score, auc_roc,
                TN, FP, FN, TP
            )
            SELECT 
                {_sql_literal(training_date)}, 
                {_sql_literal(model_id)}, 
                {_sql_literal(model_name)}, 
                {optimization}, 
                {_sql_literal(training_table)}, 
                ARRAY_CONSTRUCT({feature_columns_str}),
                {metrics['accuracy']}, {metrics['precision']}, {metrics['recall']},
                {metrics['f1']}, {metrics['auc_roc']},
                {metrics['TN']}, {metrics['FP']}, {metrics['FN']}, {metrics['TP']};
            """

        session.sql(insert_query).collect()
        print(f"Training details for model '{model_id}' have been successfully logged.")

    except Exception as e:
        print(f"Error logging training details for model '{model_id}': {e}")

In [35]:
def train_and_deploy_model(
    session: Session,
    model_name: str,
    optimize: bool,
    training_table: str,
) -> dict:
    """Train a classifier on a Snowflake table, log its metrics and deploy it.

    Steps: validate the model name, draw a model id from ``MODEL_SEQ``, load
    ``training_table`` into pandas, split 80/20, standard-scale, optionally
    tune hyperparameters with Optuna (100 random-sampler trials scored by
    3-fold ROC AUC), fit, evaluate on the hold-out split, log the run and
    register the model plus scaler as a permanent UDF.

    Args:
        session: Active Snowpark session.
        model_name: One of "Random Forest", "XGBoost", "K-Nearest Neighbors",
            "Support Vector Machine" or "Logistic Regression".
        optimize: When True, tune hyperparameters with Optuna; otherwise use
            the fixed defaults below.
        training_table: Table containing the features and a ``TARGET`` column.

    Returns:
        dict with the generated ``model_id`` and the deployed ``udf_name``.

    Raises:
        ValueError: If ``model_name`` is not supported. This is checked before
            a sequence value is consumed or any data is loaded.
    """

    ##########################
    # IMPORTS
    ##########################
    import optuna
    from sklearn.model_selection import train_test_split, cross_val_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from xgboost import XGBClassifier

    model_abbreviations = {
        "Random Forest": "RF",
        "XGBoost": "XGB",
        "K-Nearest Neighbors": "KNN",
        "Support Vector Machine": "SVM",
        # Without this entry the "LR" branches below could never be reached
        "Logistic Regression": "LR",
    }

    # Fail fast: an unknown name previously produced ids like "None_<seq>" and
    # only failed later (or with an UnboundLocalError when optimize=False).
    model_reduced = model_abbreviations.get(model_name)
    if model_reduced is None:
        raise ValueError(
            f"Invalid model_name value: {model_name!r}. "
            f"Expected one of {sorted(model_abbreviations)}."
        )

    ##########################
    # FUNCTIONS AND VARIABLES
    ##########################
    python_packages = [
        "pandas==2.2.3",
        "scikit-learn==1.5.2",
        "xgboost==1.7.3",
    ]

    ##########################
    # STEP 1: LOAD TABLE DATA
    ##########################
    # Generate a unique model name using sequence from the database
    seq_value = str(session.sql("select MODEL_SEQ.nextval").collect()[0][0])
    model_id = f"{model_reduced}_{seq_value}"

    # Load the table
    df = session.table(training_table).to_pandas()

    ##########################
    # Data Preprocessing
    ##########################
    # Separate target variable and features
    X = df.drop("TARGET", axis=1)
    y = df["TARGET"]
    feature_columns = X.columns.to_numpy()

    # Split dataset into training and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Scale data (the scaler is fitted on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ##########################
    # Model Training
    ##########################
    if optimize:

        def objective(trial):
            clf = None
            # Define hyperparameters and initialise classifier
            if model_reduced == "RF":
                params = {
                    "max_depth": trial.suggest_int("max_depth", 2, 64),
                    "n_estimators": trial.suggest_int("n_estimators", 5, 100),
                    "max_samples": trial.suggest_float("rf_max_samples", 0.2, 1),
                }
                clf = RandomForestClassifier(**params, random_state=42)

            elif model_reduced == "XGB":
                params = {
                    "max_depth": trial.suggest_int("max_depth", 2, 20),
                    "n_estimators": trial.suggest_int("n_estimators", 10, 200),
                    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                }
                clf = XGBClassifier(**params, random_state=42)

            elif model_reduced == "KNN":
                params = {
                    "n_neighbors": trial.suggest_int("n_neighbors", 1, 30),
                }
                clf = KNeighborsClassifier(**params)

            elif model_reduced == "SVM":
                params = {
                    "C": trial.suggest_float("C", 1e-5, 1e5),
                    "gamma": trial.suggest_float("gamma", 1e-5, 1e2),
                }
                clf = SVC(**params, random_state=42, probability=True)

            elif model_reduced == "LR":
                params = {
                    # suggest_float(log=True) replaces deprecated suggest_loguniform
                    "C": trial.suggest_float("C", 1e-4, 1e2, log=True),
                    "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
                }
                # liblinear handles both l1 and l2; the default lbfgs solver
                # rejects the l1 penalty.
                clf = LogisticRegression(
                    **params, solver="liblinear", random_state=42
                )

            else:
                raise ValueError("Invalid model_name value.")

            # Evaluate using cross-validation
            score = cross_val_score(
                clf, X_train_scaled, y_train, cv=3, scoring="roc_auc"
            ).mean()

            # Attach the (still unfitted) classifier to the trial so the best
            # configuration can be retrieved after the study finishes
            trial.set_user_attr(key="best_model", value=clf)

            return score

        # Find best model
        study = optuna.create_study(
            direction="maximize", sampler=optuna.samplers.RandomSampler(seed=42)
        )

        study.optimize(objective, n_trials=100)
        classifier = study.best_trial.user_attrs["best_model"]  # Best model

    # Without optimization with Optuna
    else:
        if model_reduced == "RF":
            classifier = RandomForestClassifier(random_state=42)
        elif model_reduced == "XGB":
            classifier = XGBClassifier(random_state=42)
        elif model_reduced == "KNN":
            classifier = KNeighborsClassifier(n_neighbors=10)
        elif model_reduced == "SVM":
            classifier = SVC(random_state=42)
        elif model_reduced == "LR":
            classifier = LogisticRegression(random_state=42)
        else:
            raise ValueError("Invalid model_name value.")

    # Fit the classifier in the training data and predict on the test data
    classifier.fit(X_train_scaled, y_train)
    y_pred = classifier.predict(X_test_scaled)

    ##########################
    # Model Evaluation
    ##########################
    metrics = evaluate_model(y_test, y_pred)

    ##################################
    # SAVE MODELS AND RESULTS
    ##################################
    save_training_info(
        session,
        model_id,
        model_name,
        optimize,
        training_table,
        feature_columns,
        metrics,
    )

    udf_name = deploy_model_as_udf(
        session,
        model=classifier,
        scaler=scaler,
        model_id=model_id,
        model_stage="MODEL_STAGE",
        function_stage="FUNCTION_STAGE",
        required_packages=python_packages,
    )

    result = {
        "model_id": model_id,
        "udf_name": udf_name,
    }

    return result

In [36]:
%%time

# Run the full pipeline from the notebook kernel with Optuna tuning enabled;
# the cell is timed because the 100-trial study dominates the runtime.
train_and_deploy_model(session, model_name="XGBoost", optimize=True, training_table="DATA_TABLE_1")

[I 2025-01-22 11:33:57,750] A new study created in memory with name: no-name-88f5ab33-877d-4844-a2a1-bb39437f8867
[I 2025-01-22 11:33:58,033] Trial 0 finished with value: 0.9286946834482922 and parameters: {'max_depth': 9, 'n_estimators': 191, 'learning_rate': 0.22227824312530747}. Best is trial 0 with value: 0.9286946834482922.
[I 2025-01-22 11:33:58,130] Trial 1 finished with value: 0.918216438724154 and parameters: {'max_depth': 13, 'n_estimators': 39, 'learning_rate': 0.055238410897498764}. Best is trial 0 with value: 0.9286946834482922.
[I 2025-01-22 11:33:58,272] Trial 2 finished with value: 0.9233166623211421 and parameters: {'max_depth': 3, 'n_estimators': 175, 'learning_rate': 0.18432335340553055}. Best is trial 0 with value: 0.9286946834482922.
[I 2025-01-22 11:33:58,323] Trial 3 finished with value: 0.9201817219338374 and parameters: {'max_depth': 15, 'n_estimators': 13, 'learning_rate': 0.29127385712697834}. Best is trial 0 with value: 0.9286946834482922.
[I 2025-01-22 11:3

Training details for model 'XGB_101' have been successfully logged.
File 'XGB_101_scaler' uploaded successfully to stage '@MODEL_STAGE'.
File 'XGB_101' uploaded successfully to stage '@MODEL_STAGE'.
Successfully deployed model as UDF: PREDICT_XGB_101
CPU times: user 1min 34s, sys: 3.79 s, total: 1min 37s
Wall time: 21.2 s


{'model_id': 'XGB_101', 'udf_name': 'PREDICT_XGB_101'}

In [37]:
# List the files in MODEL_STAGE to confirm the model and scaler were uploaded
session.sql("ls @MODEL_STAGE").collect()

[Row(name='model_stage/XGB_101', size=48032, md5='4f5757fa031b4dff58b59d49adc8ec9b', last_modified='Wed, 22 Jan 2025 11:34:14 GMT'),
 Row(name='model_stage/XGB_101_scaler', size=1360, md5='52da28936711b10b53f04a52a2a438e4', last_modified='Wed, 22 Jan 2025 11:34:14 GMT')]

In [38]:
# Pull the training log into pandas to inspect the row written by save_training_info
df_training_info = session.table("MODEL_TRAINING_INFO").to_pandas()
df_training_info

Unnamed: 0,TRAINING_DATE,MODEL_ID,MODEL_NAME,OPTIMIZATION,TRAINING_TABLE,FEATURE_COLUMNS,ACCURACY,PRECISION,RECALL,F1_SCORE,AUC_ROC,TN,FP,FN,TP
0,2025-01-22 11:34:12,XGB_101,XGBoost,True,DATA_TABLE_1,"[\n ""AGE"",\n ""SEX"",\n ""CP"",\n ""TRESTBPS"",\...",0.942029,0.947368,0.947368,0.947368,0.941426,29,2,2,36


<b>Register</b> the function as a <b>Stored Procedure</b> within Snowflake
This pushes the function and all of its dependencies down into Snowflake.
Once the SP is defined, we can call it directly from Snowflake without the need for a notebook.
For example, it can be called from a Task to trigger a new training run on new data.

Because the <b>optuna</b> and <b>cmaes</b> libraries are not included in the Anaconda repository, we are going to add them from the local environment. We get the local path and include it later in the imports section when registering the function as a Stored Procedure.


In [39]:
# Directory of the locally installed optuna package; passed via `imports`
# below so it is made available to the stored procedure.
# NOTE(review): the markdown above also mentions cmaes, but only optuna's
# path is imported here — confirm cmaes is not needed at runtime.
optuna_path = optuna.__path__[0]

# Register train_and_deploy_model as a permanent stored procedure, replacing
# any existing procedure of the same name. The pinned scikit-learn and xgboost
# versions match the ones the function requests for the deployed UDF.
session.sproc.register(
    func=train_and_deploy_model,
    name="train_and_deploy_model",
    packages=[
        "snowflake-snowpark-python",
        "scikit-learn==1.5.2",
        "xgboost==1.7.3",
        "sqlalchemy==1.4.39",
        "tqdm==4.64.1",
        "colorlog==5.0.1",
    ],
    imports=[optuna_path],
    is_permanent=True,
    stage_location="@FUNCTION_STAGE",
    replace=True,
)

<snowflake.snowpark.stored_procedure.StoredProcedure at 0x1468f7710>

Now test the training Stored Procedure, executed within Snowflake.


In [21]:
# Invoke the registered stored procedure by name; the arguments are passed
# positionally in the order (model_name, optimize, training_table).
model_name = "XGBoost"
optimization = True
training_table = "DATA_TABLE_2"

session.call(
    "train_and_deploy_model",
    model_name,
    optimization,
    training_table,
)

'{\n  "model_id": "XGB_102",\n  "udf_name": "PREDICT_XGB_102"\n}'

In [22]:
# List MODEL_STAGE again to see the artifacts produced by the stored-procedure run
session.sql("ls @MODEL_STAGE").collect()

[Row(name='model_stage/SVM_2', size=33584, md5='bdf5762f9f5a3b30f16913cf7df24918', last_modified='Tue, 21 Jan 2025 17:15:24 GMT'),
 Row(name='model_stage/SVM_2_scaler', size=1360, md5='fce1e5e5e248e89c6b1dd4d1973b9755', last_modified='Tue, 21 Jan 2025 17:15:24 GMT'),
 Row(name='model_stage/XGB_101', size=95856, md5='407b7b5f6391945a13f57753e50546a0', last_modified='Tue, 21 Jan 2025 17:45:50 GMT'),
 Row(name='model_stage/XGB_101_scaler', size=1360, md5='929db76dc27def46552fe33429ec9ee6', last_modified='Tue, 21 Jan 2025 17:45:50 GMT'),
 Row(name='model_stage/XGB_102', size=240896, md5='d33a37948de0d7e5ab29b20f5e42ee54', last_modified='Tue, 21 Jan 2025 17:47:20 GMT'),
 Row(name='model_stage/XGB_102_scaler', size=1360, md5='8e574d847fbdeb9ae3a09592e979d6c2', last_modified='Tue, 21 Jan 2025 17:47:19 GMT'),
 Row(name='model_stage/XGB_3', size=95856, md5='b9992a9f45c864106b9a3c4e1b7f2649', last_modified='Tue, 21 Jan 2025 17:16:53 GMT'),
 Row(name='model_stage/XGB_3_scaler', size=1360, md5='f3

# 3. Run Inference


In [40]:
def save_inference_details(
    session,
    model_id,
    training_table,
    test_table,
    # predictions_table,
    metrics,
    table_name="INFERENCE_RESULTS",
):
    """Log the metrics of one inference run into the inference-results table.

    Args:
        session: Object exposing ``sql(query).collect()`` (a Snowpark session).
        model_id: Identifier of the model that produced the predictions.
        training_table: Table the model was trained on.
        test_table: Table the predictions were computed for.
        metrics: dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
            ``auc_roc``, ``TN``, ``FP``, ``FN``, ``TP``.
        table_name: Destination table (used as an identifier, not escaped).

    Logging is best-effort: any failure is printed and swallowed so that a
    logging problem does not abort the surrounding inference pipeline.
    """

    from datetime import datetime

    def _sql_literal(value):
        """Render ``value`` as a single-quoted SQL string with escapes applied."""
        # Backslashes first, then quotes, so the added escapes are not doubled
        escaped = str(value).replace("\\", "\\\\").replace("'", "''")
        return f"'{escaped}'"

    try:
        inference_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        insert_query = f"""
                INSERT INTO {table_name} (
                    inference_date,
                    model_id,
                    training_table,
                    test_table,
                    accuracy, precision, recall, f1_score, auc_roc,
                    TN, FP, FN, TP
                )
                VALUES (
                    {_sql_literal(inference_date)},
                    {_sql_literal(model_id)},
                    {_sql_literal(training_table)},
                    {_sql_literal(test_table)},
                    {metrics['accuracy']}, {metrics['precision']}, {metrics['recall']},
                    {metrics['f1']}, {metrics['auc_roc']},
                    {metrics['TN']}, {metrics['FP']}, {metrics['FN']}, {metrics['TP']}
                );
                """

        session.sql(insert_query).collect()
        print(f"Inference details stored successfully for model {model_id}.")

    except Exception as e:
        print(f"Failed to store inference details: {e}")

In [41]:
def run_inference(
    session: Session,
    test_table: str,
    model_id: str,
    predictions_table: str = "PREDICTIONS_RESULT",
    target_column: str = "TARGET",
) -> None:
    """Score a table with a deployed model UDF and store the predictions.

    The feature columns and training table of ``model_id`` are looked up in
    ``MODEL_TRAINING_INFO``; ``PREDICT_<model_id>`` is applied to every row of
    ``test_table`` and the result overwrites ``predictions_table``. When the
    test table contains ``target_column``, metrics are computed and logged
    through ``save_inference_details``.

    Args:
        session: Active Snowpark session.
        test_table: Table to score.
        model_id: Identifier of a model previously logged and deployed.
        predictions_table: Table the predictions are written to (overwritten).
        target_column: Name of the ground-truth column, if present.

    Returns:
        The metrics dict when the target column is present, otherwise None.
        NOTE(review): the signature is annotated ``-> None`` although a dict
        may be returned; it is left untouched because the stored-procedure
        registration presumably derives its return type from these hints —
        confirm before tightening it.

    Raises:
        Exception: Any failure is re-raised with an
            "Error in inference pipeline" prefix, chained to the original.
    """

    try:
        # Escape the id before embedding it in the SQL string literal
        model_id_literal = (
            "'" + str(model_id).replace("\\", "\\\\").replace("'", "''") + "'"
        )

        query_result = session.sql(
            f"""
            SELECT FEATURE_COLUMNS, TRAINING_TABLE
            FROM MODEL_TRAINING_INFO 
            WHERE MODEL_ID = {model_id_literal}
        """
        ).collect()

        # An unknown model id used to surface as an opaque IndexError
        if not query_result:
            raise ValueError(
                f"No training record found for model '{model_id}' "
                "in MODEL_TRAINING_INFO."
            )

        training_table = query_result[0]["TRAINING_TABLE"]
        feature_columns = query_result[0]["FEATURE_COLUMNS"]
        # json.loads turns the stored FEATURE_COLUMNS value into a Python list
        feature_columns = json.loads(feature_columns)
        features_array = array_construct(*[col(c) for c in feature_columns])

        # Get data
        df = session.table(test_table)
        has_target = target_column in df.columns

        # Predict
        df_predictions = df.select(
            *feature_columns,
            *([target_column] if has_target else []),
            call_udf(f"PREDICT_{model_id}", features_array).alias("PREDICTION"),
        )

        df_predictions.write.mode("overwrite").save_as_table(predictions_table)
        print(f"Predictions saved in table {predictions_table}!")

        # Stays None when there is no ground truth to evaluate against
        # (previously this path hit an unbound ``metrics`` at the return).
        metrics = None

        if has_target:
            # Read back the stored predictions instead of re-evaluating the
            # lazy dataframe, which would run the UDF on every row again.
            df_predictions_pd = (
                session.table(predictions_table)
                .select(target_column, "PREDICTION")
                .to_pandas()
            )
            metrics = evaluate_model(
                df_predictions_pd[target_column], df_predictions_pd["PREDICTION"]
            )

            save_inference_details(
                session,
                model_id,
                training_table,
                test_table,
                # predictions_table,
                metrics,
            )

        return metrics

    except Exception as e:
        raise Exception(f"Error in inference pipeline: {str(e)}") from e

In [44]:
# Run inference from the notebook kernel: score DATA_TABLE_2 with the UDF of model RF_201
run_inference(session, test_table="DATA_TABLE_2", model_id="RF_201")

Predictions saved in table PREDICTIONS_RESULT!
Inference details stored successfully for model RF_201.


{'accuracy': 0.9824561403508771,
 'precision': 0.9827586206896551,
 'recall': 0.9827586206896551,
 'f1': 0.9827586206896551,
 'auc_roc': 0.9824507389162561,
 'TN': 165,
 'FP': 3,
 'FN': 3,
 'TP': 171}

In [43]:
# Register run_inference as a permanent stored procedure, replacing any
# existing procedure of the same name.
# NOTE(review): scikit-learn is pinned to 1.2.1 here while the training
# procedure and the deployed UDFs use 1.5.2 — confirm the mismatch is intended.
session.sproc.register(
    func=run_inference,
    name="run_inference",
    packages=[
        "snowflake-snowpark-python",
        "scikit-learn==1.2.1",
        "joblib==1.1.1",
        "xgboost==1.7.3",
    ],
    is_permanent=True,
    stage_location="@FUNCTION_STAGE",
    replace=True,
)

<snowflake.snowpark.stored_procedure.StoredProcedure at 0x146503a50>

In [46]:
# Same pipeline, executed through the registered stored procedure;
# arguments are positional: (test_table, model_id).
session.call("run_inference", "DATA_TABLE_3", "RF_201")

"{'accuracy': 0.9093567251461988, 'precision': 0.9090909090909091, 'recall': 0.9036144578313253, 'f1': 0.9063444108761329, 'auc_roc': 0.9091935925520264, 'TN': 161, 'FP': 15, 'FN': 16, 'TP': 150}"

In [48]:
# Read back the predictions table; run_inference overwrites it on every run,
# so this shows the output of the latest call only.
df_predictions_result = session.table("PREDICTIONS_RESULT").to_pandas()
df_predictions_result

Unnamed: 0,AGE,SEX,CP,TRESTBPS,CHOL,FBS,RESTECG,THALACH,EXANG,OLDPEAK,SLOPE,CA,THAL,TARGET,PREDICTION
0,50,0,2,120.0,219,0.0,1.0,158.0,0.0,1.6,1.0,0.0,2,1,1.0
1,40,1,0,152.0,223,0.0,1.0,181.0,0.0,0.0,2.0,0.0,3,0,1.0
2,60,1,2,140.0,185,0.0,0.0,155.0,0.0,3.0,1.0,0.0,2,0,0.0
3,54,0,2,160.0,201,0.0,1.0,163.0,0.0,0.0,2.0,1.0,2,1,1.0
4,65,1,0,110.0,248,0.0,0.0,158.0,0.0,0.6,2.0,2.0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,59,1,0,110.0,239,0.0,0.0,142.0,1.0,1.2,1.0,1.0,3,0,0.0
338,60,1,0,130.0,206,0.0,0.0,132.0,1.0,2.4,1.0,2.0,3,0,0.0
339,55,0,1,135.0,250,0.0,0.0,161.0,0.0,1.4,1.0,0.0,2,1,1.0
340,61,1,0,138.0,166,0.0,0.0,125.0,1.0,3.6,1.0,1.0,2,0,0.0
