In [4]:
import toml
import snowflake.connector
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import call_udf, array_construct, pandas_udf, col, udf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    ConfusionMatrixDisplay,
)
import optuna

In [2]:
# Read the Snowflake connection parameters from the TOML file one directory up
# and open a Snowpark session from its "snowflake_connection" section.
config = toml.load("../config.toml")
connection_parameters = config["snowflake_connection"]
session = Session.builder.configs(connection_parameters).create()

# Check if the connection and database are correct:
# the query returns the warehouse, database, schema, user and role of this session.
# NOTE(review): the recorded output shows CURRENT_SCHEMA()=None — confirm that the
# unqualified table name used later resolves to the intended schema.
session.sql(
    "select current_warehouse(), current_database(), current_schema(), current_user(), current_role()"
).collect()

[Row(CURRENT_WAREHOUSE()='COMPUTE_WH', CURRENT_DATABASE()='HEART_DB', CURRENT_SCHEMA()=None, CURRENT_USER()='JOANABAIAO', CURRENT_ROLE()='ACCOUNTADMIN')]

# Train the model (locally)


Before implementing the training process within Snowflake, I first tested a simple machine learning model locally. For this initial experiment, I used the Random Forest classifier.

The goal is to experiment with and fine-tune the model, ensuring that the chosen approach works effectively before integrating it into a Stored Procedure.


In [3]:
# Pull the Snowflake table into a local pandas DataFrame for the local experiment.
table_name = "DATA_TABLE_1"
df = session.table(table_name).to_pandas()
# Preview the first rows.
df.head()

Unnamed: 0,AGE,SEX,CP,TRESTBPS,CHOL,FBS,RESTECG,THALACH,EXANG,OLDPEAK,SLOPE,CA,THAL,TARGET
0,59,1,0,135.0,234,0.0,1.0,161.0,0.0,0.5,1.0,0.0,3,1
1,59,1,0,140.0,177,0.0,1.0,162.0,1.0,0.0,2.0,1.0,3,0
2,70,1,1,156.0,245,0.0,0.0,143.0,0.0,0.0,2.0,0.0,2,1
3,65,0,0,150.0,225,0.0,0.0,114.0,0.0,1.0,1.0,3.0,3,0
4,58,1,2,105.0,240,0.0,0.0,154.0,1.0,0.6,1.0,0.0,3,1


Split into train-test sets and scale


In [None]:
# Features are every column except the label; the label is the TARGET column.
# NOTE(review): the preview of `df` lists an "Unnamed: 0" header — if that is a real
# column rather than the rendered index, it is being used as a feature; confirm.
X = df.drop(columns="TARGET")
y = df["TARGET"]

# Hold out 20% of the rows for the final evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply them to both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Optuna, a hyperparameter optimization framework, is used to fine-tune the hyperparameters and select the best model based on cross-validation results.

**Steps:**

1. Define the Optuna objective and callback functions
2. Run the Optuna study
3. Analyze results of the best trial

**Callback function:** The callback function is triggered at the end of each trial. It checks whether the current trial is the best trial so far using `study.best_trial.number == trial.number`. If so, it updates the study's `user_attrs` with the model from the current trial.


In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated ROC AUC of a random forest.

    Samples ``max_depth``, ``n_estimators`` and ``max_samples`` from ``trial``,
    scores a ``RandomForestClassifier`` built from them with ``cross_val_score``
    on the notebook-level ``X_train_scaled`` / ``y_train``, and returns the mean
    score (the study maximizes it).

    The classifier object is also stored on the trial under the user attribute
    ``"best_model"``. This function does not call ``fit`` on it; the notebook
    fits the selected one later.
    """
    # Sample the hyperparameters for this trial.
    # NOTE(review): the third parameter is registered as "rf_max_samples", unlike the
    # other two whose Optuna names equal the estimator keyword, so `trial.params`
    # cannot be passed straight to RandomForestClassifier(**params).
    depth = trial.suggest_int("max_depth", 2, 64, log=True)
    n_trees = trial.suggest_int("n_estimators", 5, 50, log=True)
    sample_fraction = trial.suggest_float("rf_max_samples", 0.2, 1)

    classifier = RandomForestClassifier(
        max_depth=depth,
        n_estimators=n_trees,
        max_samples=sample_fraction,
        random_state=42,
    )

    # Average the per-fold ROC AUC over 3 cross-validation folds
    fold_scores = cross_val_score(
        classifier, X_train_scaled, y_train, cv=3, scoring="roc_auc"
    )
    mean_auc = fold_scores.mean()

    # Keep the classifier on the trial so the callback can pick it up
    trial.set_user_attr(key="best_model", value=classifier)

    return mean_auc


def callback(study, trial):
    """Copy the just-finished trial's model onto the study when it is the best trial.

    Compares ``study.best_trial.number`` with ``trial.number``; on a match, the
    trial's ``"best_model"`` user attribute is stored under the same key on the
    study. Otherwise nothing is changed.
    """
    is_new_best = study.best_trial.number == trial.number
    if not is_new_best:
        return
    model = trial.user_attrs["best_model"]
    study.set_user_attr(key="best_model", value=model)

In [None]:
# Seeded random search; the study maximizes the value returned by `objective`.
search_sampler = optuna.samplers.RandomSampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=search_sampler)
study.optimize(objective, n_trials=100, callbacks=[callback])

print("Best Trial:")
best_trial = study.best_trial
# The callback stored the best trial's model on the study under this key.
best_model = study.user_attrs["best_model"]

print("Best Score: ", best_trial.value)
print("Best Params: ")
for param_name, param_value in best_trial.params.items():
    print("  {}: {}".format(param_name, param_value))

Now let's rebuild our classifier with the parameters from Optuna.

The optimized model is trained on the entire training set and evaluated on the test set. Key metrics include:

- Confusion matrix
- Classification metrics (accuracy, precision, recall, F1-score, ROC AUC)


In [None]:
# Fit best model
# Take the classifier stored on the best trial and train it on the full training set
# (`fit` returns the estimator itself, so the name is bound to the fitted model).
best_trial = study.best_trial
best_model = best_trial.user_attrs["best_model"].fit(X_train_scaled, y_train)

# Hard class predictions for the held-out test rows
y_pred = best_model.predict(X_test_scaled)

In [None]:
# Threshold-dependent metrics are computed from the hard class predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC AUC measures ranking quality, so it needs continuous scores. Feeding it the
# 0/1 predictions collapses the curve to a single operating point and makes the
# number incomparable with the "roc_auc" scorer used during cross-validation.
# Column 1 of predict_proba is the probability of the class with the greater label.
y_score = best_model.predict_proba(X_test_scaled)[:, 1]
roc_auc = roc_auc_score(y_test, y_score)

# Print the metrics
print("Model Evaluation Metrics:")
print(f"* Accuracy: {accuracy:.4f}")
print(f"* Precision: {precision:.4f}")
print(f"* Recall (Sensitivity): {recall:.4f}")
print(f"* F1-Score: {f1:.4f}")
print(f"* ROC AUC Score: {roc_auc:.4f}")
print()

# Pin the label order to the model's classes so the matrix always has one row/column
# per class (keeps the 4-way unpack valid) and matches the display labels below.
conf_matrix = confusion_matrix(y_test, y_pred, labels=best_model.classes_)
TN, FP, FN, TP = conf_matrix.ravel()

print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("True Positive (TP):", TP)


disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix, display_labels=best_model.classes_
)
disp.plot(cmap=plt.cm.Blues, values_format="g")
plt.show()

### Visualizing the Optuna Results


In [None]:
# Optimization history of the study's trials (the objective is the mean CV ROC AUC).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters against the objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Slice plots for two of the three searched hyperparameters; the third one is
# registered in the study under the name "rf_max_samples" and is not shown here.
optuna.visualization.plot_slice(study, params=["n_estimators", "max_depth"])

In [None]:
# Estimated importance of each hyperparameter for the objective value.
optuna.visualization.plot_param_importances(study)