# Using the Snowflake Model Registry
We will use prepared data in Snowflake on used car prices and try to predict the price of a given car using ML functionality.  The model we create will be registered in the Snowflake Model Registry and then used from both Python and SQL in Snowflake.

## What you will learn to do in Snowflake/Snowpark
- how to register a model in Snowflake


In [None]:
#import necessary packages --- needs snowpark-ml-python 1.5.0 (select it in the upper right packages list)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from snowflake.ml.registry import Registry
import re




In [None]:
#snowflake libs
from snowflake.snowpark.session import Session
from snowflake.snowpark.context import get_active_session
# Obtain the currently active Snowpark session; every later cell goes through it.
session = get_active_session()

# Fully qualified names of the lab tables referenced by this notebook.
source_table_name = "LAB_DATA.PUBLIC.CARS"
clean_table_name = "LAB_DATA.PUBLIC.CARS_CLEANED_ALL_COLUMNS"

MY_NAME = 'MY_NAME'  ##!!!CHANGE THIS TO YOUR NAME!!!

# Switch to the lab database and work inside a schema named after MY_NAME,
# creating that schema first if it does not exist yet.
session.use_database("HOL_SIMON")
session.sql(f"create schema if not exists {MY_NAME};").collect()
session.use_schema(MY_NAME)

# Echo the database, schema and role now in effect so the context can be verified.
for current_value in (
    session.get_current_database(),
    session.get_current_schema(),
    session.get_current_role(),
):
    print(current_value)

In [None]:
# Familiar pandas code from here on: Snowpark is only used to fetch the rows.
# At most 5000 rows of the cleaned table are pulled into a local DataFrame.
# NOTE(review): no ordering is applied before limit(), so which rows come back
# is presumably not guaranteed to be stable between runs -- confirm if that matters.
cleaned_table = session.table(clean_table_name)
pd_df_cleaned = cleaned_table.limit(5000).to_pandas()

# Preview the first five rows.
pd_df_cleaned.head(5)


In [None]:
#split training data out
def get_training_columns(columns):
    """Return the feature column names, i.e. everything except 'PRICE'.

    Args:
        columns: Any iterable of column names. Iterating a pandas DataFrame
            yields its column labels, so a DataFrame can be passed directly.

    Returns:
        A new list holding every name from ``columns`` that is not exactly
        'PRICE', in the original order.
    """
    return [name for name in columns if name != 'PRICE']
    
# Store the feature column list under its own name. Assigning it back to
# `get_training_columns` would replace the helper function with a list and
# make the helper uncallable for the rest of the session.
training_columns = get_training_columns(pd_df_cleaned.columns)

# Features are every non-PRICE column, the target is PRICE; 20% of the rows
# are held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    pd_df_cleaned[training_columns],
    pd_df_cleaned['PRICE'],
    test_size=0.2,
)

In [None]:

# Quantiles to model: a lower bound (0.1), the median (0.5) and an upper bound (0.9).
quantiles = [0.1, 0.5, 0.9]

# Fit one gradient-boosting regressor per quantile. With loss='quantile',
# `alpha` is the quantile that the model is trained to predict.
models = {}
for q in quantiles:
    model = GradientBoostingRegressor(
        loss='quantile',
        alpha=q,
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
    ).fit(X_train, y_train)
    models[q] = model

# Score the held-out rows with every fitted model, keyed by quantile.
predictions = {}
for q, model in models.items():
    predictions[q] = model.predict(X_test)

# Quantile loss used to evaluate each quantile model
def quantile_loss(q, y_true, y_pred):
    """Return the mean quantile loss of ``y_pred`` against ``y_true``.

    For each residual ``e = y_true - y_pred`` the per-sample loss is
    ``max(q * e, (q - 1) * e)``; the function returns the mean of those values.

    Args:
        q: The quantile the predictions target.
        y_true: Observed values (scalar or array-like supporting subtraction).
        y_pred: Predicted values, compatible with ``y_true``.

    Returns:
        The mean per-sample loss as computed by ``np.mean``.
    """
    residual = y_true - y_pred
    scaled_by_q = q * residual
    scaled_by_q_minus_one = (q - 1) * residual
    return np.mean(np.maximum(scaled_by_q, scaled_by_q_minus_one))

# Report the quantile loss of each model on the held-out set.
for q in quantiles:
    quantile_predictions = predictions[q]
    loss = quantile_loss(q, y_test, quantile_predictions)
    print(f"Quantile loss for quantile {q}: {loss}")

# Coverage: the fraction of test rows whose true price lies between the
# 0.1-quantile and 0.9-quantile predictions (bounds inclusive).
lower_bound = predictions[0.1]
upper_bound = predictions[0.9]
inside_interval = (lower_bound <= y_test) & (y_test <= upper_bound)
coverage = np.mean(inside_interval)
print(f"Coverage probability for 10th and 90th percentiles: {coverage}")

# The median (0.5-quantile) model is the one carried forward as gbr_model.
gbr_model = models[0.5]


In [None]:
# Visualize predictions: true price vs. median-model prediction for the test rows.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions[0.5], alpha=0.5, label='Median Prediction')

# Dashed red diagonal spanning the observed price range: points on this line
# are rows where the prediction equals the true value.
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--', lw=2, label='Perfect Prediction')

ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Quantile Regression Predictions')
ax.legend()
plt.show()

In [None]:
# Get a model registry object bound to the database and schema that the
# session is currently using.
current_database = session.get_current_database()
current_schema = session.get_current_schema()
reg = Registry(
    session=session,
    database_name=current_database,
    schema_name=current_schema,
)

In [None]:
# Register the median model in the registry as version "v1" of
# "car_price_gbr_jordan"; once logged, it can also be called from SQL.
# The first ten training rows are supplied as sample input data.
# NOTE(review): the "score" metric is a hard-coded literal (96), not a value
# computed from the evaluation above.
sample_rows = X_train.head(10)
my_model_version = reg.log_model(
    gbr_model,
    model_name="car_price_gbr_jordan",
    version_name="v1",
    comment="My awesome ML model",
    metrics={"score": 96},
    conda_dependencies=["scikit-learn"],
    sample_input_data=sample_rows,
)

In [None]:
# Score some data in Python: take a 0.01-fraction sample of the cleaned table,
# run the registered model version on it, and cache the result.
sf_df = session.table(clean_table_name).sample(frac=0.01)
result_df = my_model_version.run(sf_df).cache_result()

# Show the model output next to the actual price and two of the input columns.
columns_to_show = ['"output_feature_0"', '"PRICE"', '"MILEAGE"', '"LEVY"']
result_df[columns_to_show].show(n=15, max_width=1000)

In [None]:
-- The registered model can now be scored from any SQL statement as well.
-- predict(*) passes every column of the row to the model; the
-- output_feature_0 element of the result is extracted and cast to a whole number.
-- (The dangling comma after the last select-list item was removed: it is not
-- standard SQL and only works where the parser tolerates it.)
select
    car_price_gbr_jordan!predict(*):output_feature_0::number(10,0) as PREDICTED_PRICE
from LAB_DATA.PUBLIC.CARS_CLEANED_NO_PRICE
limit 5;


## QUIZ

See if you can change the name of the model's output column from "output_feature_0" to "PRICE".

Check out the docs on [Signatures](https://docs.snowflake.com/en/developer-guide/snowpark-ml/model-registry/model-signature)
