# A Snowpark Workflow for Regression
We will use sample data on used cars and try to predict a price for a given car using various features.

## What you will learn to do in Snowflake/Snowpark
- how to prep data
- how to train / score the model
- how to register the model


In [None]:
#import necessary packages --- needs snowflake-ml-python 1.5.0 (select it in the upper right packages list)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from snowflake.ml.registry import Registry
import re




In [None]:
#snowflake libs
from snowflake.snowpark.session import Session
from snowflake.snowpark.context import get_active_session
# Pick up the Snowpark session that is currently active.
session = get_active_session()

# Table the raw car data is read from, and table the prepared features are saved to.
source_table_name = "LAB_DATA.PUBLIC.CARS"
clean_table_name = "CARS_CLEANED"

#session.use_database("HOL_TAKETWO")
#session.use_schema("MY_NAME")

# Print the database, schema and role the session is currently using, in that order.
for read_context in (
    session.get_current_database,
    session.get_current_schema,
    session.get_current_role,
):
    print(read_context())

In [None]:
# Familiar Python on top of Snowflake: Snowpark reads the source table, caps the
# result at 5000 rows and hands it over as a local pandas DataFrame.
pd_df = (
    session.table(source_table_name)
    .limit(5000)
    .to_pandas()
)
def remove_outliers_iqr(df, column, lower_bound=3000, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the kept range.

    The upper fence follows the IQR rule, ``Q3 + iqr_multiplier * IQR``.
    By default the lower fence is a fixed floor (3000) rather than the
    IQR-derived one; pass ``lower_bound=None`` to use
    ``Q1 - iqr_multiplier * IQR`` instead. Both fences are inclusive.

    Args:
        df: pandas DataFrame to filter.
        column: Name of the numeric column the fences are computed on.
        lower_bound: Smallest value to keep, or ``None`` for the IQR fence.
        iqr_multiplier: How many IQRs beyond the quartile a value may lie.

    Returns:
        The filtered DataFrame. Row labels are those of ``df``; the index is
        not renumbered here.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    if lower_bound is None:
        lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    # Comparisons against NaN are False, so rows with a missing value are dropped.
    return df[(values >= lower_bound) & (values <= upper_bound)]
    
# Filter on PRICE, then renumber the rows 0..n-1. drop=True discards the old
# row labels instead of keeping them as an extra "index" column.
pd_df = remove_outliers_iqr(pd_df, 'PRICE').reset_index(drop=True)
pd_df.dtypes

In [None]:
# Feature columns, grouped by how they are preprocessed later.
numerical_cols = ['YEAR_PROD', 'MILEAGE', 'LEVY', 'CYL']  # scaled
categorical_cols = ['MAKE', 'MODEL_NAME', 'COLOR', 'FUEL_TYPE', 'DRIVE']  # one-hot encoded

# LEVY and MILEAGE are handled as text here: strip the non-numeric characters,
# treat an empty result as 0, then convert explicitly so the columns end up
# numeric instead of object columns mixing strings with the integer 0.
pd_df['LEVY'] = pd.to_numeric(
    pd_df['LEVY'].str.replace('-', '', regex=False).replace('', 0)
)
pd_df['MILEAGE'] = pd.to_numeric(
    pd_df['MILEAGE'].str.replace(r'[a-zA-Z ]', '', regex=True).replace('', 0)
)

# Work on an independent copy holding only the features and the PRICE target.
pd_df_cleaned = pd_df[numerical_cols + categorical_cols + ['PRICE']].copy()


In [None]:


# NOTE(review): the scaler and encoder below are fitted on the full data set,
# before the train/test split further down - confirm that is acceptable here.

# Standardize the numerical columns in place.
scaler = StandardScaler()
pd_df_cleaned[numerical_cols] = scaler.fit_transform(pd_df_cleaned[numerical_cols])

# One-hot encode the categorical columns. The encoded frame reuses the source
# row labels so the column-wise concat below pairs rows up explicitly instead
# of relying on both frames happening to share a 0..n-1 index.
encoder = OneHotEncoder(handle_unknown='ignore')
categorical_encoded = pd.DataFrame(
    encoder.fit_transform(pd_df_cleaned[categorical_cols]).toarray(),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=pd_df_cleaned.index,
)
pd_df_cleaned = pd.concat([pd_df_cleaned, categorical_encoded], axis=1)
pd_df_cleaned.drop(categorical_cols, axis=1, inplace=True)
columns = pd_df_cleaned.columns

def clean_column_names(columns):
    """Normalize column labels and pick out the ones used as model features.

    Each label is processed in this order:
      1. every run of non-word characters is removed (letters, digits and
         underscores - including non-ASCII letters - are kept);
      2. the literal text 'ᲡᲮᲕᲐ' is replaced with 'A';
      3. the result is upper-cased.

    Because step 2 runs before step 3, a label that only becomes 'ᲡᲮᲕᲐ'
    through upper-casing is not replaced.

    Args:
        columns: Iterable of string column labels.

    Returns:
        A ``(all_names, train_names)`` tuple of lists. ``all_names`` has one
        cleaned name per input label, in order. ``train_names`` is the same
        list without 'PRICE' and without 'MAKE_ᲡᲮᲕᲐ'.
    """
    not_features = ('PRICE', 'MAKE_ᲡᲮᲕᲐ')
    clean_columns_all = [
        re.sub(r'\W+', '', label).replace('ᲡᲮᲕᲐ', 'A').upper()
        for label in columns
    ]
    clean_columns_train = [
        name for name in clean_columns_all if name not in not_features
    ]
    return clean_columns_all, clean_columns_train

# Build the cleaned name for every column, plus the subset used as model features.
clean_columns_all, clean_columns_train = clean_column_names(pd_df_cleaned.columns)

# Install the cleaned names, then map the 'MAKE_ᲡᲮᲕᲐ' column (if present) to an ASCII name.
pd_df_cleaned.columns = clean_columns_all
pd_df_cleaned = pd_df_cleaned.rename(columns={'MAKE_ᲡᲮᲕᲐ': 'MAKE_UNKNOWN'})

# Send only the feature columns to Snowflake and save them, overwriting the table.
feature_frame = pd_df_cleaned[clean_columns_train]
sf_df = session.create_dataframe(feature_frame)
sf_df.write.mode("overwrite").save_as_table(clean_table_name)



In [None]:
#session.create_dataframe(pd_df_cleaned).write.mode("overwrite").save_as_table("CARS_CLEANED_ALL_COLUMNS")

In [None]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and every metric computed from it, is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    pd_df_cleaned[clean_columns_train],
    pd_df_cleaned['PRICE'],
    test_size=0.2,
    random_state=42,
)
X_train.dtypes

In [None]:

# Quantile levels to model.
quantiles = [0.1, 0.5, 0.9]

# Fit one gradient-boosting regressor per quantile and score the test rows with
# it right away; both dictionaries are keyed by the quantile level.
models, predictions = {}, {}
for q in quantiles:
    model = GradientBoostingRegressor(
        loss='quantile',
        alpha=q,
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
    )
    model.fit(X_train, y_train)
    models[q] = model
    predictions[q] = model.predict(X_test)

# Calculate quantile loss
def quantile_loss(q, y_true, y_pred):
    """Return the mean quantile (pinball) loss of ``y_pred`` against ``y_true``.

    Args:
        q: Quantile level the predictions target.
        y_true: Observed values.
        y_pred: Predicted values, element-wise compatible with ``y_true``.

    Returns:
        The mean over all elements of ``max(q * e, (q - 1) * e)``, where
        ``e = y_true - y_pred``.
    """
    residual = y_true - y_pred
    under_prediction_cost = q * residual
    over_prediction_cost = (q - 1) * residual
    return np.mean(np.maximum(under_prediction_cost, over_prediction_cost))

# Print the quantile loss of each model on the test split.
for q in quantiles:
    loss = quantile_loss(q, y_test, predictions[q])
    print(f"Quantile loss for quantile {q}: {loss}")

# Coverage: share of test rows whose true value falls between the 0.1 and 0.9
# quantile predictions (bounds included).
at_or_above_lower = predictions[0.1] <= y_test
at_or_below_upper = y_test <= predictions[0.9]
coverage = np.mean(at_or_above_lower & at_or_below_upper)
print(f"Coverage probability for 10th and 90th percentiles: {coverage}")

# The 0.5-quantile (median) model is the one carried forward.
gbr_model = models[0.5]


In [None]:
# Scatter the median predictions against the true values, with a dashed red
# diagonal marking where a perfect prediction would fall.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions[0.5], alpha=0.5, label='Median Prediction')

true_value_range = [y_test.min(), y_test.max()]
plt.plot(true_value_range, true_value_range, 'r--', lw=2, label='Perfect Prediction')

plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Quantile Regression Predictions')
plt.legend()
plt.show()

In [None]:
# Open a model registry in the database and schema the session currently uses.
registry_location = {
    "database_name": session.get_current_database(),
    "schema_name": session.get_current_schema(),
}
reg = Registry(session=session, **registry_location)

In [None]:
# Register the median model. The logged metrics are measured on the held-out
# test split rather than hard-coded, so the registry entry reflects this run.
holdout_predictions = gbr_model.predict(X_test)
my_model_version = reg.log_model(
    gbr_model,
    model_name="car_price_gbr",
    version_name="v6",
    conda_dependencies=["scikit-learn"],
    comment="My awesome ML model",
    metrics={
        "mean_absolute_error": float(mean_absolute_error(y_test, holdout_predictions)),
        "root_mean_squared_error": float(np.sqrt(mean_squared_error(y_test, holdout_predictions))),
    },
    # A few training rows are passed as sample input for the registry.
    sample_input_data=X_train.head(10),
)

In [None]:
# Score a random sample (fraction 0.01) of the saved feature table in Python,
# cache the result, and preview the prediction next to two of the input columns.
sf_df = session.table(clean_table_name).sample(frac=.01)
result_df = my_model_version.run(sf_df).cache_result()
preview_columns = ['"output_feature_0"', '"MILEAGE"', '"LEVY"']
result_df[preview_columns].show(n=15, max_width=1000)

In [None]:
-- The registered model can be scored from SQL as well.
SELECT
    car_price_gbr!predict(*):output_feature_0::NUMBER(10, 0) AS PREDICTED_PRICE
FROM CARS_CLEANED
LIMIT 5;


## QUIZ

See if you can change the output of the model from "output_feature_0" to "PRICE"

Check out the docs on [Signatures](https://docs.snowflake.com/en/developer-guide/snowpark-ml/model-registry/model-signature)
