# A Snowpark Workflow using XGBoost
We will use sample data on used cars and try to predict the price of a given car from various features.

## What you will learn to do in Snowflake/Snowpark
- how to prep data
- how to train / score the model
- how to register the model


In [None]:
#import necessary packages --- needs snowflake-ml-python 1.5.0 (select it in the upper right packages list)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from snowflake.ml.registry import Registry
import re




In [None]:
#snowflake libs
from snowflake.snowpark.session import Session
from snowflake.snowpark.context import get_active_session
# Obtain the currently active Snowpark session.
session = get_active_session()

# Point the session at the working database and schema for this lab.
session.use_database("A_HOL_REVIEWS")
session.use_schema("PUBLIC")

# Fully qualified raw input table, and the (unqualified) name of the table
# that will hold the prepared feature columns.
source_table_name = "LAB_DATA.PUBLIC.CARS_5000"
clean_table_name = "CARS_CLEANED"

# Echo the active context so it is obvious where objects will be created.
for current_value in (
    session.get_current_database(),
    session.get_current_schema(),
    session.get_current_role(),
):
    print(current_value)

In [None]:
# Familiar pandas code running on Snowflake: Snowpark reads the source table
# and materializes it as a local pandas DataFrame.
pd_df = session.table(source_table_name).to_pandas()

# Only the last expression of a notebook cell is rendered automatically, so
# the dtypes must be printed explicitly or they are silently discarded.
print(pd_df.dtypes)
pd_df

In [None]:
# Feature columns, grouped by how they are preprocessed further down.
numerical_cols = ['YEAR_PROD', 'MILEAGE', 'LEVY', 'CYL']  # standardized
categorical_cols = ['MAKE', 'MODEL_NAME', 'COLOR', 'FUEL_TYPE', 'DRIVE']  # one-hot encoded

# Keep only the features plus the PRICE target, as a frame of its own so the
# in-place preprocessing below never touches pd_df.
selected_cols = [*numerical_cols, *categorical_cols, 'PRICE']
pd_df_cleaned = pd_df[selected_cols].copy()


In [None]:
# Standardize the numerical columns in place.
# NOTE(review): the scaler and the encoder are fitted on the full dataset,
# before the train/test split performed later, so the held-out rows influence
# the fitted statistics. Consider fitting on the training rows only.
scaler = StandardScaler()
pd_df_cleaned[numerical_cols] = scaler.fit_transform(pd_df_cleaned[numerical_cols])

# One-hot encode the categorical columns.
encoder = OneHotEncoder(handle_unknown='ignore')
categorical_encoded = encoder.fit_transform(pd_df_cleaned[categorical_cols])
categorical_encoded = pd.DataFrame(
    categorical_encoded.toarray(),
    columns=encoder.get_feature_names_out(categorical_cols),
    # Reuse the source index: pd.concat(axis=1) aligns rows by index label, so
    # a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
    # pd_df_cleaned does not carry the default 0..n-1 index.
    index=pd_df_cleaned.index,
)

# Append the encoded columns, then drop the raw categorical ones.
pd_df_cleaned = pd.concat([pd_df_cleaned, categorical_encoded], axis=1)
pd_df_cleaned.drop(categorical_cols, axis=1, inplace=True)
columns = pd_df_cleaned.columns

def clean_column_names(columns):
    """Normalize column labels into unique, upper-case identifiers.

    Each label is converted to ``str``, stripped of every character that is
    not a letter, digit or underscore, and upper-cased. If two labels
    normalize to the same name, later occurrences get a numeric suffix
    (``_2``, ``_3``, ...) so the result never contains duplicates.

    Args:
        columns: Iterable of column labels (e.g. ``DataFrame.columns``).

    Returns:
        A ``(all_names, train_names)`` tuple of lists. ``all_names`` has one
        cleaned name per input label, in input order. ``train_names`` is the
        same list without any label that normalizes to ``'PRICE'`` (the
        prediction target).
    """
    clean_columns_all = []
    clean_columns_train = []
    seen = set()
    for col in columns:
        # \W keeps [A-Za-z0-9_]; everything else (spaces, dashes, dots) goes.
        base = re.sub(r'\W+', '', str(col)).upper()

        # Disambiguate collisions such as 'A-B' and 'AB', which would
        # otherwise yield two identically named columns.
        clean_col = base
        suffix = 2
        while clean_col in seen:
            clean_col = f"{base}_{suffix}"
            suffix += 1
        seen.add(clean_col)

        clean_columns_all.append(clean_col)
        # Compare the un-suffixed name so every label that normalizes to the
        # target stays out of the training features.
        if base != 'PRICE':
            clean_columns_train.append(clean_col)
    return clean_columns_all, clean_columns_train

# Derive cleaned, upper-case names for every column, and the subset of them
# (everything except PRICE) that is used as model input.
clean_columns_all, clean_columns_train = clean_column_names(pd_df_cleaned.columns)
pd_df_cleaned.columns = clean_columns_all

# Upload only the feature columns and persist them, replacing any table of
# the same name left over from a previous run.
feature_frame = pd_df_cleaned[clean_columns_train]
sf_df = session.create_dataframe(feature_frame)
table_writer = sf_df.write.mode("overwrite")
table_writer.save_as_table(clean_table_name)



In [None]:
#session.create_dataframe(pd_df_cleaned[clean_columns_train]).write.mode("overwrite").save_as_table("CARS_CLEANED")

In [None]:
# Read the persisted feature table back into a local pandas DataFrame.
cleaned_table = session.table(clean_table_name)
pandas_df = cleaned_table.to_pandas()

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state is fixed so the split, and every metric computed from it, is
# reproducible across notebook reruns.
X_train, X_test, y_train, y_test = train_test_split(
    pd_df_cleaned[clean_columns_train],
    pd_df_cleaned['PRICE'],
    test_size=0.2,
    random_state=42,
)
X_train.dtypes

In [None]:
# Wrap the train and test splits in xgboost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster configuration: squared-error regression with shallow trees.
params = dict(
    objective='reg:squarederror',  # for regression task
    max_depth=5,
    eta=0.1,
    eval_metric='rmse',
)

# Fit the booster for a fixed number of boosting rounds.
num_round = 100
bst_model = xgb.train(params, dtrain, num_boost_round=num_round)

# Predict prices for the held-out rows.
preds = bst_model.predict(dtest)

# Report root-mean-squared error on the held-out rows.
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")


In [None]:
# Scatter predicted against true prices; points on the dashed diagonal are
# perfect predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, preds, alpha=0.5, label='Prediction')

# The reference diagonal spans the observed range of true values.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Perfect Prediction')

ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predictions')
ax.legend()
plt.show()

In [None]:
# Open a model registry handle scoped to the session's current database and schema.
current_database = session.get_current_database()
current_schema = session.get_current_schema()
reg = Registry(
    session=session,
    database_name=current_database,
    schema_name=current_schema,
)

In [None]:
# Register our model; this also automatically creates a SQL function.
# The logged metric is the RMSE measured on the held-out split rather than a
# hard-coded placeholder, so the registry metadata reflects this model.
# float() converts the NumPy scalar to a plain Python number.
my_model_version = reg.log_model(
    bst_model,
    model_name="car_price_gbr_ym",
    version_name="v1",
    conda_dependencies=["scikit-learn"],
    comment="My awesome ML model",
    metrics={"rmse": float(rmse)},
    # A few training rows passed as sample input for the logged model.
    sample_input_data=X_train.head(10),
)

In [None]:
# Score a small random sample (fraction 0.01) of the cleaned table in Python.
sf_df = session.table(clean_table_name).sample(frac=0.01)

# Run the registered model version on the sample and cache the result.
scored_df = my_model_version.run(sf_df)
result_df = scored_df.cache_result()

# Show the prediction next to two of the input features.
preview_columns = ['"output_feature_0"', '"MILEAGE"', '"YEAR_PROD"']
result_df[preview_columns].show(n=15, max_width=1000)

In [None]:
--can score the model in ANY SQL now too
--(the model name must match the one registered above: car_price_gbr_ym)
select
    car_price_gbr_ym!predict(*):output_feature_0::number(10,0) PREDICTED_PRICE
from CARS_CLEANED limit 5;


In [None]:
--to score with a specific model version (V1 in this example), alias it first:
WITH model_version_alias AS MODEL car_price_gbr_ym VERSION v1
SELECT
    a.*,
    model_version_alias!predict(*):output_feature_0::number(10,0) PREDICTED_PRICE
FROM CARS_CLEANED a
LIMIT 5;