In [10]:
# Generic
import io

# Joblib
import joblib

# Snowpark ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.snowpark.functions import sproc
from snowflake.snowpark import Session

#### Prerequisites
1.	Download data file from https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data
2.	Download Kaggle data set and upload it in snowflake
a.	You can use the snowflake snowsight (web UI) to upload a dataset and create a table in a single step.
b.	In this solution, the table is created with the name HOUSE_PRICES_RAW_DATA for the train data set and HOUSE_PRICES_TEST_DATA for the test data set.
3.	Create a Kaggle or Google Colab account, or set up Jupyter to run locally.
4. If you have already followed the previous chapters, you will already have a database called "RAW" with a schema called "RETAIL". If not, note that this example uses RAW.RETAIL as the database and schema; change it as appropriate.
5. The example also assumes you have a database called "COMMONS" with schema called "UTILS" which is used for storing models and stored procedures.


#### Create session and load data

In [11]:
# Credentials are read from environment variables so that secrets are never
# stored in the saved notebook. An unset variable falls back to the blank
# placeholder, so you can still fill the values in by hand while experimenting.
import os

connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", ""),
    "user": os.environ.get("SNOWFLAKE_USER", ""),
    "password": os.environ.get("SNOWFLAKE_PASSWORD", ""),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", ""),  # optional
    # default database/schema of the session; unqualified names such as
    # @SNOWMODELS used later in this notebook resolve against COMMONS.UTILS
    "database": "COMMONS",
    "schema": "UTILS",
}

session = Session.builder.configs(connection_parameters).create()

#### Add packages which are needed within your functions

As you can see, only a minimal set of packages is needed because we use the Snowpark ML API to perform the preprocessing, model creation and management.

In [12]:
# Packages that the functions registered from this session depend on
required_packages = ['snowflake-snowpark-python', 'snowflake-ml-python']
session.add_packages(*required_packages)

#### Create a stage to store your model

In [21]:
# Adjacent string literals are joined by Python into a single DDL statement.
query = (
    "create or replace stage commons.utils.snowmodels"
    " directory = (enable = true)"
    " copy_options = (on_error='skip_file')"
)

# collect() executes the statement and returns the status row
session.sql(query).collect()

[Row(status='Stage area SNOWMODELS successfully created.')]

#### Input variables

We define the training table name, source columns and the target column available within the table.

In [13]:
# Fully qualified name of the table that holds the training data
training_table = 'RAW.RETAIL.HOUSE_PRICES_RAW_DATA'

# Feature columns read from the table
src_cols = [
    "BLDGTYPE",
    "OVERALLCOND",
    "MSSUBCLASS",
    "MSZONING",
    "LOTAREA",
    "LOTCONFIG",
    "YEARBUILT",
    "FOUNDATION",
]

# Column the model learns to predict
target_col = 'SALEPRICE'

Check that the table can be read by selecting the source columns from a sample of up to 1000 rows and displaying the resulting column names.

In [15]:
# Take at most 1000 rows of the training table and keep only the source columns
df = (
    session.table(training_table)
    .limit(1000)
    .select(src_cols)
)

In [16]:
# display the column names of the sampled frame to confirm the selection worked
df.columns

['BLDGTYPE',
 'OVERALLCOND',
 'MSSUBCLASS',
 'MSZONING',
 'LOTAREA',
 'LOTCONFIG',
 'YEARBUILT',
 'FOUNDATION']

#### Create stored procedure to train and upload the model

We create a function that will be deployed as a stored procedure to train a random forest regression model on the sample dataset to predict house prices. 
We will use Snowpark ML features instead of sklearn for the preprocessing and training. The trained model is then serialized with joblib and uploaded to the stage created above.
The response from this procedure is important as it is later used to retrieve the features used by the model.

Note that I did not create a named procedure; instead I went with a temporary procedure (Snowflake assigns a random name and associates it with the local function name). The temporary stored procedure is only available within the current Snowflake session. Once the session is closed, you will not be able to call or use it from another session, such as a Snowflake worksheet or a different Jupyter notebook.


In [22]:
def save_file(session, model, path):
    """
    Serialize a model with joblib and upload the bytes to a Snowflake stage path.

    The upload goes through ``session._conn._cursor.upload_stream``, which is a
    private attribute chain of the session object rather than a public API, so
    it may change between library versions.

    Args:
        session: A Snowflake session object that represents the connection to the Snowflake database.
        model: The machine learning model object that needs to be saved.
        path: The stage path (for example "@STAGE/model.joblib") the serialized model is uploaded to.

    Returns:
        str: success message containing the target path
    """

    # serialize the model into an in-memory buffer instead of a local file
    input_stream = io.BytesIO()
    joblib.dump(model, input_stream)

    # joblib.dump leaves the stream positioned at its end; rewind so the upload
    # reads the serialized model from the first byte
    input_stream.seek(0)

    # upload the buffer's content to the requested stage location
    session._conn._cursor.upload_stream(input_stream, path)
    return "successfully created file: " + path

@sproc(replace=True)
def train_model(session: Session, training_table: str, src_cols: list, target_col: str) -> str:
    """
    Train a random forest regression model on a Snowflake table and upload it to a stage.

    Steps performed:
      1. Read ``src_cols`` plus ``target_col`` from ``training_table``.
      2. Normalize the column names and one-hot encode every string column.
      3. Clean the column names produced by the encoder.
      4. Split the data 80/20, fit a 10-tree RandomForestRegressor on the larger
         part and predict on the smaller part.
      5. Upload the fitted model to "@SNOWMODELS/houseprice_estimator.joblib".

    Args:
        session (snowflake.snowpark.Session): A Snowflake session object.
        training_table (str): The name of the training table in Snowflake.
        src_cols (list): A list of source columns to use for training the model.
            The list is not modified.
        target_col (str): The target column to predict.

    Returns:
        str: JSON string with the keys ``model_name``, ``model_features`` (all
        columns of the encoded data set, including the target column) and
        ``mean_absolute_percentage_error`` (computed on the 20% split).
    """
    import json
    import logging
    import re

    from snowflake.ml.modeling.ensemble.random_forest_regressor import RandomForestRegressor
    from snowflake.ml.modeling.metrics import mean_absolute_percentage_error

    logger = logging.getLogger("train_model")

    def replace_characters(x: str, regex_string: str = '[^a-zA-Z,_]') -> str:
        """Removes regex patterns from string and upper-cases the result.

        Args:
            x (str) : Target string to make replacement.
            regex_string (str) : Regex string to remove from x.

        Returns
            str (None when x is not a string)

        """
        if isinstance(x, str):
            return re.sub(regex_string, '', x).upper()
        return None

    def clean_encoded_name(name: str) -> str:
        """Drops double quotes and replaces every other special character with an underscore."""
        without_quotes = re.sub(r'"', '', name)
        return re.sub(r'[^_a-zA-Z0-9]', '_', without_quotes)

    # build a new list so that the src_cols argument is left untouched
    cols = list(src_cols) + [target_col]

    # load the raw data from the training table and extract only the needed columns
    train_dataset = session.table(training_table)[cols]

    # normalize the column names; this step is needed for OneHotEncoder to work
    for column in train_dataset.columns:
        train_dataset = train_dataset.with_column_renamed(column, replace_characters(column))

    # the label column goes through the same two cleaning steps as the data set
    # columns, so this is the name it carries in the encoded data set
    label_col = clean_encoded_name(replace_characters(target_col))

    # detect the categorical (string) columns AFTER the rename so that the names
    # handed to the encoder are the ones that exist in the data set
    categorical_columns = [column for column, dtype in train_dataset.dtypes if dtype.startswith("string")]

    # one-hot encode the categorical columns, replacing the original columns
    OH_encoder = snowml.OneHotEncoder(input_cols=categorical_columns, output_cols=categorical_columns, drop_input_cols=True)
    train_df = OH_encoder.fit(train_dataset).transform(train_dataset)

    # the encoding adds unwanted quotes and some special characters into the new
    # column names, so clean them up
    for column in train_df.columns:
        train_df = train_df.with_column_renamed(column, clean_encoded_name(column))

    # NOTE(review): neither the split nor the forest is seeded, so the reported
    # error differs from run to run
    XY_train, XY_test = train_df.random_split(weights=[0.8, 0.2])

    model_RFR = RandomForestRegressor(n_estimators=10, label_cols=label_col)
    model_RFR.fit(XY_train)

    result = model_RFR.predict(XY_test)

    # create a dictionary containing the model name, the columns of the encoded
    # data set and the mean absolute percentage error
    model_info = dict()
    model_info['model_name'] = "houseprice_estimator"
    model_info['model_features'] = train_df.columns
    # no output_cols were given, so the prediction column is expected to be the
    # label name prefixed with "OUTPUT_"
    model_info['mean_absolute_percentage_error'] = mean_absolute_percentage_error(
        df=result,
        y_true_col_names=label_col,
        y_pred_col_names="OUTPUT_" + label_col,
    )

    # serialize the trained model and upload it to the stage
    logger.info('Saving Model into Stage')
    path = save_file(session, model_RFR, "@SNOWMODELS/houseprice_estimator.joblib")
    logger.info('Saved Model:'+path)

    return json.dumps(model_info)

In [23]:
# invoke the procedure to create and upload the model; the value returned is the
# JSON string built by train_model
model_results = train_model(training_table, src_cols, target_col)

In [24]:
import json
# parse the JSON string returned by train_model into a Python dict
model_results_json = json.loads(model_results)

In [25]:
# pretty-print the parsed model information with a 4-space indent
print(json.dumps(model_results_json, indent=4))

{
    "model_name": "houseprice_estimator",
    "model_features": [
        "BLDGTYPE_1FAM",
        "BLDGTYPE_2FMCON",
        "BLDGTYPE_DUPLEX",
        "BLDGTYPE_TWNHS",
        "BLDGTYPE_TWNHSE",
        "MSZONING__C__ALL__",
        "MSZONING_FV",
        "MSZONING_RH",
        "MSZONING_RL",
        "MSZONING_RM",
        "LOTCONFIG_CORNER",
        "LOTCONFIG_CULDSAC",
        "LOTCONFIG_FR2",
        "LOTCONFIG_FR3",
        "LOTCONFIG_INSIDE",
        "FOUNDATION_BRKTIL",
        "FOUNDATION_CBLOCK",
        "FOUNDATION_PCONC",
        "FOUNDATION_SLAB",
        "FOUNDATION_STONE",
        "FOUNDATION_WOOD",
        "OVERALLCOND",
        "MSSUBCLASS",
        "LOTAREA",
        "YEARBUILT",
        "SALEPRICE"
    ],
    "mean_absolute_percentage_error": 0.20127441942975144
}


In [26]:
# Build a new list without the target column instead of calling .remove() on the
# parsed JSON: the cell can be re-run without raising ValueError and
# model_results_json keeps the full column list returned by train_model.
model_features = [
    feature
    for feature in model_results_json['model_features']
    if feature != target_col
]
model_features

['BLDGTYPE_1FAM',
 'BLDGTYPE_2FMCON',
 'BLDGTYPE_DUPLEX',
 'BLDGTYPE_TWNHS',
 'BLDGTYPE_TWNHSE',
 'MSZONING__C__ALL__',
 'MSZONING_FV',
 'MSZONING_RH',
 'MSZONING_RL',
 'MSZONING_RM',
 'LOTCONFIG_CORNER',
 'LOTCONFIG_CULDSAC',
 'LOTCONFIG_FR2',
 'LOTCONFIG_FR3',
 'LOTCONFIG_INSIDE',
 'FOUNDATION_BRKTIL',
 'FOUNDATION_CBLOCK',
 'FOUNDATION_PCONC',
 'FOUNDATION_SLAB',
 'FOUNDATION_STONE',
 'FOUNDATION_WOOD',
 'OVERALLCOND',
 'MSSUBCLASS',
 'LOTAREA',
 'YEARBUILT']

### Unit test the model using a python stored procedure

Like above, I'm using an anonymous (temporary) procedure. All operations happen within the procedure: reading the test data from Snowflake, loading the pre-trained model from the staged file, encoding the categorical variables in the test dataset and aligning its columns with the model's features, then writing the prepared data to a results table and returning the list of feature columns (the prediction call itself is currently commented out in the code).

Do not consider this as a best practice for unit testing and only consider this as a quick and dirty way to test your model.

In [31]:
import os
import sys

# register the staged model file as a session import; read_file below looks for
# it under the "snowflake_import_directory" path
session.add_import("@SNOWMODELS/houseprice_estimator.joblib")  

def read_file(filename):
    """
    Load a joblib-serialized model from the Snowflake import directory.

    The directory is taken from the "snowflake_import_directory" entry of
    ``sys._xoptions``; files added as imports are looked up under that local path.

    Args:
        filename (str): name of model file

    Returns:
        The object deserialized by joblib, or None when the
        "snowflake_import_directory" option is missing or empty.
    """
    import_dir = sys._xoptions.get("snowflake_import_directory")
    if not import_dir:
        # no import directory configured: nothing to load
        return None

    model_path = os.path.join(import_dir, filename)
    # joblib.load unpickles the file, so only load files you trust
    with open(model_path, 'rb') as handle:
        return joblib.load(handle)

@sproc(replace=True)
def test_model(session: Session, test_table: str, src_cols: list) -> list:
    """
    Prepare the test data for the trained model and write it to a results table.

    The procedure loads the model file, reads ``src_cols`` from ``test_table``,
    one-hot encodes the string columns, aligns the resulting columns with the
    notebook-level ``model_features`` list and writes the aligned data to
    RAW.RETAIL.HOUSE_PRICES_TEST_DATA_RESULT (overwriting a previous result).

    NOTE(review): the model is loaded but no prediction is made; the original
    prediction line was commented out. Confirm the intended output column before
    enabling ``model_RFR.predict``.

    Args:
        session (snowflake.snowpark.Session): a Snowflake session object
        test_table (str): the name of the test table
        src_cols (list): a list of strings representing the names of the source columns

    Returns:
        list: names of the columns written to the results table

    Raises:
        Exception: if the model file cannot be read.
    """
    import logging
    import re

    from snowflake.snowpark.functions import lit

    logger = logging.getLogger("test_model")

    model_RFR = read_file('houseprice_estimator.joblib')

    if model_RFR is None:
        raise Exception('Unable to read model file')

    def replace_characters(x: str, regex_string: str = '[^a-zA-Z,_]') -> str:
        """Removes regex patterns from string and upper-cases the result.

        Args:
            x (str) : Target string to make replacement.
            regex_string (str) : Regex string to remove from x.

        Returns
            str (None when x is not a string)

        """
        if isinstance(x, str):
            return re.sub(regex_string, '', x).upper()
        return None

    # load the raw data from the test table and extract only the needed columns
    test_dataset = session.table(test_table)[src_cols]

    # normalize the column names; this step is needed for OneHotEncoder to work
    for column in test_dataset.columns:
        test_dataset = test_dataset.with_column_renamed(column, replace_characters(column))

    # detect the categorical (string) columns AFTER the rename so that the names
    # handed to the encoder are the ones that exist in the data set
    categorical_columns = [column for column, dtype in test_dataset.dtypes if dtype.startswith("string")]

    # the encoder is fitted on the test data itself, so its categories can differ
    # from the training data; the alignment below compensates for that
    OH_encoder = snowml.OneHotEncoder(input_cols=categorical_columns, output_cols=categorical_columns, drop_input_cols=True)
    test_df = OH_encoder.fit(test_dataset).transform(test_dataset)

    # the encoding adds unwanted quotes and some special characters into the new
    # column names, so clean them up
    for column in test_df.columns:
        tempColName = re.sub(r'"', '',  column) # first remove quotes
        tempColName = re.sub(r'[^_a-zA-Z0-9]', '_', tempColName) # now remove all other characters and replace with underscore
        test_df = test_df.with_column_renamed(column, tempColName)

    # drop columns not available in the model
    unavailable_features_in_model = set(test_df.columns).difference(model_features)
    logger.info(unavailable_features_in_model)

    for feature in unavailable_features_in_model:
        test_df = test_df.drop(feature)

    # add columns available in the model but not in the testdata (this method is not ideal but okay for quick poc)
    unavailable_feature_in_testdf = set(model_features).difference(test_df.columns)

    for feature in unavailable_feature_in_testdf:
        # DataFrames are immutable: keep the returned frame, and wrap the constant
        # in lit() because with_column expects a Column expression
        test_df = test_df.with_column(feature, lit(0))

    # put the columns into the same order as the model's feature list
    test_df = test_df[model_features]

    # test_df is a Snowpark DataFrame, so it is written with its own writer
    # (write_pandas only accepts pandas DataFrames); a failure raises an exception
    test_df.write.mode("overwrite").save_as_table("RAW.RETAIL.HOUSE_PRICES_TEST_DATA_RESULT")

    return test_df.columns


In [32]:
# fully qualified name of the table that holds the test data set
test_table =  'RAW.RETAIL.HOUSE_PRICES_TEST_DATA'

In [33]:
# row count of the test table, as a quick check that it can be read
session.table(test_table).count()

1459

In [None]:
# run the temporary stored procedure against the test table
test_model(test_table, src_cols)

In [9]:
# close the Snowflake session; the temporary stored procedures are only
# available within this session
session.close()