# Model Training Script
This script extracts data from the Snowflake table, transforms the features, and then trains the model on the engineered features. Our training script leverages MLflow for model logging.

In [1]:
import argparse
import json
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import snowflake.connector
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# import functions from helper_functions.py
from helper_functions import get_data, write_to_snowflake
# Enable pandas to display up to 500 columns
pd.set_option('display.max_columns', 500)


#### 1.1 Load Snowflake credentials for temp user

In [2]:
# Load environment variables from a .env file
load_dotenv()

# Read the temp-user credentials from the environment.
TEMP_USER = os.getenv('SNOWSQL_TEMP_USER')
TEMP_USER_PASSWORD = os.getenv('SNOWSQL_TEMP_PWD')

# Fail fast: stop the notebook here when a credential is missing instead of
# printing a message and letting later cells run with a None user/password.
if not TEMP_USER:
    raise ValueError("Environment variable SNOWSQL_TEMP_USER must be set")
if not TEMP_USER_PASSWORD:
    raise ValueError("Environment variable SNOWSQL_TEMP_PWD must be set")

#### 1.2 Connect to Snowflake

In [3]:
# Establish connection to Snowflake
# Timestamp used in the status messages below (HH:MM:SS, colon-separated).
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

try:
    # NOTE(review): account, warehouse and database are hard-coded here;
    # consider moving them to the .env file alongside the credentials.
    conn = snowflake.connector.connect(
        user=TEMP_USER,
        password=TEMP_USER_PASSWORD,
        account='ygeuort-alb19263',
        warehouse='COMPUTE_WH',
        database='AIRBNB',
    )
except Exception as e:
    print(f'Failed to connect to Snowflake on {current_time} due to error code {e}')
    # Re-raise so the run stops here; otherwise `conn` is undefined and the
    # next cell fails with an unrelated NameError.
    raise
else:
    print(f'Connected to Snowflake successfully at {current_time}')

Connected to Snowflake successfully at 2024-07-30 20:0439


#### Extract data for all markets

In [4]:
# Query text: selects every column of every row from ods.listings.
sql_query = '''
select * from ods.listings
 '''

# get_data is imported from helper_functions; presumably it returns a pandas
# DataFrame (later cells index it by column name) — TODO confirm.
df_raw = get_data(sql_query, conn=conn)
# Last expression of the cell: displays the (rows, columns) of the raw extract.
df_raw.shape

(111052, 68)

In [19]:
# Select features and target for the model
features = ['market', 'room_type', 'accommodates', 'bathrooms', 'beds', 'latitude', 'longitude', 'amenities']
categorical_features = ['market', 'room_type']  # Features with categorical data
numerical_features = ['accommodates', 'bathrooms', 'beds', 'latitude', 'longitude']  # Features with numerical data
target = 'price'  # Target variable to predict
# NOTE(review): 'amenities' is kept in `features` (and written to the feature
# store) but is in neither the categorical nor the numerical list, so the
# model's ColumnTransformer does not use it.

# Nightly price cap in dollars (97th percentile); listings above it are treated as outliers
MAX_PRICE = 600

# Build the modelling frame in one pass from the raw extract:
#   1. keep only the selected features and the target column,
#   2. drop rows with a missing target,
#   3. drop rows whose price exceeds the cap,
#   4. copy so `df` is an independent frame rather than a view of df_raw.
df = (
    df_raw[features + [target]]
    .dropna(subset=[target])
    .loc[lambda frame: frame[target] <= MAX_PRICE]
    .copy()
)

# write cleaned data to snowflake feature store
write_to_snowflake(df, conn=conn, snowflake_schema_name='FEATURE_STORE', snowflake_table_name='listings_cleaned')

  success, num_chunks, num_rows, output = write_pandas(


Created table and data loaded to Snowflake at 2024-07-30 20:1936


In [20]:
# TODO: add code for pulling the cleaned listings from the Snowflake feature store

# Target vector (y) and feature matrix (X) taken from the cleaned frame
y = df[target]
X = df.drop(columns=[target])

# Hold out 10% of the rows for the final evaluation. The raw extract has
# 111,052 observations, so a small test fraction still yields a large test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

In [22]:
# Get model run start time
# Taken before the pipeline is built and cross-validated; the training cell
# subtracts it from datetime.now() to report "train_duration".
start_time = datetime.now()

# Define the pipeline
def createPipeline(numerical_features, categorical_features):
    """Build the preprocessing + random-forest regression pipeline.

    Args:
        numerical_features: column names to median-impute and standard-scale.
        categorical_features: column names to fill with the constant
            'missing' and one-hot encode.

    Returns:
        An unfitted sklearn ``Pipeline`` with two steps: 'preprocessor'
        (a ``ColumnTransformer``) and 'regressor' (a ``RandomForestRegressor``).
        Columns not listed in either argument are not passed to the regressor,
        because the ColumnTransformer's ``remainder`` is left at its default.
    """
    # Define the preprocessing for numerical features
    numericalTransformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Define the preprocessing for categorical features
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros
    categoricalTransformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine preprocessing for numerical and categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numericalTransformer, numerical_features),
            ('cat', categoricalTransformer, categorical_features)
        ])

    # Create the pipeline
    # max_features=1.0 (use every feature at each split) is what 'auto' meant
    # for regressors; 'auto' itself was deprecated in scikit-learn 1.1 and
    # removed in 1.3, so the explicit value keeps behaviour and runs on both.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(max_depth=30,
                                            max_features=1.0,
                                            min_samples_leaf=2,
                                            min_samples_split=2,
                                            n_estimators=300,
                                            random_state=42)) #best model after hyperparameter tuning
    ])
    return pipeline

def log_to_file(log_file, params, metrics, training_details):
    """Append one training-run record to a JSON log file.

    The file holds a JSON array with one object per run, each with the keys
    "params", "metrics" and "training_details".

    Args:
        log_file: path of the JSON log; created (with its parent directory)
            when it does not exist.
        params: JSON-serializable description of the model configuration.
        metrics: JSON-serializable evaluation metrics.
        training_details: JSON-serializable run metadata.

    Raises:
        TypeError: if any value is not JSON-serializable. The existing log
            file is left untouched in that case.
    """
    # Load existing data; a missing or unparsable file starts a fresh history
    try:
        with open(log_file, 'r') as f:
            logs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        logs = []

    # Valid JSON that is not an array has no .append(); keep it as the first
    # entry rather than crashing or discarding it.
    if not isinstance(logs, list):
        logs = [logs]

    # Append new log entry
    logs.append({
        "params": params,
        "metrics": metrics,
        "training_details": training_details
    })

    # Serialize BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a serialization error afterwards would wipe the whole history.
    payload = json.dumps(logs, indent=4)

    # Make sure the target directory exists (dirname is '' for a bare filename)
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Write updated logs back to the file
    with open(log_file, 'w') as f:
        f.write(payload)

# Parameters and file paths
log_file = 'model/experiment_log.json'
model_file = 'model/regression_pipeline.joblib'

# Create the pipeline
pipeline = createPipeline(numerical_features, categorical_features)

# Cross-validate ONCE and score every fold with both metrics. Two separate
# cross_val_score calls would fit the 300-tree forest 20 times; the folds and
# the seeded model are deterministic, so one pass yields the same scores.
# The built-in 'neg_root_mean_squared_error' scorer replaces
# make_scorer(mean_squared_error, squared=False): the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    cv=10,
    scoring={
        'mae': 'neg_mean_absolute_error',
        'rmse': 'neg_root_mean_squared_error',
    },
    n_jobs=-1,
)

# sklearn reports error metrics negated (higher is better); flip the sign back
maeScores = -cv_results['test_mae']
print("Cross-validation MAE scores:", maeScores)
print("Average cross-validation MAE:", maeScores.mean())

rmseScores = -cv_results['test_rmse']
print("Cross-validation RMSE scores:", rmseScores)
print("Average cross-validation RMSE:", rmseScores.mean())

# Train the model on the entire training set
pipeline.fit(X_train, y_train)

# Predict and evaluate the model on the test set using MAE
y_pred = pipeline.predict(X_test)
mae_test = mean_absolute_error(y_test, y_pred)
print("Test set MAE:", mae_test)

# Calculate RMSE on the test set (square root of the MSE)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test set RMSE:", rmse_test)

# Parameters and metrics to log
params = {
    "model_type": str(pipeline.get_params()['regressor']),
    "numerical_features": numerical_features,
    "categorical_features": categorical_features
}
metrics = {
    "cv_mae_mean": float(maeScores.mean()),
    "cv_rmse_mean": float(rmseScores.mean()),
    "cv_mae_std": float(maeScores.std()),
    "cv_rmse_std": float(rmseScores.std()),
    "test_mae": float(mae_test),
    "test_rmse": float(rmse_test)
}
# The duration is taken here, after cross-validation, the training fit and the
# test evaluation, so it covers the work it is named for.
training_details = {
    "train_size": len(X_train),
    "test_size": len(X_test),
    "train_duration": str(datetime.now() - start_time)
}

# Log training run results to file
# NOTE(review): experiment logging is currently disabled; uncomment the call
# below to append this run to log_file.
#log_to_file(log_file, params, metrics, training_details)

# Refit the final model on the entire dataset to improve generalization by leveraging all available information
# (the metrics above describe the model fitted on X_train only)
pipeline.fit(X, y)

# Save the model; create the output directory first so a fresh checkout works
model_dir = os.path.dirname(model_file)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(pipeline, model_file, compress=('gzip', 9))


Cross-validation MAE scores: [45.66534262 46.30377953 46.52061509 46.76627372 47.00417896 46.03335114
 45.68168021 45.95040059 46.96100644 45.72665094]
Average cross-validation MAE: 46.26132792400367
Cross-validation RMSE scores: [69.5905525  70.35051165 70.57718629 70.92256786 72.49644571 68.91319669
 69.19007214 69.40628224 70.79759222 68.86378946]
Average cross-validation RMSE: 70.11081967490922
Test set MAE: 46.140358929648826
Test set RMSE: 69.5552348912378


['model/regression_pipeline.joblib']