In [4]:
import pandas as pd
import numpy as np
import os
import datetime
import warnings
import logging
import boto3
import optuna
import joblib
from sklearn.model_selection import train_test_split
from model_training import *
from preprocessing import read_csv_file, save_file_to_s3
warnings.filterwarnings("ignore")

In [5]:
# Load the cleaned training data from the processed CSV.
# read_csv_file is imported from the local `preprocessing` module at the top of the notebook.
df = read_csv_file("Data/processed/data_clean.csv")

# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,make,model,odometer,price,bodytype,trim,year,drivetrain,fetchdate,transmission_manual,province,days_since_reference,car_age
0,audi,s5,145396,30999,coupe,s5-progressiv,2016,AWD,2022-05-17,0,quebec,136,6.378082
1,audi,q3,20443,33499,suv,q3-2.0 tfsi komfort,2018,AWD,2022-05-17,0,quebec,136,4.375342
2,audi,a4,108716,14495,sedan,a4-unknown,2013,AWD,2022-05-17,0,quebec,136,9.378082
3,audi,s6,213077,20890,sedan,s6-unknown,2013,AWD,2022-05-17,0,quebec,136,9.378082
4,audi,a4,148477,13989,sedan,a4-premium plus s-line,2013,AWD,2022-05-17,1,quebec,136,9.378082
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194763,volvo,xc90,52040,52880,suv,xc90-t6 inscription,2019,AWD,2023-07-20,0,ontario,565,4.550685
194764,volvo,s60,73000,20900,sedan,s60-unknown,2015,AWD,2023-07-20,0,quebec,565,8.553425
194765,volvo,xc60,29915,49990,suv,xc60-momentum t6,2021,AWD,2023-07-20,0,ontario,565,2.547945
194766,volvo,xc90,71165,46988,suv,xc90-t6 inscription,2019,AWD,2023-07-20,0,ontario,565,4.550685


## Split Data

In [4]:
# Size passed to split_data_by_date as test_size. The "Evaluate on test set" cell
# only runs when this is > 0, so with 0 that evaluation is skipped.
test_set_size = 0

# Split data into train, validation and test sets, using 'fetchdate' as the
# date column and 'price' as the target.
# NOTE(review): split_data_by_date is not defined in this notebook — presumably it
# comes from `from model_training import *`; confirm how it uses random_state.
X_train, X_val, X_test, y_train, y_val, y_test = split_data_by_date(df, 'fetchdate', 'price',
                                                                    val_size=0.2, test_size=test_set_size,
                                                                    random_state=42)

Validation set cutoff date: 2023-04-30 - Rows: 38954


In [5]:
# NOTE(review): these imports live mid-notebook; `lgb` and `ce` are also used by
# later cells (objective, fit_encode_data), so this cell must run before them.
import lightgbm as lgb
import category_encoders as ce
# LabelEncoder is imported here but not used anywhere in this cell.
from sklearn.preprocessing import LabelEncoder

# Encode the categorical columns
# NOTE: X_train and X_val are overwritten with their encoded versions below, so
# re-running this cell without re-running the split cell would feed already
# encoded frames back into the encoders.

# Target encode: the encoder is fitted on the training split (with y_train) and
# only applied to the validation split.
targetenc_cols = ['model', 'trim']
target_encoder = ce.TargetEncoder(cols=targetenc_cols)
X_train = target_encoder.fit_transform(X_train, y_train)
X_val = target_encoder.transform(X_val)

# Label encode (done with category_encoders' OrdinalEncoder), again fitted on the
# training split and applied to the validation split.
labelenc_cols = ['make', 'bodytype', 'drivetrain', 'province']
ordinal_encoder = ce.OrdinalEncoder(cols=labelenc_cols)
X_train[labelenc_cols] = ordinal_encoder.fit_transform(X_train[labelenc_cols])
X_val[labelenc_cols] = ordinal_encoder.transform(X_val[labelenc_cols])

# Create the lightgbm dataset
# train_dataset / val_dataset are reused as globals by the hyperparameter-tuning
# objective further down.
train_dataset = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
val_dataset = lgb.Dataset(X_val, label=y_val, free_raw_data=False, reference=train_dataset)

# Define the LightGBM parameters (baseline: only objective, metric and seed are set)
params = {
    'objective': 'regression',
    'metric': 'mape',
    'seed': 42
}

# Number of boosting rounds for the baseline model
num_boost_round = 200

# Train the LightGBM model, treating the ordinal-encoded columns as categorical
# and tracking the metric on both the training and validation datasets.
model = lgb.train(params, train_dataset, num_boost_round=num_boost_round, valid_sets=[train_dataset, val_dataset], categorical_feature=labelenc_cols)

print("Model training complete.")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 996
[LightGBM] [Info] Number of data points in the train set: 155814, number of used features: 11
[LightGBM] [Info] Start training from score 35872.872412
Model training complete.


## Evaluate on test set

In [6]:
if test_set_size > 0:
    # Apply the encoders fitted on the training split to the held-out test features
    X_test = target_encoder.transform(X_test)
    X_test[labelenc_cols] = ordinal_encoder.transform(X_test[labelenc_cols])

    # Score the model on the test set and report the MAPE rounded to 4 decimals
    test_mape = round(evaluate_model(model, X_test, y_test), 4)
    print("Test MAPE Value: ", test_mape)

## Hyperparameter Tuning

In [None]:
def objective(trial):
    """
    Optuna objective: train LightGBM with sampled hyperparameters and score it
    on the validation split.

    Relies on notebook globals defined in earlier cells: `lgb`, `train_dataset`,
    `val_dataset`, `X_val`, `y_val` and `mean_absolute_percentage_error`.

    Parameters:
    - trial (optuna.trial.Trial): Trial used to sample the hyperparameters.

    Returns:
    - The value of mean_absolute_percentage_error(y_val, y_pred) for this
      trial (the study minimizes it).
    """
    # Number of boosting rounds. It is still sampled under the name
    # 'n_estimators' so study.best_params keeps the same key, but it is passed
    # through num_boost_round explicitly. Previously the global num_boost_round
    # was passed while 'n_estimators' (an alias of num_iterations) sat in
    # params, so the argument was silently overridden.
    n_rounds = trial.suggest_int('n_estimators', 500, 3000, step=100)

    # Hyperparameters to optimize.
    # 'min_child_samples' is no longer sampled: it is a LightGBM alias of
    # 'min_data_in_leaf', so only one of the two could take effect and the
    # other was a wasted search dimension.
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'num_leaves': trial.suggest_int('num_leaves', 2, 80),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 50),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.001, 10.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'verbose': -1
    }

    # Label encoded columns, treated as categorical features by LightGBM
    labelenc_cols = ['make', 'bodytype', 'drivetrain', 'province']

    # Train the LightGBM model with the given hyperparameters
    model = lgb.train(
        params,
        train_dataset,
        num_boost_round=n_rounds,
        valid_sets=[train_dataset, val_dataset],
        categorical_feature=labelenc_cols,
    )

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Calculate the evaluation metric (MAPE)
    return mean_absolute_percentage_error(y_val, y_pred)

# Create the Optuna study. TPE is Optuna's default sampler; it is created
# explicitly here only so it can be seeded, which makes the sequence of
# sampled hyperparameters repeatable across runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run the optimization trials
study.optimize(objective, n_trials=35)

# Get the best hyperparameters and the best objective value.
# best_params is reused later when training on the whole dataset.
best_params = study.best_params
best_value = study.best_value
print("Best Hyperparameters: ", best_params)
print("Best MAPE Value: ", best_value)

## Visualization

In [8]:
# Toggle for the Optuna visualizations; the next cell reads this flag too.
visualize_hyperparams = False

if visualize_hyperparams:
    # Plotting the relationship between parameters
    # (requires `study` from the hyperparameter-tuning cell)
    fig_slice_plot = optuna.visualization.plot_slice(study)
    fig_slice_plot.show()

In [9]:
# Guarded by the visualize_hyperparams flag set in the previous cell.
if visualize_hyperparams:
    # Plotting the importance of parameters
    # (requires `study` from the hyperparameter-tuning cell)
    fig_param_importances = optuna.visualization.plot_param_importances(study)
    fig_param_importances.show()

## Train on the whole dataset

In [23]:
def fit_encode_data(X, y, targetenc_cols, labelenc_cols):
    """
    Fit a target encoder and an ordinal encoder on the given data and encode it.

    Parameters:
    - X (pd.DataFrame): Training features.
    - y (pd.Series or np.array): Training target, used to fit the target encoder.
    - targetenc_cols (list): Columns to be target encoded.
    - labelenc_cols (list): Columns to be ordinal (label) encoded.

    Returns:
    - tuple: (encoded features, fitted ce.TargetEncoder, fitted ce.OrdinalEncoder).

    """
    fitted_target_enc = ce.TargetEncoder(cols=targetenc_cols)
    fitted_ordinal_enc = ce.OrdinalEncoder(cols=labelenc_cols)

    # Target encoding runs first and yields the frame that is returned
    features = fitted_target_enc.fit_transform(X, y)

    # Then the label columns of that frame are replaced by their ordinal codes
    ordinal_codes = fitted_ordinal_enc.fit_transform(features[labelenc_cols])
    features[labelenc_cols] = ordinal_codes

    return features, fitted_target_enc, fitted_ordinal_enc

In [None]:
# Features and target built from the whole frame (no train/validation split)
X = df.drop(['price', 'fetchdate'], axis=1)
y = df['price']

# Refit both encoders on all of the data.
# NOTE: this rebinds target_encoder / ordinal_encoder / train_dataset / model,
# which earlier cells (e.g. the tuning objective) also use as globals.
targetenc_cols = ['model', 'trim']
labelenc_cols = ['make', 'bodytype', 'drivetrain', 'province']
X_encoded, target_encoder, ordinal_encoder = fit_encode_data(X, y, targetenc_cols, labelenc_cols)

# Create the lightgbm dataset using all of the data
train_dataset = lgb.Dataset(X_encoded, label=y, free_raw_data=False)

# study.best_params only contains the sampled hyperparameters, so the fixed
# settings used during tuning are restated here instead of being left to
# LightGBM's defaults. best_params comes last so tuned values win.
final_params = {
    'objective': 'regression',
    'metric': 'mape',
    'verbose': -1,
    **best_params,
}

# Train the LightGBM model on the whole dataset
model = lgb.train(final_params, train_dataset, categorical_feature=labelenc_cols)

print("Model training complete.")

In [27]:
# Make sure the output directory exists; on a fresh checkout the writes below
# would otherwise fail because 'models/lightgbm' is missing.
os.makedirs('models/lightgbm', exist_ok=True)

# Save the LightGBM model to a file
model.save_model('models/lightgbm/lightgbm_model.txt')
print("Model saved.")

# Save the fitted target encoder to a separate file
joblib.dump(target_encoder, 'models/lightgbm/target_encoder.joblib')
print("Target encoder saved.")

# Save the fitted OrdinalEncoder to a file
joblib.dump(ordinal_encoder, 'models/lightgbm/ordinal_encoder.joblib')
print("Ordinal encoder saved.")

Model saved.
Target encoder saved.
Ordinal encoder saved.


In [28]:
# NOTE(review): star import — get_execution_role is not imported anywhere else in
# this notebook, so it presumably comes from (or via) `preprocessing`; confirm.
from preprocessing import *
# Save model to S3
# Get the execution role (only in SageMaker)
# `role` is not used again in this cell.
role = get_execution_role()

# Save to S3: upload the model and both fitted encoders written by the previous cell.
# The last call is the cell's final expression, so its return value is shown as the cell output.
save_file_to_s3('models/lightgbm/lightgbm_model.txt', 'hazar-ml-bucket', 'carvalu/models/lightgbm_model.txt')
save_file_to_s3('models/lightgbm/target_encoder.joblib', 'hazar-ml-bucket', 'carvalu/models/target_encoder.joblib')
save_file_to_s3('models/lightgbm/ordinal_encoder.joblib', 'hazar-ml-bucket', 'carvalu/models/ordinal_encoder.joblib')

File saved to S3 bucket.
File saved to S3 bucket.
File saved to S3 bucket.


True

# Predict the Test Set

## Preprocess Test Data

In [29]:
# NOTE(review): star import — download_file_from_s3, preprocess_dataframe and
# save_dataframe_to_csv are presumably provided by `preprocessing`; confirm.
from preprocessing import *

# Get the execution role (only in SageMaker)
# `role` is not used again in this cell.
role = get_execution_role()

# Declare bucket name, remote file, and destination
my_bucket = 'hazar-ml-bucket'
filename = 'carvalu/test_data/test_data_2023_07_30.csv'
local_file = 'Data/raw/test_data_2023_07_30.csv'

# Download the file from S3
download_file_from_s3(my_bucket, filename, local_file)

# Load the data
# NOTE: from here on `df` holds the TEST data, replacing the training frame loaded
# at the top of the notebook. Re-running the "Train on the whole dataset" cell after
# this one would therefore train on the test set.
df = read_csv_file(local_file, index_col=0)

# Preprocess the DataFrame
df = preprocess_dataframe(df, trim_min_occurrences=2,model_min_occurrences=2, trim_combine_with_modelname=True, remove_outliers=False)

# Save to local as csv
save_dataframe_to_csv(df, 'Data/processed/test_data_clean.csv')

File downloaded successfully.
Initial row count: 10863
Filtering some rows
--Row count:10810
Dropping unnecessary columns
--Row count:10810
Processing make
--Row count:10810
Processing model
--Row count:10647
Processing trim
Trims: Starting to process individual makes
--Processing bmw
--Processing toyota
--Processing audi
--Processing honda
--Processing ford
--Processing porsche
--Processing chevrolet
--Processing chrysler
--Processing dodge
--Processing hyundai
--Processing land rover
--Processing tesla
--Processing nissan
--Processing kia
--Processing ram
--Processing subaru
--Processing mini
--Processing cadillac
--Processing infiniti
--Processing mercedes-benz
--Processing gmc
--Processing volkswagen
--Processing mazda
--Processing mitsubishi
--Processing jeep
--Processing lexus
--Processing lincoln
--Processing volvo
Trims of all makes processed
Number of unique trims: 2090
--Row count:10647
Processing transmission
--Row count:10647
Processing drivetrain
--Row count:10647
Processi

## Predict

In [31]:
import lightgbm as lgb
import category_encoders as ce
import pandas as pd
import joblib
from training_helpers import mean_absolute_percentage_error

# Restore the persisted model and the two fitted encoders from disk
lgb_model = lgb.Booster(model_file='models/lightgbm/lightgbm_model.txt')
target_encoder = joblib.load('models/lightgbm/target_encoder.joblib')
ordinal_encoder = joblib.load('models/lightgbm/ordinal_encoder.joblib')

# Separate the target from the features
y_test = df['price']
X_test = df.drop(columns=["fetchdate", "price"])

# Encode the features: target encoding first, then the ordinal encoder on the label columns
labelenc_cols = ['make', 'bodytype', 'drivetrain', 'province']
X_test_encoded = target_encoder.transform(X_test)
X_test_encoded[labelenc_cols] = ordinal_encoder.transform(X_test_encoded[labelenc_cols])

# Predict the test data and report the MAPE as a percentage
y_pred = lgb_model.predict(X_test_encoded)
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Test set MAPE: {:.2f}%".format(mape * 100))

Test set MAPE: 5.50%


In [32]:
# Attach the original (un-encoded) make, the true price and the prediction to
# the encoded test frame so errors can be grouped by make.
X_test_encoded['real_make'] = df.make
X_test_encoded['price'] = y_test
X_test_encoded['prediction'] = y_pred

# Absolute percentage error per row, computed column-wise instead of with a
# row-by-row apply.
X_test_encoded['pct_error'] = (
    (X_test_encoded['price'] - X_test_encoded['prediction']).abs()
    / X_test_encoded['price'] * 100
)

# Mean percentage error per make. Selecting the column before aggregating
# keeps the result independent of whether every other column is numeric.
error_table = X_test_encoded.groupby(by="real_make")[['pct_error']].mean()
error_table.sort_values('pct_error')

Unnamed: 0_level_0,pct_error
real_make,Unnamed: 1_level_1
tesla,2.743678
cadillac,3.552685
lexus,3.725402
volvo,3.81419
lincoln,4.042441
infiniti,4.196404
toyota,4.339412
honda,4.348687
porsche,4.548816
land rover,4.816886


## Model Performance on Clean Test Data

In [35]:
# Index labels of the rows whose trim contains the text 'unknown'
unknown_trim_mask = df['trim'].str.contains('unknown')
indexes_to_remove = df.loc[unknown_trim_mask].index

In [36]:
# Remove the rows flagged in indexes_to_remove from both the features and the target
X_test_no_unknowns, y_test_no_unknowns = (
    X_test.drop(index=indexes_to_remove),
    y_test.drop(index=indexes_to_remove),
)

In [37]:
# Same encode -> predict -> score steps as the "Predict" cell, applied to the
# test rows that remain after dropping the unknown-trim rows.

# Apply target encoding
X_test_no_unknowns_encoded = target_encoder.transform(X_test_no_unknowns)

# Apply ordinal encoder
labelenc_cols = ['make', 'bodytype', 'drivetrain', 'province']
X_test_no_unknowns_encoded[labelenc_cols] = ordinal_encoder.transform(X_test_no_unknowns_encoded[labelenc_cols])

# Predict test data
# NOTE: this overwrites the y_pred computed on the full test set.
y_pred = lgb_model.predict(X_test_no_unknowns_encoded)

from training_helpers import mean_absolute_percentage_error

# Evaluate the model on the test set
# NOTE: this overwrites the `mape` value from the full test set as well.
mape = mean_absolute_percentage_error(y_test_no_unknowns, y_pred)
print("Test set MAPE: {:.2f}%".format(mape * 100))

Test set MAPE: 4.98%
