In [None]:
import numpy as np
import pandas as pd

import joblib

from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from xgboost import XGBRegressor

In [None]:
# Load the Toyota workbook (path is relative to the current working directory).
toyota_df = pd.read_excel(io="./Toyota.xlsx")

# Data Pre-processing 

Let's take a look at the first few rows of the dataset

In [None]:
# Preview the top of the freshly loaded frame (head() shows 5 rows by default).
toyota_df.head()

Let's take a look at all of the columns

In [None]:
# List every column name in the loaded frame.
toyota_df.columns

To get a better feel for the data, let's take a look at the dimensions and the summary statistics for each column.

In [None]:
# (number of rows, number of columns) of the loaded frame.
toyota_df.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
toyota_df.describe()

Let's take a look at the data type for each column

In [None]:
# Per-column dtype and non-null count; used here to spot missing values and non-numeric columns.
toyota_df.info()

The good news is that there appear to be no NA values, so we save time on cleaning the data.
We can drop the Id and Model columns since they do not have an effect on the price of a Toyota Corolla.
We can also drop Cylinders since its only value across all rows is 4.

In [None]:
# Remove the identifier columns (Id, Model) and the constant Cylinders column.
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
toyota_df = toyota_df.drop(columns=['Id', 'Model', 'Cylinders'], errors='ignore')

If we run toyota_df.head(), we can see that the Id, Model, and Cylinders columns have been removed.

In [None]:
# Re-display the first rows to confirm the Id, Model and Cylinders columns are gone.
toyota_df.head()

From toyota_df.info() we also see that Fuel_Type and Color have the object data type. The models need numeric inputs, so we create int dummy columns for Fuel_Type and Color (dropping the first level of each to avoid redundant columns).

In [None]:
# One-hot encode the two categorical columns as 0/1 integers; the first level
# of each is dropped so the remaining dummies are not perfectly collinear.
toyota_df = pd.get_dummies(
    toyota_df,
    columns=['Fuel_Type', 'Color'],
    drop_first=True,
    dtype=int,
)

In [None]:
# Re-check column dtypes after the dummy encoding.
toyota_df.info()

As you can see from toyota_df.info() additional fuel type columns and color columns have been added. Now, all columns are of int data type.

We can finally start our analysis.

# Regression Analysis

Our target variable is Price and the rest of the columns are our attributes. 

We will be comparing Linear Regression, Gradient Boosting, and XGBoost models to see which performs the best in predicting the price.

In [None]:
# Target as a 1-D Series: scikit-learn regressors expect y of shape (n_samples,),
# and a single-column DataFrame triggers column-vector conversion warnings on fit.
y = toyota_df['Price']
# Every remaining column is used as a feature.
X = toyota_df.drop(columns=['Price'])

In [None]:
# Peek at the first rows of the target (Price).
y.head()

In [None]:
# Peek at the first rows of the feature matrix.
X.head()

In [None]:
# Names of the feature columns the models will be fitted on.
X.columns

We will be using 5-fold cross-validation technique to train and test different subsets of the Toyota Corolla dataframe.

In [None]:
# Shuffled 5-fold splitter with a fixed seed; the same object is reused for
# every model so they are all scored on identical folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Baseline: standardize the features, then fit ordinary least squares.
linreg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

We will evaluate the number of trees (model__n_estimators) at 50 or 100 and the maximum depth of each tree (model__max_depth) at 5 or 10 to see which combination of parameters performs best for Gradient Boosting and XGBoost.

In [None]:
# Gradient Boosting candidate: a single-step pipeline with a seeded regressor.
gbr_pipeline = Pipeline(
    steps=[("model", GradientBoostingRegressor(random_state=42))]
)

# 2 x 2 grid over tree count and maximum tree depth; the "model__" prefix
# routes each parameter to the pipeline's "model" step.
gbr_param_grid = dict(
    model__n_estimators=[50, 100],
    model__max_depth=[5, 10],
)

# Exhaustive search scored by negated RMSE on the shared `cv` folds,
# parallelized across all available cores (n_jobs=-1).
gbr_grid = GridSearchCV(
    gbr_pipeline,
    gbr_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)

In [None]:
# XGBoost candidate: a single-step pipeline with a seeded, silent regressor
# trained on the squared-error objective.
xgb_regressor = XGBRegressor(
    verbosity=0,
    random_state=42,
    objective="reg:squarederror",
)
xgb_pipeline = Pipeline(steps=[("model", xgb_regressor)])

# Same 2 x 2 grid as for Gradient Boosting: tree count x maximum tree depth.
xgb_param_grid = dict(
    model__n_estimators=[50, 100],
    model__max_depth=[5, 10],
)

# Exhaustive search scored by negated RMSE on the shared `cv` folds,
# parallelized across all available cores (n_jobs=-1).
xgb_grid = GridSearchCV(
    xgb_pipeline,
    xgb_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)



Our performance metrics are R-squared and RMSE.

In [None]:
# Display name -> estimator to evaluate. Insertion order is the evaluation order.
models = dict([
    ("Linear Regression", linreg_pipeline),
    ("Gradient Boosting", gbr_grid),
    ("XGBoost", xgb_grid),
])

# Filled by the evaluation loop: display name -> dict of metrics.
results = dict()

# Placeholders for the per-model values the evaluation loop assigns.
best_model = best_params = rmse = r2 = r2_std = None


In [None]:
# Fit and score every candidate on the same folds, recording for each one its
# fitted estimator, chosen hyperparameters, mean CV RMSE, and mean/std CV R².
for name, model in models.items():
    # For a GridSearchCV this runs the whole search; for the plain pipeline it
    # simply fits it on the full data.
    model.fit(X, y)

    if isinstance(model, GridSearchCV):
        best_model = model.best_estimator_
        best_params = model.best_params_
        # The grids are scored with negated RMSE, so flip the sign back.
        rmse = -model.best_score_
    else:
        # Linear Regression baseline: there is no search, so the fitted
        # pipeline itself is the "best" model. Assigning it explicitly keeps a
        # stale estimator from a previous iteration (or a previous run of this
        # cell) from leaking into this entry.
        best_model = model
        best_params = "N/A"
        rmse_scores = cross_val_score(
            model,
            X,
            y,
            cv=cv,
            scoring="neg_root_mean_squared_error"
        )
        rmse = -rmse_scores.mean()

    # R² on the shared folds. cross_val_score clones the estimator, so the
    # fit above does not influence these scores.
    r2_scores = cross_val_score(
        best_model,
        X,
        y,
        cv=cv,
        scoring="r2"
    )
    r2 = r2_scores.mean()
    r2_std = r2_scores.std()

    results[name] = {
        "Best Model": best_model,
        "Best Params": best_params,
        "RMSE": rmse,
        "R2": r2,
        "R2 Std": r2_std,
    }



In [None]:
# Print a four-line summary per model, followed by a blank separator line.
for model_name, metrics in results.items():
    print(
        model_name,
        f"  Lowest RMSE: {metrics['RMSE']:.4f}",
        f"  R²:   {metrics['R2']:.4f} ± {metrics['R2 Std']:.4f}",
        f"  Best Params: {metrics['Best Params']}\n",
        sep="\n",
    )

As you can see, Gradient Boosting performed the best, with the lowest RMSE of around 997 and a slightly better R-squared of 0.9233.

# Dump model to pkl file for streamlit app

In [None]:
# Export the Gradient Boosting regressor selected by the grid search instead of
# re-typing its hyperparameters by hand, so the pickled model always matches
# the one reported in the results. GridSearchCV refits the best candidate on
# all of X, y (refit=True is the default), so the "model" step of
# best_estimator_ is already trained on the full data.
model = gbr_grid.best_estimator_.named_steps["model"]

# NOTE(review): the Streamlit app presumably has to feed columns in the same
# order as X.columns - consider persisting that list next to the model.
joblib.dump(model, "gb_model.pkl")