### Importing Modules

In [92]:
# ML Modules
import pandas as pd

import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import TargetEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [93]:
df = pd.read_csv("./DataSet/cars24_car-price-clean-data.csv", index_col="Unnamed: 0")

In [94]:
df.columns

Index(['selling_price', 'make', 'model', 'year', 'seller_type', 'km_driven',
       'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power',
       'seats'],
      dtype='object')

In [95]:
categCols = ["make", "model", "seller_type", "fuel_type", "transmission_type", "seats"]
contCols = ["year", "km_driven", "mileage", "engine", "max_power"]

In [96]:
## splitting the Data to Train-Validation-Test

randomState = 50
## splitting the data to features and target Variables
X = df.drop(columns="selling_price")
y = df["selling_price"]

X_tr_vl, X_test, y_tr_vl, y_test = train_test_split(X, y, test_size=0.2, random_state=randomState)
X_train, X_val, y_train, y_val = train_test_split(X_tr_vl, y_tr_vl, test_size=0.25, random_state=randomState)

print("Training Data size: X: %s, y: %s" % (X_train.shape, y_train.shape))
print("Validation Data size: X: %s, y: %s" % (X_val.shape, y_val.shape))
print("Test Data size: X: %s, y: %s" % (X_test.shape, y_test.shape))

Training Data size: X: (11772, 11), y: (11772,)
Validation Data size: X: (3924, 11), y: (3924,)
Test Data size: X: (3925, 11), y: (3925,)


In [97]:
# Scaling Model Names
X_MinMaxModelName = "minmaxModel"
X_StdModelName = "stdModel"
X_TgtModelName = "tgtModel"

# Scaling Continuous Data
# TODO: Need to figure out which ones to pass to Standard Scaler and vice-versa 
minmax_features = contCols
standard_features = []
Target_features = categCols

# Create and fit model for continuous features
preprocessor = ColumnTransformer(
    transformers=[
        (X_MinMaxModelName, MinMaxScaler(), minmax_features),
        (X_StdModelName, StandardScaler(), standard_features),
        (X_TgtModelName, TargetEncoder(), Target_features)],
        remainder="passthrough")
preprocessor.fit(X=X_train, y=y_train)

# Create and fit model for Target Variable
yStdScaler = StandardScaler()
yStdScaler.fit(y_train.values.reshape(-1, 1))

In [98]:
# Saving the models for future use
# Model pickle file Names
X_MinMax_scale_pkl = "./Artifacts/X_MinMax_scale.pkl" 
X_Std_scale_pkl = "./Artifacts/X_Std_scale.pkl" 
X_Tgt_encode_pkl = "./Artifacts/X_Tgt_encode.pkl" 
y_Std_scale_pkl = "./Artifacts/y_Std_scale.pkl" 

with open(X_MinMax_scale_pkl, "wb") as pfile:
    pickle.dump(preprocessor.named_transformers_[X_MinMaxModelName], pfile)

with open(X_Std_scale_pkl, "wb") as pfile:
    pickle.dump(preprocessor.named_transformers_[X_StdModelName], pfile)

with open(X_Tgt_encode_pkl, "wb") as pfile:
    pickle.dump(preprocessor.named_transformers_[X_TgtModelName], pfile)

with open(y_Std_scale_pkl, "wb") as pfile:
    pickle.dump(yStdScaler, pfile)

In [99]:
## now transforming the continuous Features
X_train_scaled = pd.DataFrame(preprocessor.transform(X_train), columns = preprocessor.get_feature_names_out())
y_train_scaled = yStdScaler.transform(y_train.values.reshape(-1, 1))

X_val_scaled = pd.DataFrame(preprocessor.transform(X_val), columns = preprocessor.get_feature_names_out())
y_val_scaled = yStdScaler.transform(y_val.values.reshape(-1, 1))

### Create and tune the LR model

In [100]:
## Basic LR model.
lrModel = LinearRegression()
lrModel.fit(X_train_scaled, y_train_scaled)

In [101]:
yPred = lrModel.predict(X_val_scaled)

In [102]:
print("Training Score: ", lrModel.score(X_train_scaled, y_train_scaled))
print("Validation Score: ", lrModel.score(X_val_scaled, y_val_scaled))

Training Score:  0.9255024889717117
Validation Score:  0.7671921700666986


In [103]:
X_train_scaled.head(1)

vifData = pd.DataFrame({"columns": X_train_scaled.columns})
vifData["vif"] = [variance_inflation_factor(exog=X_train_scaled.values, exog_idx=idx) for idx in range(len(X_train_scaled.columns))]
print(vifData)

                        columns        vif
0             minmaxModel__year   7.686999
1        minmaxModel__km_driven   3.983590
2          minmaxModel__mileage  14.405229
3           minmaxModel__engine  50.146269
4        minmaxModel__max_power  25.426015
5                tgtModel__make  10.126732
6               tgtModel__model  10.428579
7         tgtModel__seller_type  27.984976
8           tgtModel__fuel_type  29.201610
9   tgtModel__transmission_type   8.298020
10              tgtModel__seats  27.018974


IndentationError: unexpected indent (3553813010.py, line 2)

### Save the LR model

In [105]:
X_Tgt_encode_pkl = "./Artifacts/lr_BasicModel.pkl"

with open(X_Tgt_encode_pkl, "wb") as pfile:
    pickle.dump(lrModel, pfile)