In [9]:
# ML Modules
import warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import TargetEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import make_scorer
from functools import partial

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [10]:
# Load the cleaned car-price data; the CSV column "Unnamed: 0" becomes the row index.
df = pd.read_csv(
    "./DataSet/cars24_car-price-clean-data.csv",
    index_col="Unnamed: 0",
)

In [11]:
# Columns handled as categorical features (target-encoded further down).
categCols = [
    "make",
    "model",
    "seller_type",
    "fuel_type",
    "transmission_type",
    "seats",
]
# Columns handled as continuous features (standard-scaled further down).
contCols = [
    "year",
    "km_driven",
    "mileage",
    "engine",
    "max_power",
]

In [12]:
def dropusingVif(X, y, VIFThreshold=10, r2adjustedThreshold=0.85):
    """Iteratively drop the column with the highest variance inflation factor.

    On every pass the function
      1. computes the VIF of each column of the current ``X`` (on ``X`` as
         given, i.e. before any constant column is added),
      2. fits a statsmodels OLS of ``y`` on ``X`` plus a constant,
      3. stops if the highest VIF is below ``VIFThreshold`` OR the adjusted R2
         of that fit is below ``r2adjustedThreshold``; otherwise drops the
         highest-VIF column and repeats.

    The adjusted R2 is evaluated on the columns left after the previous drop,
    so the returned frame may already be below ``r2adjustedThreshold`` (the
    drop that pushed it below is not undone).

    The caller's DataFrame is not modified: all drops happen on a copy.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are candidates for removal.
    y : array-like
        Target, passed unchanged as ``endog`` to ``sm.OLS``.
    VIFThreshold : float, default 10
        Stop once the largest VIF falls below this value.
    r2adjustedThreshold : float, default 0.85
        Stop once the OLS adjusted R2 falls below this value.

    Returns
    -------
    tuple
        ``(X, y, droppedCol, vifData)``: the reduced copy of ``X``, ``y``
        unchanged, the list of dropped column labels in drop order, and the
        VIF table (columns ``"columns"`` and ``"vif"``) of the final pass.
    """
    # Work on a copy so the frame passed in by the caller keeps all its columns.
    X = X.copy()
    iteration = 0  # renamed from `iter` to stop shadowing the builtin
    droppedCol = []
    while True:
        exog = X.values  # materialize once per pass instead of once per column
        vifData = pd.DataFrame({"columns": X.columns})
        vifData["vif"] = [variance_inflation_factor(exog=exog, exog_idx=idx) for idx in range(exog.shape[1])]
        maxIdx = vifData["vif"].idxmax()
        highestVIF = vifData.loc[maxIdx, "vif"]
        columntoDrop = vifData.loc[maxIdx, "columns"]

        # OLS on the current columns; its adjusted R2 is the second stop criterion.
        olsModel = sm.OLS(endog=y, exog=sm.add_constant(X))
        results = olsModel.fit()

        if (highestVIF < VIFThreshold) or results.rsquared_adj < r2adjustedThreshold:
            print("Threshold Reached.!")
            print("highestVIF: ", highestVIF)
            print("adjusted-R2 ", results.rsquared_adj)
            print("dropped ColumnName: ", droppedCol)
            break

        print("iter: %s, VIF_Val: %s, ColumnName: %s" % (iteration, highestVIF, columntoDrop))
        droppedCol.append(columntoDrop)
        X = X.drop(columns=columntoDrop)
        iteration += 1

    return X, y, droppedCol, vifData

In [13]:
## adjusted R2 Formula
def adjusted_r2_score(X, y, ypred):
    """Return the adjusted R2 of predictions ``ypred`` against targets ``y``.

    adjusted R2 = 1 - (1 - R2) * (m - 1) / (m - d - 1)

    where ``m, d = X.shape`` (rows, columns of the feature matrix) and
    R2 = 1 - SS_res / SS_tot.

    ``y`` and ``ypred`` are flattened to 1-D before subtracting. Without this,
    a column vector of shape (n, 1) combined with a flat vector of shape (n,)
    broadcasts to an (n, n) matrix and yields a silently wrong score.

    Parameters
    ----------
    X : object with a 2-element ``.shape`` (DataFrame or 2-D ndarray)
        Only its shape is used.
    y : array-like
        True target values.
    ypred : array-like
        Predicted target values.

    Returns
    -------
    float
        The adjusted R2. No guard is applied for ``m - d - 1 == 0`` or for a
        constant ``y`` (zero total sum of squares).
    """
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(ypred).ravel()

    ss_res = np.sum((y_true - y_hat) ** 2)
    ss_mean = np.sum((y_true - y_true.mean()) ** 2)
    m, d = X.shape
    r2 = 1 - ss_res / ss_mean

    return 1 - (((1 - r2) * (m - 1)) / (m - d - 1))

In [14]:
## Train / validation / test split

# Single seed shared by both split calls
randomState = 50

# Separate the target column from the feature columns
y = df["selling_price"]
X = df.drop(columns=["selling_price"])

# First hold out the test set, then carve the validation set out of the remainder
X_tr_vl, X_test, y_tr_vl, y_test = train_test_split(
    X, y, test_size=0.2, random_state=randomState
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tr_vl, y_tr_vl, test_size=0.25, random_state=randomState
)

print(
    "Training Data size: X: %s, y: %s" % (X_train.shape, y_train.shape),
    "Validation Data size: X: %s, y: %s" % (X_val.shape, y_val.shape),
    "Test Data size: X: %s, y: %s" % (X_test.shape, y_test.shape),
    sep="\n",
)

Training Data size: X: (11772, 11), y: (11772,)
Validation Data size: X: (3924, 11), y: (3924,)
Test Data size: X: (3925, 11), y: (3925,)


In [15]:
# Names under which each sub-transformer is registered in the ColumnTransformer
X_MinMaxModelName, X_StdModelName, X_TgtModelName = "minmaxModel", "stdModel", "tgtModel"

# Which columns go to which transformer
# TODO: Need to figure out which ones to pass to Standard Scaler and vice-versa
minmax_features = []            # nothing is min-max scaled at the moment
standard_features = contCols    # continuous columns -> StandardScaler
Target_features = categCols     # categorical columns -> TargetEncoder

# Feature preprocessor, fitted on the training split only.
# NOTE(review): scikit-learn documents that TargetEncoder's fit(X, y).transform(X)
# differs from fit_transform(X, y) (the latter cross-fits). Later code calls
# preprocessor.transform on X_train, so the encoded training features may leak
# the target - confirm whether that is intended.
preprocessor = ColumnTransformer(
    transformers=[
        (X_MinMaxModelName, MinMaxScaler(), minmax_features),
        (X_StdModelName, StandardScaler(), standard_features),
        (X_TgtModelName, TargetEncoder(), Target_features),
    ],
    remainder="passthrough",
).fit(X=X_train, y=y_train)

# Separate scaler for the target, also fitted on the training split only
yStdScaler = StandardScaler()
yStdScaler.fit(y_train.values.reshape(-1, 1))

In [16]:
# ---- Work that does not depend on the polynomial degree (hoisted out of the loop) ----
# X / y are still rebound to copies of the training split, exactly as before,
# in case other cells read them.
X = X_train.copy()
y = y_train.copy()

# Preprocess each split once instead of once (or twice) per degree.
X_train_pre = preprocessor.transform(X)
X_val_pre = preprocessor.transform(X_val)
X_test_pre = preprocessor.transform(X_test)

# Scaled targets are identical for every degree.
y_poly_train_scaled = yStdScaler.transform(y_train.values.reshape(-1, 1))
y_poly_val_scaled = yStdScaler.transform(y_val.values.reshape(-1, 1))
y_poly_test_scaled = yStdScaler.transform(y_test.values.reshape(-1, 1))

for polyVal in range(1, 3):
    # create polynomial Features
    print(polyVal)
    polyFeatures = PolynomialFeatures(degree=polyVal)

    # Fit the expansion on the training split, then apply it to validation and test.
    X_poly_train_scaled = pd.DataFrame(polyFeatures.fit_transform(X_train_pre))
    X_poly_val_scaled = pd.DataFrame(polyFeatures.transform(X_val_pre))
    X_poly_test_scaled = pd.DataFrame(polyFeatures.transform(X_test_pre))

    # Remove collinear columns from the training features.
    X_train_vif, y_train_vif, droppedCol, vifData = dropusingVif(X=X_poly_train_scaled, y=y_poly_train_scaled)

    print("# columns Dropped: %s, \nvifData: %s" % (droppedCol, vifData))

    ## Refit the model with new dropped columns
    ## Basic LR model.
    lrModel = LinearRegression()
    lrModel.fit(X_train_vif, y_train_vif)

    # Apply the same column drops to the validation features once and reuse the result.
    X_val_vif = X_poly_val_scaled.drop(columns=droppedCol)
    yPred = lrModel.predict(X_val_vif)

    print("R2:")
    print("\t - Training Score: ", lrModel.score(X_train_vif, y_train_vif))
    print("\t - Validation Score: ", lrModel.score(X_val_vif, y_poly_val_scaled))
    print("Adjusted R2:")
    print("\t - Training Score: ", adjusted_r2_score(X_train_vif, y_train_vif, lrModel.predict(X_train_vif)))
    print("\t - Validation Score: ", adjusted_r2_score(X_val_vif, y_poly_val_scaled, yPred))

1
iter: 0, VIF_Val: 106.50507586486037, ColumnName: 0
iter: 1, VIF_Val: 24.802568819879788, ColumnName: 8
iter: 2, VIF_Val: 20.637625734712028, ColumnName: 9
iter: 3, VIF_Val: 10.428848703520604, ColumnName: 7
Threshold Reached.!
highestVIF:  7.212084471846783
adjusted-R2  0.7593497138342267
dropped ColumnName:  [np.int64(0), np.int64(8), np.int64(9), np.int64(7)]
# columns Dropped: [np.int64(0), np.int64(8), np.int64(9), np.int64(7)], 
vifData:    columns       vif
0        1  1.364060
1        2  1.296695
2        3  1.767831
3        4  3.539683
4        5  3.713853
5        6  6.780928
6       10  7.212084
7       11  6.559973
R2:
	 - Training Score:  0.7595132685270587
	 - Validation Score:  0.7506185549032991
Adjusted R2:
	 - Training Score:  0.7593497138342267
	 - Validation Score:  0.7501089631891806
2
iter: 0, VIF_Val: inf, ColumnName: 10
iter: 1, VIF_Val: 177562.67297105078, ColumnName: 0
iter: 2, VIF_Val: 26815.998233242328, ColumnName: 8
iter: 3, VIF_Val: 5263.616628549598,

In [17]:
# Hold-out evaluation on the TEST split.
# NOTE: lrModel, droppedCol, X_poly_test_scaled and y_poly_test_scaled are whatever the
# last iteration of the polynomial-degree loop left behind, so this scores that model only.
X_test_vif = X_poly_test_scaled.drop(columns=droppedCol)
print("R2 Test Score: ", lrModel.score(X_test_vif, y_poly_test_scaled))
print("Adjusted R2 Test Score: ", adjusted_r2_score(X_test_vif, y_poly_test_scaled, lrModel.predict(X_test_vif)))

R2 Validation Score:  0.8685661785545892
Adjusted Validation Score:  0.8676215823019015
