In [144]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
import statsmodels.api as sm
import scipy.stats as stats
from scipy.special import inv_boxcox
import joblib
import json
import warnings
warnings.filterwarnings("ignore") 

In [None]:
# Configuration
loc = "Your folder path"

# eg . /Users/yourname/Documents/VSCode/End-End Linear Regression- Car Price Prediction/

## Load New Dataset


In [147]:
data_new = pd.read_csv(loc + r"raw/new car data.csv")


## Load saved features, model, Box-Cox Lambda Values and Feature Scaler

In [133]:
# Load model
model = joblib.load("final_model_lr.joblib")

# Load Box-Cox lambdas
fitted_lambda = joblib.load("boxcox_lambda_values.joblib")

# Load feature scaler
scaler = joblib.load("scaler.joblib")

# Load feature list
with open("feature_list.json", "r") as f:
    feature_list = json.load(f)

## Data Prepration

In [134]:
# Convert categorical variables to numerical using OneHotEncoding
data = data_new.copy()
data = pd.get_dummies(data, columns=['Fuel_Type', 'Seller_Type', 'Transmission'], drop_first=True)

In [135]:
# Create a new feature: Car Age using Year
curr_year = datetime.now().year
data['Car_Age'] = curr_year - data['Year']
data.drop(columns=['Car_Name', 'Year'], inplace=True)  # Drop redundant columns

In [136]:
# Ensure all expected columns exist
for col in feature_list:
    if col not in data.columns:
        data[col] = 0  # Assign 0 for missing one-hot encoded categories

In [137]:
# dropping Fuel_Type_Petrol as it was highly correlated with Fuel_Type_Diesel
data.drop(columns=['Fuel_Type_Petrol'], inplace=True)

In [138]:
# Apply Box-Cox transformation
data_transformed = data.copy()
numerical_features = ['Present_Price', 'Kms_Driven', 'Owner', 'Car_Age']
for col in numerical_features:
    data_transformed[col] = stats.boxcox(data_transformed[col] + 1, fitted_lambda[col])

In [139]:
# Making sure order of features is same as train data
data_transformed = data_transformed[feature_list]

In [140]:
# Apply feature scaling
data_scaled = scaler.transform(data_transformed)

## Make Predictions on new dataset

In [141]:
# Predict
predicted_price_transformed = model.predict(data_scaled)

In [None]:
# Reshape before inverse transform
predicted_price_transformed = predicted_price_transformed.reshape(-1, 1)

# Undo box-cox transformation on target variable
predicted_prices_original = inv_boxcox(predicted_price_transformed, fitted_lambda['Selling_Price']) - 0.1

# add prediction to main df
data_new["Predicted_Selling_Price"] = predicted_prices_original


In [143]:
data_new

Unnamed: 0,Car_Name,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Predicted_Selling_Price
0,Honda Activa 125,2016,0.57,24000,Petrol,Individual,Automatic,0,0.138338
1,city,2016,13.6,10980,Petrol,Dealer,Manual,0,9.303698
2,verna,2012,9.4,60000,Diesel,Dealer,Manual,0,3.975964
3,corolla altis,2013,18.61,40001,Petrol,Dealer,Manual,0,8.429142
4,ciaz,2015,8.92,42367,Diesel,Dealer,Manual,0,5.082745
5,Royal Enfield Classic 350,2015,1.47,26000,Petrol,Individual,Manual,0,0.268662
6,Yamaha Fazer,2014,0.88,8000,Petrol,Individual,Manual,0,0.154021
7,Hero Splender iSmart,2016,0.54,14000,Petrol,Individual,Manual,0,0.150484
8,ertiga,2014,9.95,45000,Diesel,Dealer,Manual,0,5.065907
9,vitara brezza,2018,9.83,2071,Diesel,Dealer,Manual,0,10.248118
