# Importing Required Libraries

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, KFold,GridSearchCV
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import joblib

# Train-Test Split

In [5]:
car_df = pd.read_csv('car_df_no_outliers_iqr.csv')
x = car_df.drop('price_in_lakh', axis=1)
y = car_df['price_in_lakh']# Target variable

# Performing the train-test split (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
car_df.shape

(5148, 537)

# Model Training and Evaluation

In [7]:
# List of models with their names
models = []
models.append(('Linear Regression', LinearRegression()))
models.append(('Decision Tree Regression', DecisionTreeRegressor()))
models.append(('Random Forest Regression', RandomForestRegressor()))
models.append(('Gradient Boosting Regression', GradientBoostingRegressor()))

results = []
names = []
mean_scores = []

# K-fold cross-validation
kFold = KFold(n_splits=5, shuffle=True, random_state=42)


for name, model in models:
    cv_results = cross_val_score(model, x_train, y_train, cv=kFold, scoring='neg_mean_squared_error')
    # Converting the negative MSE to positive for easier interpretation
    mean_mse = np.mean(np.abs(cv_results))
    print(f'{name}: Mean MSE = {mean_mse}')
    results.append(cv_results)
    names.append(name)
    mean_scores.append(mean_mse)

# Identifing the best model with lowest MSE
best_model_index = np.argmin(mean_scores)
best_model_name = names[best_model_index]
best_model_score = mean_scores[best_model_index]

print(f'\nBest Model: {best_model_name} with Mean MSE: {best_model_score}')

Linear Regression: Mean MSE = 9.664248931897836e+20
Decision Tree Regression: Mean MSE = 0.011580810639277097
Random Forest Regression: Mean MSE = 0.005605328492325405
Gradient Boosting Regression: Mean MSE = 0.0062567190047464975

Best Model: Random Forest Regression with Mean MSE: 0.005605328492325405


# Cross-Validation (K-fold CV)

In [8]:
model = RandomForestRegressor()

# K-fold cross-validation (K=5)
cv_scores = cross_val_score(model, x_train, y_train, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean()}')

Cross-Validation Scores: [0.90743961 0.90446078 0.91960947 0.90396765 0.90730381]
Mean CV Score: 0.90855626310995


In [9]:
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Evaluating the model using R² score & MAE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'R² score on test set: {r2}')
print(f'Mean Absolute Error on test set: {mae}')

R² score on test set: 0.9125977043296024
Mean Absolute Error on test set: 0.048629649281305344


## Regularization:
    Regularization helps prevent overfitting by penalizing large weights in the model.
    L1 Regularization (Lasso) forces some weights to zero, performing feature selection.
    L2 Regularization (Ridge) shrinks weights but keeps all features, controlling the model complexity.
    By using regularization, you help the model generalize better to new, unseen data!
### Lasso Regression (L1 Regularization):

In [10]:
lasso = Lasso(alpha=0.1)
# Performing cross-validation
cv_results = cross_val_score(lasso, x_train, y_train, cv=kFold, scoring='neg_mean_squared_error')
mean_mse = np.mean(np.abs(cv_results))
print(f'Lasso Regression: Mean MSE = {mean_mse}')

Lasso Regression: Mean MSE = 0.043989786351518644


### Ridge Regression (L2 Regularization):

In [11]:
ridge = Ridge(alpha=0.1)
# Performing cross-validation
cv_results = cross_val_score(ridge, x_train, y_train, cv=kFold, scoring='neg_mean_squared_error')
mean_mse = np.mean(np.abs(cv_results))
print(f'Ridge Regression: Mean MSE = {mean_mse}')

Ridge Regression: Mean MSE = 0.0054919978594768866


# Hyperparameter tuning

In [12]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)
print(f'Best Parameters: {grid_search.best_params_}')
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(x_test)

# Calculating R² and MAE
r2 = r2_score(y_test, y_pred) 
mae = mean_absolute_error(y_test, y_pred)

print(f'R² score on test set: {r2}')
print(f'Mean Absolute Error on test set: {mae}')

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
R² score on test set: 0.9142734774971522
Mean Absolute Error on test set: 0.04855360990461025


1. Saving the Model Using joblib (Preferred for large models like Random Forest)
2. Saving the Model Using pickle (Another option for smaller models)

Once saved, you can reuse the model without retraining, for example in a web app or other environments.

## Saving

In [13]:
import joblib

joblib.dump(best_rf, 'best_rf.joblib')

print("Model saved successfully!")

Model saved successfully!


## Loading

In [14]:
import joblib
loaded_model = joblib.load('best_rf.joblib')

In [15]:
loaded_model

In [16]:
y_pred = loaded_model.predict(x_test)

In [17]:
y_pred

array([ 0.17277551,  0.00961294, -0.13416744, ..., -0.28089004,
       -0.40679898, -0.15711837])