In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# -----------------------------
# Load dataset and drop what cannot be used
# The three free-text columns are discarded, and any row without a target
# value ('price') is removed, in a single chained expression.
df = (
    pd.read_csv("dataset.csv")  # Change to your filename
    .drop(columns=['name', 'description', 'engine'])
    .dropna(subset=['price'])
)

# -----------------------------
# Fill missing values
# Numeric columns -> per-column median.
# NOTE(review): the medians are computed on the whole frame, i.e. before the
# train/test split performed further down, so test rows contribute to the
# imputation values. Confirm this is acceptable for the reported metrics.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
df = df.fillna(df[num_cols].median())

# Categorical (object) columns -> the literal placeholder 'Unknown'
cat_cols = df.select_dtypes(include='object').columns
df = df.fillna({col: 'Unknown' for col in cat_cols})

# -----------------------------
# One-hot encode categorical columns
# Each listed column is replaced by indicator columns; drop_first=True omits
# the first level of every column.
categorical_cols = [
    'make', 'model', 'fuel', 'transmission', 'trim',
    'body', 'exterior_color', 'interior_color', 'drivetrain',
]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# -----------------------------
# Feature & target split
# 'price' is the target; every remaining column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

# -----------------------------
# Split and scale
# Hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and then applied to both
# splits, so the test split does not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# -----------------------------
# 1. Linear Regression
# Baseline model: fit on the scaled training split, then report R², MAE and
# MSE on the held-out test split. r2_lr is kept for the model comparison and
# best-model selection further down.
print("\n🔹 Linear Regression")
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)
print("R²:", r2_lr)
print("MAE:", mean_absolute_error(y_test, y_pred_lr))
print("MSE:", mean_squared_error(y_test, y_pred_lr))

# -----------------------------
# 2. Decision Tree Regressor
# Tune max_depth and min_samples_split with a 5-fold cross-validated grid
# search on the training split (3 x 3 = 9 candidates). No `scoring` is passed,
# so the search ranks candidates with the estimator's own score method.
# Predictions come from the search object after fitting, and the test-split
# R², MAE and MSE are reported. r2_dt is kept for the comparison below.
print("\n🔹 Decision Tree Regressor")
dt = DecisionTreeRegressor(random_state=42)
param_dt = {'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10]}
grid_dt = GridSearchCV(dt, param_dt, cv=5, n_jobs=-1, verbose=1)
grid_dt.fit(X_train, y_train)
y_pred_dt = grid_dt.predict(X_test)
r2_dt = r2_score(y_test, y_pred_dt)
print("Best Params:", grid_dt.best_params_)
print("R²:", r2_dt)
print("MAE:", mean_absolute_error(y_test, y_pred_dt))
print("MSE:", mean_squared_error(y_test, y_pred_dt))

# -----------------------------
# 3. Random Forest Regressor
# Tune n_estimators and max_depth with a 5-fold cross-validated grid search
# on the training split (2 x 3 = 6 candidates). No `scoring` is passed, so the
# search ranks candidates with the estimator's own score method.
# Predictions come from the search object after fitting, and the test-split
# R², MAE and MSE are reported. r2_rf is kept for the comparison below.
print("\n🔹 Random Forest Regressor")
rf = RandomForestRegressor(random_state=42)
param_rf = {'n_estimators': [50, 100], 'max_depth': [5, 10, 15]}
grid_rf = GridSearchCV(rf, param_rf, cv=5, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)
y_pred_rf = grid_rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
print("Best Params:", grid_rf.best_params_)
print("R²:", r2_rf)
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("MSE:", mean_squared_error(y_test, y_pred_rf))

# -----------------------------
# 📊 Compare models
# Side-by-side view of the test-split R² of the three models, rounded to four
# decimal places.
print("\n📊 Model Comparison")
print(f"Linear Regression R²:     {r2_lr:.4f}")
print(f"Decision Tree R²:         {r2_dt:.4f}")
print(f"Random Forest R²:         {r2_rf:.4f}")

# -----------------------------
# 💾 Save the best model
# Choose the model with the highest test-split R². max() returns the first
# maximal entry, so ties resolve in list order: Random Forest, then Decision
# Tree, then Linear Regression.
# NOTE(review): the winner is picked using the same test split that the
# metrics above are reported on, so the reported score of the saved model is
# an optimistic estimate; consider selecting on cross-validation scores.
model_scores = [
    (r2_rf, grid_rf.best_estimator_),
    (r2_dt, grid_dt.best_estimator_),
    (r2_lr, lr),
]
best_model = max(model_scores, key=lambda entry: entry[0])[1]

# The models were fitted on scaled features, so the saved estimator expects
# inputs that have been passed through the same fitted scaler.
joblib.dump(best_model, "best_vehicle_price_model.pkl")
print("\n✅ Best model saved as: best_vehicle_price_model.pkl")



🔹 Linear Regression
R²: 0.7665451713561953
MAE: 5051.192830977877
MSE: 71294857.98024048

🔹 Decision Tree Regressor
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Params: {'max_depth': 15, 'min_samples_split': 10}
R²: 0.7540921481189233
MAE: 5538.968544141418
MSE: 75097891.4333654

🔹 Random Forest Regressor
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Params: {'max_depth': 15, 'n_estimators': 100}
R²: 0.793888892107298
MAE: 4943.898241080319
MSE: 62944348.80925377

📊 Model Comparison
Linear Regression R²:     0.7665
Decision Tree R²:         0.7541
Random Forest R²:         0.7939

✅ Best model saved as: best_vehicle_price_model.pkl


In [5]:
# Persist the fitted scaler so the same scaling can be re-applied later
joblib.dump(scaler, "scaler.pkl")

# Persist the feature column names, in order, as a plain list
# (optional safety check for inputs)
joblib.dump(list(X.columns), "input_columns.pkl")


['input_columns.pkl']