# Car Price Prediction Model [Feature Engineering]

### 📦 Importing Libraries 

In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 1.3 MB/s eta 0:00:45
    --------------------------------------- 1.0/56.8 MB 1.8 MB/s eta 0:00:32
   - -------------------------------------- 2.6/56.8 MB 3.6 MB/s eta 0:00:15
   ---- ----------------------------------- 6.6/56.8 MB 7.5 MB/s eta 0:00:07
   --------- ------------------------------ 13.1/56.8 MB 12.0 MB/s eta 0:00:04
   ---------- ----------------------------- 14.9/56.8 MB 11.5 MB/s eta 0:00:04
   --------------- ------------------------ 21.8/56.8 MB 14.5 MB/s eta 0:00:03
   ------------------ ------


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np

# Models and tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score

In [3]:
# Read the feature table from a CSV file in the working directory and show it
# as the cell's output.
# NOTE(review): the rendered output lists an `Unnamed: 0` column, which is the
# name pandas gives a header-less first CSV column -- presumably a row index
# that was written out when the file was saved; confirm.
df = pd.read_csv(filepath_or_buffer='new_features.csv')
df

Unnamed: 0,Price,Kilometers,Transmission,Year,Doors,Seats,Warranty,Number_of_Owners,Car_Age,Kilometers_per_Year,...,Brand_Skoda,Brand_Smart,Brand_Ssang Yong,Brand_Subaru,Brand_Suzuki,Brand_Tank,Brand_Tesla,Brand_Toyota,Brand_Volkswagen,Brand_Volvo
0,6000.0,100.0,0,2024,4,5,1,1,1,100.000000,...,False,False,False,False,False,False,False,False,False,False
1,39900.0,43000.0,0,2021,4,5,1,1,4,10750.000000,...,False,False,False,False,False,False,False,False,False,False
2,8500.0,3383.0,0,2023,4,5,1,1,2,1691.500000,...,False,False,False,False,False,False,False,False,False,False
3,14000.0,39694.0,0,2023,4,5,1,1,2,19847.000000,...,False,False,False,False,False,False,False,False,False,False
4,8400.0,235793.0,0,2014,4,7,1,1,11,21435.727273,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5074,750.0,12000.0,0,2008,4,5,1,10,17,705.882353,...,False,False,False,False,False,False,False,False,False,False
5075,1500.0,18000.0,0,2008,2,4,2,1,17,1058.823529,...,False,False,False,False,False,False,False,False,False,False
5076,900.0,180000.0,0,2013,4,5,1,2,12,15000.000000,...,False,False,False,False,False,False,False,False,True,False
5077,5000.0,4500.0,0,2021,4,5,1,1,4,1125.000000,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# ----- 1. Prepare Data -----
# Drop target leakage or unnecessary columns if any

# Target: the listing price. Features: every other column.
X = df.drop(['Price'], axis=1)
y = df['Price']

# `Unnamed: 0` is the name pandas assigns to a header-less first CSV column.
# In the displayed frame it runs 0..5077, i.e. it looks like a saved row index
# rather than a property of the car, so it must not be used as a predictor.
# errors='ignore' keeps the cell working when the file has no such column.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Ensure all features are numeric (after encoding): any remaining
# object/category columns are one-hot encoded, dropping the first level of each.
X = pd.get_dummies(X, drop_first=True)

In [5]:
# ----- 2. Train/Test Split -----
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# ----- 3. Train Models -----
# Every estimator is fitted on the training split and then used to predict the
# held-out split; the predictions are kept in `y_pred_*` for the scoring cell.

## Linear Regression
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

## Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## XGBoost
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# ----- Polynomial Regression -----
# Degree-2 feature expansion: fitted on the training split only, then the same
# fitted transformer is applied to the test split.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

# Ordinary linear regression on the expanded features.
poly_reg = LinearRegression().fit(X_poly, y_train)
y_pred_poly = poly_reg.predict(X_poly_test)

In [7]:
# ----- 4. Evaluate -----
# Report the coefficient of determination of each model on the held-out split.
print("📊 R² Scores:")

r2_lin = r2_score(y_test, y_pred_lin)
r2_rf = r2_score(y_test, y_pred_rf)
r2_xgb = r2_score(y_test, y_pred_xgb)
r2_poly = r2_score(y_test, y_pred_poly)

print(f"Linear Regression:      {r2_lin:.4f}")
print(f"Random Forest:          {r2_rf:.4f}")
print(f"XGBoost:                {r2_xgb:.4f}")
print(f"Polynomial Regression:  {r2_poly:.4f}")

📊 R² Scores:
Linear Regression:      0.4698
Random Forest:          0.6730
XGBoost:                0.7062
Polynomial Regression:  0.5753


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

# Second experiment: compare four regressors on two raw features and persist
# the one with the highest test-set R².
# NOTE: this cell rebinds X, y, X_train, X_test, y_train and y_test, replacing
# the values created by the earlier feature-engineered experiment.

# 1. Load dataset
cleaned = pd.read_csv("cleaned_cars_data.csv")
print(cleaned.head())
print(cleaned.columns)

# 2. Define features and target
X = cleaned[["Year", "Kilometers"]]
y = cleaned["Price"]

# 3. Split data (20% held out, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Define models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=300, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=6, random_state=42),
    "Polynomial Regression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
}

# 5. Train, evaluate, and select best
results = {}
best_model = None
best_name = None
# R² has no lower bound (a model can be arbitrarily worse than predicting the
# mean), so the running maximum must start at -inf rather than -1; otherwise a
# run where every model scores <= -1 would select nothing.
best_score = float("-inf")

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[name] = r2
    print(f"{name}: R² = {r2:.4f}")

    # Strict comparison: on a tie the model listed first in `models` is kept.
    if r2 > best_score:
        best_score = r2
        best_model = model
        best_name = name

# Refuse to write a pickle of `None` and report success for it.
if best_model is None:
    raise RuntimeError("No model produced a comparable R² score; nothing to save.")

print("\n📊 Best model:", best_name, "with R² =", best_score)

# 6. Save best model
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("✅ Best model saved as best_model.pkl")


     Price  Kilometers Transmission      Fuel  Year     Brand Body Type  \
0   6000.0       100.0    Automatic  Gasoline  2024     Chery     Sedan   
1  39900.0     43000.0    Automatic  Electric  2021   Porsche     Sedan   
2   8500.0      3383.0    Automatic  Gasoline  2023       Kia       SUV   
3  14000.0     39694.0    Automatic  Gasoline  2023  Infiniti     Sedan   
4   8400.0    235793.0    Automatic  Gasoline  2014     Lexus       SUV   

   Color  Doors  Seats      Interior Warranty  Number_of_Owners  
0   Grey      4      5  Part Leather       No                 1  
1  White      4      5  Full Leather       No                 1  
2  Brown      4      5  Part Leather       No                 1  
3   Blue      4      5  Full Leather       No                 1  
4  White      4      7  Full Leather       No                 1  
Index(['Price', 'Kilometers', 'Transmission', 'Fuel', 'Year', 'Brand',
       'Body Type', 'Color', 'Doors', 'Seats', 'Interior', 'Warranty',
       'Num