In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# Load the listings from dataset.csv (path is relative to the working directory) into `df`.
df = pd.read_csv('dataset.csv')

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive


In [6]:
# Print each column's name, non-null count and dtype to spot missing data and wrong types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     946 non-null    object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   year            1002 non-null   int64  
 5   price           979 non-null    float64
 6   engine          1000 non-null   object 
 7   cylinders       897 non-null    float64
 8   fuel            995 non-null    object 
 9   mileage         968 non-null    float64
 10  transmission    1000 non-null   object 
 11  trim            1001 non-null   object 
 12  body            999 non-null    object 
 13  doors           995 non-null    float64
 14  exterior_color  997 non-null    object 
 15  interior_color  964 non-null    object 
 16  drivetrain      1002 non-null   object 
dtypes: float64(4), int64(1), object(1

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,year,price,cylinders,mileage,doors
count,1002.0,979.0,897.0,968.0,995.0
mean,2023.916168,50202.9857,4.975474,69.033058,3.943719
std,0.298109,18700.392062,1.392526,507.435745,0.274409
min,2023.0,0.0,0.0,0.0,2.0
25%,2024.0,36600.0,4.0,4.0,4.0
50%,2024.0,47165.0,4.0,8.0,4.0
75%,2024.0,58919.5,6.0,13.0,4.0
max,2025.0,195895.0,8.0,9711.0,5.0


In [8]:
# Number of missing values in every column.
df.isna().sum()

Unnamed: 0,0
name,0
description,56
make,0
model,0
year,0
price,23
engine,2
cylinders,105
fuel,7
mileage,34


In [9]:
# Keep only rows with a usable target: price must be present and strictly positive
# (a $0 price is treated as a bad record, not a real listing).
has_valid_price = df['price'].notna() & (df['price'] > 0)

# .copy() makes the filtered result an independent DataFrame. Without it pandas
# treats the result as a possible copy of a slice, and assigning columns on `df`
# afterwards emits SettingWithCopyWarning.
df = df.loc[has_valid_price].copy()

In [10]:
# Fill gaps in the numeric columns with that column's median.
# NOTE: the medians are computed over every remaining row, i.e. before the
# train/test split performed further down.
for numeric_col in ('cylinders', 'mileage', 'doors'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

In [11]:
# Missing categorical values get an explicit 'Unknown' label instead of NaN.
categorical_cols = ['fuel', 'transmission', 'body', 'drivetrain']
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

In [12]:
# Mean price per make, highest first; only the ten most expensive makes are shown.
mean_price_by_make = df.groupby('make')['price'].mean()
top_makes = mean_price_by_make.sort_values(ascending=False).head(10)

print("Average price by car make (top 10):")
print(top_makes)
print("\n")

Average price by car make (top 10):
make
BMW              91366.388889
Mercedes-Benz    81110.153846
Audi             79318.000000
Jaguar           77053.000000
Lexus            73270.000000
Genesis          71007.500000
RAM              68712.233766
Volvo            65126.666667
Land Rover       63162.750000
GMC              61221.304348
Name: price, dtype: float64




In [13]:
# Mean price per body type, most expensive first.
mean_price_by_body = df.groupby('body')['price'].mean()

print("Average price by body type:")
print(mean_price_by_body.sort_values(ascending=False))
print("\n")

Average price by body type:
body
Pickup Truck     60499.986755
Cargo Van        56634.750000
Hatchback        54360.100000
Passenger Van    49310.384615
Sedan            48006.648649
SUV              47927.386067
Minivan          45300.000000
Unknown          43979.000000
Convertible      38040.000000
Name: price, dtype: float64




In [31]:
# Predictor columns: four numeric fields followed by four categorical ones.
features = ['year', 'mileage', 'cylinders', 'doors', 'make', 'fuel', 'body', 'drivetrain']

# Target variable.
y = df['price']

# One-hot encode the categorical predictors; numeric columns pass through unchanged.
X = pd.get_dummies(df[features])

print("Shape after making dummy variables:", X.shape, "\n", sep="\n")

Shape after making dummy variables:
(978, 53)




In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [33]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Ordinary least-squares regression, fitted on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [35]:
# Predict prices for the held-out rows and report the coefficient of determination.
y_pred = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print("R² score:" , test_r2)

R² score: 0.6266903032398441


In [36]:
# Pair each held-out price with the model's prediction; rows keep y_test's index labels.
results = y_test.to_frame('Actual').assign(Predicted=y_pred)

print("\nSome predictions vs actual (first 10):")
print(results.head(10))


Some predictions vs actual (first 10):
      Actual     Predicted
201  83940.0  72206.052892
556  51803.0  49842.090377
176  56105.0  48249.509463
952  37335.0  42437.902849
66   28860.0  45579.163060
505  22260.0  28673.929524
768  29111.0  27610.648082
561  60080.0  49336.749462
614  42150.0  38525.289123
160  45038.0  47491.466121
