# Car Prices Regression
# Method 1: Dummy Variables
# Method 2: One-Hot Encoding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load dataset
data = pd.read_csv("/Users/jacobfrancis/dev/csc180/csv/carprices4.csv")
print("First rows of dataset:\n", data.head())

First rows of dataset:
   Car Model  Mileage  Sell Price($)  Age(yrs)
0    BMW X5    69000          18000         6
1    BMW X5    35000          34000         3
2    BMW X5    57000          26100         5
3    BMW X5    22500          40000         2
4    BMW X5    46000          31500         4


In [3]:
# Create dummy variables for Car Model (drop first to avoid multicollinearity)
dummies = pd.get_dummies(data['Car Model'], drop_first=True)
print("Dummy columns created:", list(dummies.columns))

Dummy columns created: ['BMW X5', 'Mercedez Benz C class']


In [4]:
# Combine dummy variables with original dataset
df_dummy = pd.concat([data, dummies], axis=1)
print(df_dummy.head())

  Car Model  Mileage  Sell Price($)  Age(yrs)  BMW X5  Mercedez Benz C class
0    BMW X5    69000          18000         6    True                  False
1    BMW X5    35000          34000         3    True                  False
2    BMW X5    57000          26100         5    True                  False
3    BMW X5    22500          40000         2    True                  False
4    BMW X5    46000          31500         4    True                  False


In [5]:
# Define features (X) and target (y)
X = df_dummy[['Mileage', 'Age(yrs)'] + list(dummies.columns)]
y = df_dummy['Sell Price($)']

In [7]:
# Train regression model using dummy variables
model = LinearRegression()
model.fit(X, y)

print("Model coefficients:", model.coef_)
print("Model intercept:", model.intercept_)
print("Model R² score:", model.score(X, y))

Model coefficients: [-3.70122094e-01 -1.33245363e+03 -4.28466659e+03  2.45354074e+03]
Model intercept: 56523.08523127496
Model R² score: 0.9417050937281083


In [8]:
# Prepare input feature order for predictions
cols = ['Mileage', 'Age(yrs)'] + list(dummies.columns)
print("Feature order for prediction:", cols)

Feature order for prediction: ['Mileage', 'Age(yrs)', 'BMW X5', 'Mercedez Benz C class']


In [9]:
# Predict using dummy variables
features_mercedez = [45000, 4] + [1 if col == 'Mercedez Benz C class' else 0 for col in dummies.columns]
features_bmw = [86000, 7] + [0 for col in dummies.columns]

pred1 = model.predict([features_mercedez])
pred2 = model.predict([features_bmw])

print("Predicted price (Mercedez Benz C class, 45000 miles, 4 yrs):", pred1[0])
print("Predicted price (BMW X5, 86000 miles, 7 yrs):", pred2[0])

Predicted price (Mercedez Benz C class, 45000 miles, 4 yrs): 36991.31721061283
Predicted price (BMW X5, 86000 miles, 7 yrs): 15365.409720591691




In [10]:
# Label encoding for Car Model
le = LabelEncoder()
data['Car Model Encoded'] = le.fit_transform(data['Car Model'])
print("Label Encoded Classes:", list(le.classes_))

Label Encoded Classes: ['Audi A5', 'BMW X5', 'Mercedez Benz C class']


In [11]:
# One-Hot Encoding using ColumnTransformer
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough'
)

X2 = ct.fit_transform(data[['Car Model Encoded', 'Mileage', 'Age(yrs)']])
y2 = data['Sell Price($)']

In [12]:
# Train model using One-Hot Encoding
model2 = LinearRegression()
model2.fit(X2, y2)

print("Coefficients:", model2.coef_)
print("Intercept:", model2.intercept_)
print("R² score:", model2.score(X2, y2))
print("OneHot categories:", ct.named_transformers_['encoder'].categories_)

Coefficients: [-4.28466659e+03  2.45354074e+03 -3.70122094e-01 -1.33245363e+03]
Intercept: 56523.08523127322
R² score: 0.9417050937281082
OneHot categories: [array([0, 1, 2])]


In [13]:
# Predictions using One-Hot Encoding
pred1_ohe = model2.predict([[0, 0, 45000, 4]])
pred2_ohe = model2.predict([[0, 1, 86000, 7]])

In [14]:
# Compare both encoding methods
print("\nDummy Variables")
print("Predicted price (Mercedez Benz C class, 45000 miles, 4 yrs):", pred1[0])
print("Predicted price (BMW X5, 86000 miles, 7 yrs):", pred2[0])

print("\nOne-Hot Encoding")
print("Predicted price (Mercedez Benz C class, 45000 miles, 4 yrs):", pred1_ohe[0])
print("Predicted price (BMW X5, 86000 miles, 7 yrs):", pred2_ohe[0])


Dummy Variables
Predicted price (Mercedez Benz C class, 45000 miles, 4 yrs): 36991.31721061283
Predicted price (BMW X5, 86000 miles, 7 yrs): 15365.409720591691

One-Hot Encoding
Predicted price (Mercedez Benz C class, 45000 miles, 4 yrs): 34537.77647335152
Predicted price (BMW X5, 86000 miles, 7 yrs): 17818.950457856758
