In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 2: Load dataset
df = pd.read_csv('Sport car price.csv')

# Step 3: Convert Price to numeric (remove commas)
df['Price (in USD)'] = df['Price (in USD)'].replace(r'[\,]', '', regex=True).astype(float)

# Step 4: Drop non-numeric / problematic columns
columns_to_drop = ['Car Model', 'Engine Size (L)']  # Engine Size contains 'Electric'
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Step 5: Encode categorical columns (Car Make)
df = pd.get_dummies(df, columns=['Car Make'], drop_first=True)

# Step 6: Convert remaining numeric columns from string to float
numeric_cols = ['Horsepower', 'Torque (lb-ft)', '0-60 MPH Time (seconds)']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col].replace(r'[\,]', '', regex=True), errors='coerce')

# Step 7: Fill missing values
df = df.fillna(df.mean())

# Step 8: Features and Target
X = df.drop('Price (in USD)', axis=1)
y = df['Price (in USD)']

# Step 9: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 10: Train Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Step 11: Make Predictions
y_pred = model.predict(X_test)

# Step 12: Evaluate Regression Metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("R²:", r2)

# Step 13: Show Actual vs Predicted (first 20 for report)
comparison = pd.DataFrame({
    'Actual Price': y_test.values,
    'Predicted Price': y_pred
})
print(comparison.head(20))  # Show first 20 rows

# Step 14: Optional – create “bins” for pseudo confusion matrix
# This is ONLY if you want to show confusion matrix style comparison
bins = [0, 50000, 100000, 150000, 200000, 300000, np.inf]
y_test_binned = pd.cut(y_test, bins=bins)
y_pred_binned = pd.cut(y_pred, bins=bins)

confusion = pd.crosstab(y_test_binned, y_pred_binned)
print("\nPseudo Confusion Matrix (Price Ranges):")
print(confusion)


MAE: 116221.79836443882
MSE: 106583300634.4189
R²: 0.8243126430040657
    Actual Price  Predicted Price
0        75250.0     1.019728e+05
1       201000.0     2.043567e+05
2       310000.0     2.580408e+05
3       220000.0    -1.767032e+04
4        71500.0    -1.007560e+05
5        69995.0     6.576204e+04
6       305000.0     2.580408e+05
7       325000.0     2.710441e+05
8       201495.0     1.089292e+05
9       150980.0     1.329589e+05
10      500000.0     4.272887e+05
11       61600.0    -1.035238e+05
12       84595.0     9.177665e+04
13      208800.0     2.901415e+05
14      131190.0     1.923674e+04
15      117000.0     6.788848e+04
16       67150.0     1.716963e+04
17       64195.0     8.341513e+04
18       45790.0     4.922648e+04
19     1700000.0     1.550000e+06

Pseudo Confusion Matrix (Price Ranges):
col_0                 (0.0, 50000.0]  (50000.0, 100000.0]  \
Price (in USD)                                              
(0.0, 50000.0]                     3                 