In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import pickle

In [14]:
df = pd.read_csv("car-price-prediction/CarPrice_Assignment.csv")  # Adjust path if needed
print("First 5 rows:")
print(df.head())
print("\nDataset info:")
print(df.info())


First 5 rows:
   car_ID  symboling                   CarName fueltype aspiration doornumber  \
0       1          3        alfa-romero giulia      gas        std        two   
1       2          3       alfa-romero stelvio      gas        std        two   
2       3          1  alfa-romero Quadrifoglio      gas        std        two   
3       4          2               audi 100 ls      gas        std       four   
4       5          2                audi 100ls      gas        std       four   

       carbody drivewheel enginelocation  wheelbase  ...  enginesize  \
0  convertible        rwd          front       88.6  ...         130   
1  convertible        rwd          front       88.6  ...         130   
2    hatchback        rwd          front       94.5  ...         152   
3        sedan        fwd          front       99.8  ...         109   
4        sedan        4wd          front       99.4  ...         136   

   fuelsystem  boreratio  stroke compressionratio horsepower  peak

In [15]:
df = df.drop(['car_ID'], axis=1)


In [16]:
df['CarBrand'] = df['CarName'].apply(lambda x: x.split(' ')[0])
df = df.drop('CarName', axis=1)


In [17]:
categorical_cols = ['fueltype','aspiration','doornumber','carbody','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem','CarBrand']

In [18]:
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [19]:
print("\nMissing values:")
print(df.isnull().sum())
# Drop rows with missing values (if any)
df = df.dropna()



Missing values:
symboling              0
wheelbase              0
carlength              0
carwidth               0
carheight              0
                      ..
CarBrand_toyouta       0
CarBrand_vokswagen     0
CarBrand_volkswagen    0
CarBrand_volvo         0
CarBrand_vw            0
Length: 71, dtype: int64


In [None]:
numeric_cols = ['wheelbase', 'carlength', 'carwidth', 'carheight',
                'curbweight', 'enginesize', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']

X = df[numeric_cols]
y = df['price']

print("\nFeature columns:")
print(X.columns.tolist())
print("\nTarget column:")
print(y.name)



Feature columns:
['symboling', 'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'fueltype_gas', 'aspiration_turbo', 'doornumber_two', 'carbody_hardtop', 'carbody_hatchback', 'carbody_sedan', 'carbody_wagon', 'drivewheel_fwd', 'drivewheel_rwd', 'enginelocation_rear', 'enginetype_dohcv', 'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv', 'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four', 'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve', 'cylindernumber_two', 'fuelsystem_2bbl', 'fuelsystem_4bbl', 'fuelsystem_idi', 'fuelsystem_mfi', 'fuelsystem_mpfi', 'fuelsystem_spdi', 'fuelsystem_spfi', 'CarBrand_alfa-romero', 'CarBrand_audi', 'CarBrand_bmw', 'CarBrand_buick', 'CarBrand_chevrolet', 'CarBrand_dodge', 'CarBrand_honda', 'CarBrand_isuzu', 'CarBrand_jaguar', 'CarBrand_maxda', 'CarBrand_mazda', 'CarBrand_mercury', '

In [21]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [23]:

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

results = {}


In [24]:

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    results[name] = {"r2_score": r2, "mae": mae}
    print(f"{name}: R2 = {r2:.4f}, MAE = {mae:.2f}")


Linear Regression: R2 = -136890096237173440512.0000, MAE = 16235052876289.22
Decision Tree: R2 = 0.9031, MAE = 1847.04
Random Forest: R2 = 0.9577, MAE = 1296.71


In [25]:
best_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)


In [26]:
pickle.dump(best_model, open('model.pkl', 'wb'))
pickle.dump(scaler, open('scaler.pkl', 'wb'))

print("\nModel and scaler saved as model.pkl and scaler.pkl")


Model and scaler saved as model.pkl and scaler.pkl
