In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Load the raw car-price table from the working directory and list its columns.
csv_path = 'CarPrice_Assignment.csv'
df = pd.read_csv(csv_path)
df.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [7]:
# Pearson correlation of every numeric column with `price`, lowest first.
# The frame also holds string columns (CarName, fueltype, ...); restricting to
# numeric dtypes first keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer drops non-numeric columns silently.
df.select_dtypes(include='number').corr()['price'].sort_values()

highwaympg         -0.697599
citympg            -0.685751
car_ID             -0.109093
peakrpm            -0.085267
symboling          -0.079978
compressionratio    0.067984
stroke              0.079443
carheight           0.119336
boreratio           0.553173
wheelbase           0.577816
carlength           0.682920
carwidth            0.759325
horsepower          0.808139
curbweight          0.835305
enginesize          0.874145
price               1.000000
Name: price, dtype: float64

In [8]:
# Correlation of each numeric column with `price`, computed once and reused
# (non-numeric columns are excluded so this also runs on pandas >= 2.0).
price_corr = df.select_dtypes(include='number').corr()['price'].sort_values()

# Keep the columns whose correlation lies strictly beyond the two cut-offs.
# NOTE(review): the cut-offs are the rounded values displayed for `carwidth`
# and `citympg` in the previous cell; those two columns appear in the result
# only because their unrounded correlations fall just past the rounded
# numbers. Consider replacing them with explicit, rounder thresholds.
cols = price_corr[(price_corr > 0.759325) | (price_corr < -0.685751)].index
cols

Index(['highwaympg', 'citympg', 'carwidth', 'horsepower', 'curbweight',
       'enginesize', 'price'],
      dtype='object')

In [9]:
# Extra predictors to keep in addition to the correlation-selected columns.
add = [
    'compressionratio',
    'fueltype',
    'aspiration',
    'cylindernumber',
    'drivewheel',
]

# Prepend each extra column to `cols` (so the last entry of `add` ends up
# first). The membership check makes the cell safe to re-run: without it a
# second execution would insert duplicate labels into `cols`.
for i in add:
    if i not in cols:
        cols = cols.insert(0, i)

In [10]:
# Restrict the frame to the selected columns. An explicit copy is taken
# because the frame is modified later in the notebook; mutating a bare column
# subset raises SettingWithCopyWarning (currently hidden by the global
# warnings filter).
df = df[cols].copy()

In [11]:
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Split the frame into target and predictors. The target is selected by name
# rather than by position so the cell does not depend on `price` being the
# last column, and the drop returns a new frame instead of mutating in place.
y = df['price']
df = df.drop(columns='price')

# Column groups routed to the two preprocessing steps below.
numberCols = df.select_dtypes(include='number').columns.to_list()
objectCols = df.select_dtypes(include='object').columns.to_list()

scaler = MinMaxScaler()
encoder = OrdinalEncoder()

# Numeric columns are min-max scaled; string columns are mapped to ordinal
# integer codes. The output columns follow this order: numeric first, then
# categorical.
final_pipe = ColumnTransformer([
    ('num', scaler, numberCols),
    ('cat', encoder, objectCols)
])

# NOTE(review): both transformers are fitted on the full data set, before the
# train/test split performed in the next cell, so test rows influence the
# learned ranges and category codes.
X = final_pipe.fit_transform(df)

# The target is min-max scaled as well. MinMaxScaler needs 2-D input, hence the
# reshape; `y` therefore ends up as an (n_samples, 1) array. After this line
# `scaler` holds the fit for the target; per the scikit-learn docs
# ColumnTransformer fits clones of its transformers, so the feature scaling
# inside `final_pipe` should be unaffected -- confirm if `scaler` is reused.
y = scaler.fit_transform(y.to_numpy().reshape(-1, 1))

In [12]:
# Hold out a quarter of the rows for evaluation (0.25 is scikit-learn's
# default test size, spelled out here); the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor


# Candidate regressors keyed by the label used in the printed report.
# Every estimator that involves randomness gets the same fixed seed so the
# reported scores are reproducible from run to run (previously only the
# random forest was seeded).
car_price_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42)
}

# Fit each candidate on the training split and report its score on the
# held-out split. For scikit-learn regressors `.score` is the R^2 coefficient.
for name, model in car_price_models.items():
    # y_train is an (n_samples, 1) column; flatten it to 1-D, which is the
    # shape these estimators expect (a column vector triggers a
    # DataConversionWarning that the global warnings filter hides).
    model.fit(x_train, y_train.ravel())
    score = model.score(x_test, y_test)

    print(f'{name}: {score*100:.3f}%')

Linear Regression: 78.430%
Random Forest: 94.476%
Gradient Boosting: 91.374%
DecisionTreeRegressor: 87.237%


So I chose Random Forest, which achieved the highest test-set R² score (94.48%).


In [14]:
# Refit the selected model (random forest, fixed seed) on the training split
# and report its R^2 on the held-out split.
model = RandomForestRegressor(random_state=42)
# Flatten the (n_samples, 1) target to 1-D, the shape the estimator expects;
# passing the column vector only works with a (suppressed) DataConversionWarning.
model.fit(x_train, y_train.ravel())
print(f'score {model.score(x_test, y_test)*100:.2f}%')

score 94.48%
