In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score , make_scorer, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tabulate import tabulate

In [2]:
# Load the "Car features and MSRP" dataset into `df`.
# NOTE(review): this is an absolute path on the author's machine; it has to
# be adjusted before the notebook can run anywhere else.
DATA_PATH = r'D:\2025 lessons\AI+ML course amaliyot\Datasets\Car features and MSRP\data.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [5]:
# Number of missing values in each column.
df.isna().sum()

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

In [10]:
# Names of the columns that contain at least one missing value; the
# imputation cell iterates over this index.
has_missing = df.isna().any(axis=0)
missing_cols = df.columns[has_missing]
missing_cols

Index(['Engine Fuel Type', 'Engine HP', 'Engine Cylinders', 'Number of Doors',
       'Market Category'],
      dtype='object')

In [None]:
# NOTE(review): this only counts fully duplicated rows; no visible cell drops
# them, so identical rows can end up on both sides of the train/test split.
df.duplicated().sum()   # check for duplicate rows

np.int64(715)

# Missing values

In [15]:
# Impute every column listed in `missing_cols`: numeric columns receive their
# mean, every other column receives its most frequent value (mode).
#
# `is_numeric_dtype` replaces the former `dtype == 'object'` comparison so
# that text columns held in pandas' dedicated string dtype (or as
# categoricals) are still routed to the mode branch instead of `.mean()`.
#
# NOTE(review): the fill values are computed on the whole frame, i.e. before
# the train/test split performed further down.
for col in missing_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].mean()
    else:
        fill_value = df[col].mode()[0]
    df[col] = df[col].fillna(fill_value)

# Confirm that no column reports missing values any more.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11914 non-null  object 
 4   Engine HP          11914 non-null  float64
 5   Engine Cylinders   11914 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11914 non-null  float64
 9   Market Category    11914 non-null  object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

# Scaling and encoding

In [29]:
# Turn every column of `df` into a standardised numeric feature, in place:
#   1. non-numeric columns are frequency-encoded (each value is replaced by
#      the number of rows sharing that value);
#   2. every column is then standardised with its own StandardScaler.
#
# `is_numeric_dtype` replaces the former `dtype == 'object'` comparison so
# that text columns held in pandas' dedicated string dtype (or as
# categoricals) are encoded first rather than being handed to StandardScaler
# as raw strings.
#
# NOTE(review): the loop covers all columns, so the target 'MSRP' is
# standardised as well and the fitted scaler is discarded -- downstream
# predictions and MSE are therefore in standardised units, not dollars.
# NOTE(review): counts and scaling statistics are computed on the full frame
# before the train/test split, so test rows influence the preprocessing.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(df[col].value_counts())   # frequency encoding
    df[col] = StandardScaler().fit_transform(df[[col]])  # scale every column
  

# Model selection

In [33]:
# Separate the predictors from the 'MSRP' target.
x = df.drop('MSRP', axis=1)
y = df['MSRP']

# 80 % of the rows go to training; the remaining 20 % is halved into a test
# part and a validation part.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_test, x_val, y_test, y_val = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42
)

In [34]:
# (rows, feature columns) of the test predictors.
x_test.shape

(1191, 15)

In [35]:
# Number of test targets; should match the row count of x_test.
y_test.shape

(1191,)

In [36]:
# (rows, feature columns) of the validation predictors.
x_val.shape

(1192, 15)

# Model Fitting

In [54]:
# Baseline: a decision tree with default hyperparameters (fixed seed),
# trained on the training split. `fit` returns the estimator itself.
model = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
model

In [55]:
# Baseline predictions for the test split.
y_pred = model.predict(x_test)
y_pred

array([-0.64210501, -0.08185022, -0.41475886, ..., -0.04628839,
       -0.16848799,  0.99831436], shape=(1191,))

In [56]:
# Score the untuned tree on the test split.
MSE, R2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE = {MSE}   R2 = {R2}")

MSE = 0.039963165258386044   R2 = 0.9442224620090256


# Hyperparameter tuning: GridSearchCV

In [48]:
# Search space for the decision tree: 6 * 3 * 3 * 3 = 162 combinations.
param_grid = dict(
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)


In [38]:
# Scorer objects for the grid search. MSE is declared as a loss
# (greater_is_better=False), so its scorer reports the sign-flipped value;
# R² keeps the default "higher is better" orientation.
mse_scorer, r2_scorer = (
    make_scorer(mean_squared_error, greater_is_better=False),
    make_scorer(r2_score),
)

In [39]:
# Exhaustive 5-fold search over `param_grid`, evaluated with both scorers on
# all available cores; `refit='R²'` names R² as the metric that selects the
# final estimator.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring={'MSE': mse_scorer, 'R²': r2_scorer},
    refit='R²',
    cv=5,
    n_jobs=-1,
)

In [40]:
# Display the configured (not yet fitted) search object.
grid_search

In [41]:
# Run the cross-validated search on the training split only.
grid_search.fit(x_train,y_train)

In [42]:
# Report the parameter combination selected by the search.
print(f"Best Parameters: {grid_search.best_params_}")

Best Parameters: {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [None]:
# Estimator exposed by the grid search for the best-scoring parameters.
best_model = grid_search.best_estimator_
best_model

In [60]:
# Predictions of the tuned tree for the test split.
best_y_pred = best_model.predict(x_test)
# Alternative noted by the author: grid_search.predict(x_test)
# (presumably equivalent, since the search was configured with `refit`).
best_y_pred


array([-0.64210501, -0.08185022, -0.41475886, ..., -0.04628839,
       -0.16848799,  1.08753088], shape=(1191,))

In [58]:
# Score the tuned tree on the same test split as the baseline.
best_MSE, best_R2 = (
    mean_squared_error(y_test, best_y_pred),
    r2_score(y_test, best_y_pred),
)
print(f"best MSE = {best_MSE}  best R2 = {best_R2}")

best MSE = 0.03801059385603557  best R2 = 0.9469477122456004


In [61]:
# Side-by-side comparison of the test-set metrics before and after tuning.
# The first row of `table` is the header row (headers='firstrow').
# `tabulate` is imported with the other dependencies in the notebook's
# import cell rather than here, so all requirements are visible in one place.
table = [
    ['Metric', 'Before tuning', 'GridSearchCV tuning'],
    ['Mean squared error', MSE, best_MSE],
    ['R2 score', R2, best_R2],
]
print(tabulate(table, headers='firstrow', tablefmt='grid'))

+--------------------+-----------------+-----------------------+
| Metric             |   Before tuning |   GridSearchCV tuning |
| Mean squared error |       0.0399632 |             0.0380106 |
+--------------------+-----------------+-----------------------+
| R2 score           |       0.944222  |             0.946948  |
+--------------------+-----------------+-----------------------+
