In [37]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

## Data preparation

In [38]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-11-08 19:44:34--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.2’


2025-11-08 19:44:34 (37.2 MB/s) - ‘car_fuel_efficiency.csv.2’ saved [874188/874188]



In [39]:
df = pd.read_csv('car_fuel_efficiency.csv')

In [40]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [41]:
df.columns = df.columns.str.lower()

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

float_columns = list(df.dtypes[df.dtypes == 'float64'].index)
for col in float_columns:
    df[col] = np.round(np.array(df[col]),decimals=1)

df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.4,17.7,2003,europe,gasoline,all-wheel_drive,0.0,13.2
1,130,5.0,97.0,3149.7,17.8,2007,usa,gasoline,front-wheel_drive,0.0,13.7
2,170,,78.0,3079.0,15.1,2018,europe,gasoline,front-wheel_drive,0.0,14.2
3,220,4.0,,2542.4,20.2,2009,usa,diesel,all-wheel_drive,2.0,16.9
4,210,1.0,140.0,3460.9,14.4,2009,europe,gasoline,all-wheel_drive,2.0,12.5


In [42]:
df = df.fillna(0)

In [43]:
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [44]:
df.nunique()

engine_displacement      36
num_cylinders            14
horsepower              193
vehicle_weight         7492
acceleration            163
model_year               24
origin                    3
fuel_type                 2
drivetrain                2
num_doors                 9
fuel_efficiency_mpg     171
dtype: int64

In [45]:
df = df.rename(columns={'fuel_efficiency_mpg':'mpg'})

In [46]:
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'mpg'],
      dtype='object')

Now we're ready to prepare the data for training:

* First, do train-validation-test split
* Then, apply one-hot encoding to categorical features and get the feature matrix 

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

In [49]:
y_train = df_train.mpg.values
y_val = df_val.mpg.values

In [50]:
y_train

array([15.3, 15.3, 15.3, ..., 15.2, 17.4, 16.2], shape=(5822,))

In [51]:
del df_train['mpg']
del df_val['mpg']

In [52]:
len(df_train), len(df_val), len(df_test)

(5822, 1941, 1941)

For OHE, we'll use `DictVectorizer`

In [53]:
from sklearn.feature_extraction import DictVectorizer

In [54]:
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')

In [55]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

In [56]:
X_train.shape

(5822, 14)

Now we're ready to train a model. We'll start with decision trees

## Decision trees

We'll use `DecisionTreeClassifier` and for evaluating the quality of our models, we'll use AUC


In [57]:
from sklearn.tree import DecisionTreeRegressor

Let's fit the tree with default parameters

In [58]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [59]:
importances = list(zip(dv.feature_names_, dt.feature_importances_))

df_importance = pd.DataFrame(importances, columns=['feature', 'gain'])
df_importance = df_importance.sort_values(by='gain', ascending=False)
df_importance

Unnamed: 0,feature,gain
13,vehicle_weight,1.0
0,acceleration,0.0
2,drivetrain=front-wheel_drive,0.0
3,engine_displacement,0.0
4,fuel_type=diesel,0.0
1,drivetrain=all-wheel_drive,0.0
5,fuel_type=gasoline,0.0
6,horsepower,0.0
8,num_cylinders,0.0
7,model_year,0.0


## Answer 1: Vehicle Weight

<br>
<br>
<br>

### Question 2

In [60]:
from sklearn.ensemble import RandomForestRegressor

In [61]:
rf = RandomForestRegressor(n_estimators=1,
                           random_state=1,
                           n_jobs=-1)

In [62]:
rf.fit(X_train,y_train)

0,1,2
,n_estimators,1
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [63]:
y_pred = rf.predict(X_val)

In [64]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)
    
rmse(y_pred,y_val)

np.float64(0.6124965171441)

## Answer 2: 0.46

<br>
<br>
<br>

### Question 3

scores = []

for i in range(10,201,10):
    
    rf = RandomForestRegressor(n_estimators=i,
                               random_state=1,
                               n_jobs=-1)
    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_val)
    score = rmse(y_pred,y_val)
    scores.append(score)
    print('%s -> %.3f' % (i,score))



plt.figure(figsize=(6, 4))

plt.plot(range(10,201,10), scores, color='black')
plt.xticks(range(0, 201, 50))

plt.xlabel('n_est')
plt.ylabel('rmse')

# plt.savefig('ch06-figures/06_random_forest_n_estimators.svg')

plt.show()

## Answer 3:  Stops improving at 80.

<br>
<br>
<br>

### Question 4
Let's select the best max_depth:<br>
<br>
Try different values of max_depth: [10, 15, 20, 25]<br>
For each of these values,<br>
try different values of n_estimators from 10 till 200 (with step 10)<br>
calculate the mean RMSE<br>
Fix the random seed: random_state=1<br>
What's the best max_depth, using the mean RMSE?<br>

10, 15, 20, 25

scores = []

for dep in [10, 15, 20, 25]:
    for i in range(10,201,10):
        rf = RandomForestRegressor(n_estimators=i, max_depth=dep, random_state=1, n_jobs=-1)
        rf.fit(X_train,y_train)
        y_pred = rf.predict(X_val)
        score = rmse(y_pred,y_val)
        scores.append(score)
        print('%s,%d -> %.3f' % (i,dep,score))

## Answer 4: 10 is the best max depth with values of 0.440.

<br>
<br>
<br>

### Question 5
We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?
vehicle_weight
horsepower
acceleration
engine_displacement

In [65]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_val)

In [66]:
importances = list(zip(dv.feature_names_, rf.feature_importances_))

df_importance = pd.DataFrame(importances, columns=['feature', 'gain'])
df_importance = df_importance.sort_values(by='gain', ascending=False)
df_importance

Unnamed: 0,feature,gain
13,vehicle_weight,0.958888
6,horsepower,0.016278
0,acceleration,0.011522
3,engine_displacement,0.00336
7,model_year,0.003292
8,num_cylinders,0.002303
9,num_doors,0.001503
12,origin=usa,0.000538
11,origin=europe,0.000496
10,origin=asia,0.000433


## Answer 5:  Vehicle Weight

<br>
<br>
<br>

### Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

Install XGBoost
Create DMatrix for train and validation
Create a watchlist
Train a model with these parameters for 100 rounds:

Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

0.3
0.1
Both give equal value

In [76]:
import xgboost as xgb

In [69]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=dv.feature_names_)

In [78]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [79]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [80]:
y_pred = model.predict(dval)
y_pred[:10]

array([18.539757, 15.225349, 18.259054, 14.496225, 13.254909, 16.027948,
       15.449861, 13.063313, 10.34965 , 15.793989], dtype=float32)

In [81]:
rmse(y_pred,y_val)

np.float64(0.426912357182547)

eta 0.1 -> 0.427<br>
eta 0.3 -> 0.447

## Answer 6:  An eta of 0.1 gives the best rmse of 0.427.



