### Preparing the dataset 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score
#!pip install xgboost
import xgboost as xgb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [6]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(data)
print(df.head())

   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3         

In [None]:
df = df.fillna(0)
print(df.columns[df.isna().any()])

Index(['num_cylinders', 'horsepower', 'acceleration', 'num_doors'], dtype='object')
Index([], dtype='object')


In [9]:
df.describe().round()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,200.0,4.0,150.0,3001.0,15.0,2011.0,-0.0,15.0
std,49.0,2.0,30.0,498.0,3.0,7.0,1.0,3.0
min,10.0,0.0,37.0,953.0,6.0,2000.0,-4.0,6.0
25%,170.0,3.0,130.0,2666.0,13.0,2006.0,-1.0,13.0
50%,200.0,4.0,149.0,2993.0,15.0,2012.0,0.0,15.0
75%,230.0,5.0,170.0,3335.0,17.0,2017.0,1.0,17.0
max,380.0,13.0,271.0,4739.0,24.0,2023.0,4.0,26.0


In [43]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [44]:
y_train = df_train['fuel_efficiency_mpg'].astype(float).values
y_val   = df_val['fuel_efficiency_mpg'].astype(float).values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)


### Question 1

In [47]:
# 3) Regressor
dt = DecisionTreeRegressor(random_state=1, max_depth=1) 
dt.fit(X_train, y_train)

# 4) Predictions + metrics
y_pred = dt.predict(X_val)

rmse = mean_squared_error(y_val, y_pred)
mae  = mean_absolute_error(y_val, y_pred)
r2   = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")

RMSE: 2.594 | MAE: 1.284 | R²: 0.614


In [48]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



### Question 2

In [55]:
dt = RandomForestRegressor(max_depth=1,
                           n_estimators=10,
                           random_state=1) 

dt.fit(X_train, y_train)

y_pred = dt.predict(X_val)

rmse = mean_squared_error(y_val, y_pred)
mae  = mean_absolute_error(y_val, y_pred)
r2   = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")

RMSE: 2.397 | MAE: 1.216 | R²: 0.644


### Question 3

In [None]:
scores = []
for n in [10, 20, 50, 100, 200]:
    rf = RandomForestRegressor(
        n_estimators=n,
        max_depth=1,
        random_state=1,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred)
    mae  = mean_absolute_error(y_val, y_pred)
    r2   = r2_score(y_val, y_pred)

    scores.append((n, rmse, mae, r2))
    print(f"n={n:3d} -> RMSE={rmse:.3f} | MAE={mae:.3f} | R²={r2:.3f}")


n= 10 -> RMSE=2.397 | MAE=1.216 | R²=0.644
n= 20 -> RMSE=2.399 | MAE=1.217 | R²=0.643
n= 50 -> RMSE=2.446 | MAE=1.232 | R²=0.636
n=100 -> RMSE=2.446 | MAE=1.233 | R²=0.636
n=200 -> RMSE=2.468 | MAE=1.240 | R²=0.633


### Question 4

In [64]:
depths = [10, 15, 20, 25]
estimators = range(10, 201, 10)

results = []

for depth in depths:
    rmses = []
    for n in estimators: 
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        rmse = mean_squared_error(y_val, y_pred)
        rmses.append(rmse)

    mean_rmse = np.mean(rmses)
    results.append((depth, mean_rmse))
    print(f"max_depth={depth:2d} -> mean RMSE={mean_rmse:.4f}")


max_depth=10 -> mean RMSE=0.1957
max_depth=15 -> mean RMSE=0.1981
max_depth=20 -> mean RMSE=0.1986
max_depth=25 -> mean RMSE=0.1986


### Question 5

In [66]:
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train, y_train)

feature_names = np.array(dv.get_feature_names_out())
importances = rf.feature_importances_

raw_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
print("Top one-hot features:")
print(raw_imp.head(15))

Top one-hot features:
vehicle_weight                  0.959162
horsepower                      0.016040
acceleration                    0.011471
engine_displacement             0.003269
model_year                      0.003182
num_cylinders                   0.002359
num_doors                       0.001591
origin=USA                      0.000555
origin=Europe                   0.000520
origin=Asia                     0.000476
drivetrain=All-wheel drive      0.000382
fuel_type=Diesel                0.000344
fuel_type=Gasoline              0.000337
drivetrain=Front-wheel drive    0.000312
dtype: float64


### Question 6

In [74]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = rf.predict(X_val)
rmse = mean_squared_error(y_val, y_pred)
print("RMSE=", round(rmse,2), "eta=0.3")

xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = rf.predict(X_val)
rmse = mean_squared_error(y_val, y_pred)
print("RMSE=", round(rmse,2), "eta=0.1")

RMSE= 0.21 eta=0.3
RMSE= 0.21 eta=0.1
