In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings

In [14]:
pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [4]:
df = df.fillna(0)

In [5]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)


df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [6]:
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')

dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

In [7]:
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

feature_names = dv.get_feature_names_out()
tree_feature_index = dt.tree_.feature[0]

if tree_feature_index >= 0:  # Valid feature index
    tree_feature = feature_names[tree_feature_index]
    print(f"Feature used for splitting: {tree_feature}")
else:
    print("No feature found for splitting")
    tree_feature = "unknown"

Feature used for splitting: vehicle_weight


In [8]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(rmse)

0.4595777223092726


In [9]:
best_rmse = float('inf')
best_n = 0
stopping_point = None
improvements = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    
    improvements.append((n, rmse))
    print(f"n_estimators={n:3d}: RMSE = {rmse:.3f}")
    
    # Check if improvement is significant (considering 3 decimal places)
    if rmse < best_rmse - 0.0005:
        if best_rmse != float('inf'):
            print(f"  → Improvement from {best_rmse:.3f} to {rmse:.3f}")
        best_rmse = rmse
        best_n = n
        stopping_point = n
    elif stopping_point is None:
        stopping_point = n

print(f"\nBest RMSE: {best_rmse:.3f} at n_estimators={best_n}")
print(f"RMSE stops improving after n_estimators = {stopping_point}")

n_estimators= 10: RMSE = 0.460
n_estimators= 20: RMSE = 0.454
  → Improvement from 0.460 to 0.454
n_estimators= 30: RMSE = 0.452
  → Improvement from 0.454 to 0.452
n_estimators= 40: RMSE = 0.449
  → Improvement from 0.452 to 0.449
n_estimators= 50: RMSE = 0.447
  → Improvement from 0.449 to 0.447
n_estimators= 60: RMSE = 0.445
  → Improvement from 0.447 to 0.445
n_estimators= 70: RMSE = 0.445
n_estimators= 80: RMSE = 0.445
n_estimators= 90: RMSE = 0.445
  → Improvement from 0.445 to 0.445
n_estimators=100: RMSE = 0.445
n_estimators=110: RMSE = 0.444
  → Improvement from 0.445 to 0.444
n_estimators=120: RMSE = 0.444
n_estimators=130: RMSE = 0.444
n_estimators=140: RMSE = 0.443
n_estimators=150: RMSE = 0.443
  → Improvement from 0.444 to 0.443
n_estimators=160: RMSE = 0.443
n_estimators=170: RMSE = 0.443
n_estimators=180: RMSE = 0.442
  → Improvement from 0.443 to 0.442
n_estimators=190: RMSE = 0.442
n_estimators=200: RMSE = 0.442

Best RMSE: 0.442 at n_estimators=180
RMSE stops improvi

In [13]:
rf_importance = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_importance.fit(X_train, y_train)

importances = rf_importance.feature_importances_
feature_names = dv.get_feature_names_out()

feature_importance_dict = dict(zip(feature_names, importances))

sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_features[:10]:
    print(feature + ': ' + str(round(importance,4)))

vehicle_weight: 0.9591
horsepower: 0.016
acceleration: 0.0115
engine_displacement: 0.0033
model_year: 0.0032
num_cylinders: 0.0023
num_doors: 0.0016
origin=USA: 0.0005
origin=Europe: 0.0005
origin=Asia: 0.0005


In [15]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'val')]

# Test eta=0.3
xgb_params_03 = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 0,
}

model_03 = xgb.train(xgb_params_03, dtrain, num_boost_round=100, 
                   evals=watchlist, verbose_eval=False)
y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
print(f"RMSE with eta=0.3: {rmse_03:.3f}")

xgb_params_01 = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 0,
}

model_01 = xgb.train(xgb_params_01, dtrain, num_boost_round=100, 
                   evals=watchlist, verbose_eval=False)
y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))
print(f"RMSE with eta=0.1: {rmse_01:.3f}")

RMSE with eta=0.3: 0.450
RMSE with eta=0.1: 0.426
