In [8]:
!curl -o car_fuel_efficiency.csv https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  853k  100  853k    0     0  8811k      0 --:--:-- --:--:-- --:--:-- 8892k


In [9]:
import pandas as pd
# Read the CSV written by the curl cell into a DataFrame.
df = pd.read_csv('car_fuel_efficiency.csv')
# Preview the first five rows transposed, so every column is displayed as a row.
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
engine_displacement,170,130,170,220,210
num_cylinders,3.0,5.0,,4.0,1.0
horsepower,159.0,97.0,78.0,,140.0
vehicle_weight,3413.433759,3149.664934,3079.038997,2542.392402,3460.87099
acceleration,17.7,17.8,15.1,20.2,14.4
model_year,2003,2007,2018,2009,2009
origin,Europe,USA,Europe,USA,Europe
fuel_type,Gasoline,Gasoline,Gasoline,Diesel,Gasoline
drivetrain,All-wheel drive,Front-wheel drive,Front-wheel drive,All-wheel drive,All-wheel drive
num_doors,0.0,0.0,0.0,2.0,2.0


In [10]:
# Lower-case every column label.
df = df.rename(columns=str.lower)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Impute every missing value with 0 before splitting.
# NOTE(review): fillna(0) also writes 0 into any non-numeric column that has gaps;
# confirm that only numeric columns contain NaN.
df = df.fillna(0)

# 60/20/20 split: hold out 40% of the rows, then halve the hold-out into validation
# and test. The training frame keeps the name `df_train_full` because the
# vectorising cell reads it under that name.
df_train_full, df_holdout = train_test_split(df, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=1)

target = 'fuel_efficiency_mpg'

y_train = df_train_full[target].values
y_val = df_val[target].values
y_test = df_test[target].values

# Remove the target by rebinding each name to a new frame instead of dropping in
# place: the split results are derived from `df`, and in-place mutation of derived
# frames is the pattern pandas flags with SettingWithCopyWarning.
df_train_full = df_train_full.drop(columns=[target])
df_val = df_val.drop(columns=[target])
df_test = df_test.drop(columns=[target])

In [12]:
# Turn each frame into a list of {column: value} records, the input format
# DictVectorizer consumes.
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (df_train_full, df_val)
)

# Fit the vectoriser on the training records only; the validation records are
# transformed with that already-fitted vectoriser.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [13]:
from sklearn.tree import DecisionTreeRegressor

# Depth-1 tree: a single split, used to see which feature the root splits on.
# random_state is pinned to 1, like the other models in this notebook, because
# scikit-learn permutes the features at every split and the chosen split can vary
# between runs when several splits are equally good.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

In [14]:
# Look up the name of the feature tested at node 0 (the root) of the fitted tree.
feature_names = dv.get_feature_names_out()
root_feature_index = dt.tree_.feature[0]
split_feature = feature_names[root_feature_index]
print(split_feature)

vehicle_weight


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline forest: 10 trees, fixed seed, all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Score on the validation split: RMSE is the square root of the mean squared error.
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(rmse)

0.46106454647960676


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

scores = []

# Grow a single forest in steps of 10 trees instead of refitting from scratch for
# every size: warm_start keeps the trees that are already fitted and trains only
# the additional ones (200 trees in total rather than 10 + 20 + ... + 200).
# With a fixed integer random_state the forest at each size holds the same trees
# a fresh fit of that size would build, so the validation scores are unchanged.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
    warm_start=True
)

for n in range(10, 201, 10):
    rf.set_params(n_estimators=n)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))


for n, s in scores:
    print(f"n_estimators={n:3d}  ->  RMSE={s:.3f}")

n_estimators= 10  ->  RMSE=0.461
n_estimators= 20  ->  RMSE=0.447
n_estimators= 30  ->  RMSE=0.441
n_estimators= 40  ->  RMSE=0.440
n_estimators= 50  ->  RMSE=0.438
n_estimators= 60  ->  RMSE=0.436
n_estimators= 70  ->  RMSE=0.437
n_estimators= 80  ->  RMSE=0.437
n_estimators= 90  ->  RMSE=0.436
n_estimators=100  ->  RMSE=0.436
n_estimators=110  ->  RMSE=0.436
n_estimators=120  ->  RMSE=0.436
n_estimators=130  ->  RMSE=0.436
n_estimators=140  ->  RMSE=0.436
n_estimators=150  ->  RMSE=0.436
n_estimators=160  ->  RMSE=0.436
n_estimators=170  ->  RMSE=0.436
n_estimators=180  ->  RMSE=0.436
n_estimators=190  ->  RMSE=0.436
n_estimators=200  ->  RMSE=0.435


In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

max_depth_values = [10, 15, 20, 25]
results = []

for depth in max_depth_values:
    rmse_list = []
    # One warm-started forest per depth, grown in steps of 10 trees: each fit call
    # trains only the newly requested trees instead of rebuilding the whole forest.
    # With a fixed integer random_state the forest at each size holds the same
    # trees a fresh fit of that size would build, so the scores are unchanged.
    rf = RandomForestRegressor(
        n_estimators=10,
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
        warm_start=True
    )
    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    # Average the validation RMSE over all forest sizes tried for this depth.
    mean_rmse = np.mean(rmse_list)
    results.append((depth, mean_rmse))
    print(f"max_depth={depth:<2d}  ->  mean RMSE={mean_rmse:.3f}")

# `best_depth` is the (max_depth, mean RMSE) pair with the lowest mean RMSE.
best_depth = min(results, key=lambda x: x[1])
print("\n✅ Best max_depth:", best_depth[0])

max_depth=10  ->  mean RMSE=0.436
max_depth=15  ->  mean RMSE=0.438
max_depth=20  ->  mean RMSE=0.438
max_depth=25  ->  mean RMSE=0.438

✅ Best max_depth: 10


In [21]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Forest whose feature importances are inspected below.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# The importances are positionally aligned with the vectoriser's feature names.
feat_names = dv.get_feature_names_out()
importances = rf.feature_importances_

candidates = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']

# Pair every feature name with its importance, then keep only the candidates.
importance_by_name = dict(zip(feat_names, importances))
cand_scores = {name: importance_by_name[name] for name in candidates}

# Print the candidates from most to least important.
ranked = sorted(cand_scores.items(), key=lambda item: item[1], reverse=True)
for k, v in ranked:
    print(f'{k:20s} -> {v:.6f}')


best_feature = max(cand_scores, key=cand_scores.get)
print('\n✅ most important feature:', best_feature)

vehicle_weight       -> 0.959829
horsepower           -> 0.015943
acceleration         -> 0.011458
engine_displacement  -> 0.003190

✅ most important feature: vehicle_weight


In [22]:
!pip install -q xgboost

In [23]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error

# Wrap the training and validation matrices, each with its labels, in DMatrix
# objects for xgb.train / Booster.predict.
dtrain, dval = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

def run_xgb(eta, num_boost_round=100):
    """Train an XGBoost regressor with learning rate `eta` and score it on validation data.

    Reads the notebook-level `dtrain`, `dval` and `y_val`.

    Parameters
    ----------
    eta : float
        Learning rate, passed to XGBoost as the ``eta`` parameter.
    num_boost_round : int, default 100
        Number of boosting rounds to train.

    Returns
    -------
    float
        RMSE of the model's predictions on `dval` against `y_val` (NumPy scalar).
        The value is also printed.
    """
    params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1,
        'eval_metric': 'rmse'
    }
    # No watchlist is passed: the per-round evaluation was silenced and its results
    # were never read, so it only added work. The score is computed once below.
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f"eta={eta} -> RMSE={rmse:.3f}")
    return rmse

rmse_03 = run_xgb(0.3)
rmse_01 = run_xgb(0.1)

# The lower validation RMSE wins; if neither is strictly lower, report a tie.
if rmse_01 < rmse_03:
    best = "0.1"
elif rmse_03 < rmse_01:
    best = "0.3"
else:
    best = "Both give equal value"
print("✅ Best eta:", best)

eta=0.3 -> RMSE=0.443
eta=0.1 -> RMSE=0.417
✅ Best eta: 0.1
