In [1]:
# Q1 – Which feature does a DecisionTreeRegressor(max_depth=1) pick for its only split?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

# --- Data ---------------------------------------------------------------
# Read the CSV and replace every missing value with 0 in a single chain.
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
).fillna(0)

# Feature frame = everything except the target; target is kept as an array.
X_df = df.drop(columns=["fuel_efficiency_mpg"])
y = df["fuel_efficiency_mpg"].values

# Two chained splits: hold out 40% first, then halve that hold-out into
# validation and test (60/20/20 overall). Both splits use random_state=1.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_df, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# One dict per row. The vectorizer is fitted on the training rows only and
# then reused, unchanged, to transform the validation rows.
train_records = X_train.to_dict(orient="records")
val_records = X_val.to_dict(orient="records")
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_records)
X_val_dv = dv.transform(val_records)

# --- Model --------------------------------------------------------------
# max_depth=1 limits the tree to a single split.
tree = DecisionTreeRegressor(max_depth=1, random_state=1)
tree.fit(X_train_dv, y_train)

# --- Inspect the split --------------------------------------------------
# Entry 0 of tree_.feature is the column index used by the split; map it
# back to a column name through the vectorizer's feature names.
feature_names = dv.get_feature_names_out()
split_idx = tree.tree_.feature[0]
split_feature_name = feature_names[split_idx]
print("Split feature:", split_feature_name)

Split feature: vehicle_weight


In [13]:
# Q2 – RandomForestRegressor with n_estimators=10: RMSE on validation
import numpy as np  # needed for np.sqrt below; without it a fresh kernel raises NameError
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1) Load & prepare: read the CSV and replace missing values with 0
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
).fillna(0)
y = df["fuel_efficiency_mpg"].values
X_df = df.drop(columns=["fuel_efficiency_mpg"])

# 60/20/20 split: hold out 40%, then halve the hold-out into val and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X_df, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Fit the vectorizer on the training rows only, then reuse it for validation
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_dv = dv.transform(X_val.to_dict(orient="records"))

# 2) Train RF and evaluate
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_dv, y_train)
val_pred = rf.predict(X_val_dv)

# RMSE = sqrt(MSE); float() turns the numpy scalar into a plain Python float
rmse = float(np.sqrt(mean_squared_error(y_val, val_pred)))
print("Validation RMSE:", rmse)

Validation RMSE: 0.46028153670326594


In [14]:
# Q3 – RF: sweep n_estimators from 10 to 200 (step=10), find plateau point at 3 decimals
import numpy as np  # needed for np.sqrt below; without it a fresh kernel raises NameError
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1) Load & prepare: read the CSV and replace missing values with 0
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
).fillna(0)
y = df["fuel_efficiency_mpg"].values
X_df = df.drop(columns=["fuel_efficiency_mpg"])

# 60/20/20 split: hold out 40%, then halve the hold-out into val and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X_df, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Fit the vectorizer on the training rows only, then reuse it for validation
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_dv = dv.transform(X_val.to_dict(orient="records"))

# 2) Sweep n_estimators and collect RMSEs
# Each entry is (n_estimators, raw RMSE, RMSE rounded to 3 decimals).
results = []
# n_estimators=0 is only a placeholder: the loop sets the real size before
# every fit, and warm_start=True lets each fit extend the existing forest.
rf = RandomForestRegressor(n_estimators=0, random_state=1, n_jobs=-1, warm_start=True)
for n in range(10, 201, 10):
    rf.n_estimators = n  # grow the forest incrementally
    rf.fit(X_train_dv, y_train)
    rmse = float(np.sqrt(mean_squared_error(y_val, rf.predict(X_val_dv))))
    results.append((n, rmse, round(rmse, 3)))

# 3) Find the earliest n that reaches the best (lowest) 3-decimal RMSE.
# min() returns the first minimal element, so ties resolve to the smallest n.
best_row = min(results, key=lambda row: row[2])
plateau_after = best_row[0]
best_3 = best_row[2]

print("Per-step (n, rmse, rmse@3dec) =", results)
print("Earliest n reaching best 3-decimal RMSE:", plateau_after)

Per-step (n, rmse, rmse@3dec) = [(10, 0.4602815367032659, 0.46), (20, 0.4461567458911003, 0.446), (30, 0.4397780761280069, 0.44), (40, 0.4383939265191818, 0.438), (50, 0.43717032494674524, 0.437), (60, 0.4355914081920473, 0.436), (70, 0.4361123859130258, 0.436), (80, 0.43605455887808786, 0.436), (90, 0.43541008234407647, 0.435), (100, 0.43527736554786667, 0.435), (110, 0.43489681577046596, 0.435), (120, 0.43546652508605704, 0.435), (130, 0.4349233620666646, 0.435), (140, 0.4351068229164202, 0.435), (150, 0.43519106451533063, 0.435), (160, 0.4352369042756664, 0.435), (170, 0.4352077390021516, 0.435), (180, 0.43524040934995967, 0.435), (190, 0.43539799338117574, 0.435), (200, 0.43500312488894405, 0.435)]
Earliest n reaching best 3-decimal RMSE: 90


In [15]:
# Q4 – Mean validation RMSE over n_estimators=10..200 (step=10) for each max_depth in [10,15,20,25]
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# --- Data ---------------------------------------------------------------
# Read the CSV, then replace every missing value with 0.
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)
df = df.fillna(0)
X_df = df.drop(columns=["fuel_efficiency_mpg"])
y = df["fuel_efficiency_mpg"].values

# Hold out 40%, then halve the hold-out: 60/20/20 train/val/test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_df, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Vectorizer is fitted on training rows only and reused for validation rows.
train_records = X_train.to_dict(orient="records")
val_records = X_val.to_dict(orient="records")
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_records)
X_val_dv = dv.transform(val_records)


def _validation_rmse(model):
    """Return the RMSE of `model`'s predictions on the validation split as a float."""
    mse = mean_squared_error(y_val, model.predict(X_val_dv))
    return float(np.sqrt(mse))


# --- Grid ---------------------------------------------------------------
depths = [10, 15, 20, 25]
grid_n = list(range(10, 201, 10))

# For every depth, one warm-started forest is grown through all grid sizes;
# the per-size validation RMSEs are then averaged into a single score.
mean_rmse_by_depth = {}
for d in depths:
    rf = RandomForestRegressor(
        n_estimators=0, max_depth=d, random_state=1, n_jobs=-1, warm_start=True
    )
    rmses = []
    for n in grid_n:
        rf.n_estimators = n  # target forest size for the next fit
        rf.fit(X_train_dv, y_train)
        rmses.append(_validation_rmse(rf))
    mean_rmse_by_depth[d] = float(np.mean(rmses))

print("Mean RMSE by depth:", mean_rmse_by_depth)

# Depth with the lowest mean RMSE (first one wins on ties).
best_depth = min(mean_rmse_by_depth.items(), key=lambda item: item[1])[0]
print("Best max_depth:", best_depth)

Mean RMSE by depth: {10: 0.43624733022811624, 15: 0.4378245115127723, 20: 0.4376934354988413, 25: 0.43765343428485853}
Best max_depth: 10


In [9]:
# Q5 – Random forest feature importances: which of four candidate features ranks highest?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

# --- Data ---------------------------------------------------------------
# Read the CSV, then replace every missing value with 0.
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)
df = df.fillna(0)
X_df = df.drop(columns=["fuel_efficiency_mpg"])
y = df["fuel_efficiency_mpg"].values

# Hold out 40%, then halve the hold-out: 60/20/20 train/val/test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_df, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Vectorizer is fitted on training rows only and reused for validation rows.
train_records = X_train.to_dict(orient="records")
val_records = X_val.to_dict(orient="records")
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_records)
X_val_dv = dv.transform(val_records)

# --- Model --------------------------------------------------------------
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train_dv, y_train)

# --- Importances --------------------------------------------------------
# Pair each vectorized column name with the importance the forest gave it.
names = dv.get_feature_names_out()
imps = rf.feature_importances_
imp_map = dict(zip(names, imps))

candidates = ["vehicle_weight", "horsepower", "acceleration", "engine_displacement"]


def _importance(feature):
    """Importance recorded for `feature`; 0.0 when it is not a vectorized column."""
    return imp_map.get(feature, 0.0)


# Right-aligned table of the four candidates.
for c in candidates:
    print(f"{c:>20s} : {imp_map.get(c, 0.0):.6f}")

most_important = max(candidates, key=_importance)
print("\nMost important among the four:", most_important)

      vehicle_weight : 0.959878
          horsepower : 0.015933
        acceleration : 0.011442
 engine_displacement : 0.003159

Most important among the four: vehicle_weight


In [16]:
# Q6 – XGBoost: compare eta=0.3 vs 0.1 (100 rounds), report validation RMSE
import numpy as np  # needed for np.sqrt below; without it a fresh kernel raises NameError
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

# 1) Load & prepare: read the CSV and replace missing values with 0
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
).fillna(0)
y = df["fuel_efficiency_mpg"].values
X_df = df.drop(columns=["fuel_efficiency_mpg"])

# 60/20/20 split: hold out 40%, then halve the hold-out into val and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X_df, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Fit the vectorizer on the training rows only, then reuse it for validation
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(X_train.to_dict(orient="records"))
X_val_dv = dv.transform(X_val.to_dict(orient="records"))

# 2) DMatrix + watchlist
dtrain = xgb.DMatrix(X_train_dv, label=y_train)
dval = xgb.DMatrix(X_val_dv, label=y_val)
watchlist = [(dtrain, "train"), (dval, "val")]

# Settings shared by both runs; only eta differs between them.
base_params = {
    "max_depth": 6,
    "min_child_weight": 1,
    "objective": "reg:squarederror",
    "nthread": 8,
    "seed": 1,
    "verbosity": 1,
}


def _train_and_score(eta):
    """Train for 100 boosting rounds at the given eta.

    Returns a (params, booster, validation RMSE) tuple, where params is
    base_params plus the eta entry and the RMSE is a plain Python float.
    """
    params = dict(base_params, eta=eta)
    booster = xgb.train(
        params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False
    )
    score = float(np.sqrt(mean_squared_error(y_val, booster.predict(dval))))
    return params, booster, score


# 3) Same training/evaluation procedure for both learning rates
params_03, model_03, rmse_03 = _train_and_score(0.3)
params_01, model_01, rmse_01 = _train_and_score(0.1)

print(f"Validation RMSE (eta=0.3): {rmse_03:.6f}")
print(f"Validation RMSE (eta=0.1): {rmse_01:.6f}")

# Lower validation RMSE wins; an exact tie is reported explicitly.
if rmse_01 < rmse_03:
    answer = "0.1"
elif rmse_03 < rmse_01:
    answer = "0.3"
else:
    answer = "Both equal"
print("Answer:", answer)

Validation RMSE (eta=0.3): 0.443405
Validation RMSE (eta=0.1): 0.416743
Answer: 0.1
