In [1]:
# Basic data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt

# Scikit-learn tools for modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)

# Quick overview
print("Shape:", df.shape)
df.head()


Shape: (9704, 11)


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [3]:
from sklearn.feature_extraction import DictVectorizer

df = df.fillna(0)


df_full_train, df_temp = train_test_split(df, test_size=0.4, random_state=1)


df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)

# 3. Separate target from features
y_train = df_full_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values

X_train = df_full_train.drop('fuel_efficiency_mpg', axis=1)
X_val = df_val.drop('fuel_efficiency_mpg', axis=1)
X_test = df_test.drop('fuel_efficiency_mpg', axis=1)

# 4. Convert dataframes to dictionaries and vectorize
dv = DictVectorizer(sparse=True)

X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val = dv.transform(X_val.to_dict(orient='records'))
X_test = dv.transform(X_test.to_dict(orient='records'))

print("Train matrix shape:", X_train.shape)
print("Validation matrix shape:", X_val.shape)
print("Test matrix shape:", X_test.shape)


Train matrix shape: (5822, 14)
Validation matrix shape: (1941, 14)
Test matrix shape: (1941, 14)


In [4]:
# Train Decision Tree Regressor with max_depth=1
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# Show the feature used for the split
feature_index = dt.tree_.feature[0]  # root node split
feature_name = dv.feature_names_[feature_index]
print("Feature used for the split:", feature_name)


Feature used for the split: vehicle_weight


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# 1. Train Random Forest Regressor
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# 2. Predict on validation set
y_pred = rf.predict(X_val)

# 3. Compute RMSE manually
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print("Validation RMSE:", rmse)


Validation RMSE: 0.4602815367032658


In [7]:
n_estimators_values = range(10, 201, 10)
rmse_list = []

for n in n_estimators_values:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_list.append(rmse)

# Print results
for n, rmse in zip(n_estimators_values, rmse_list):
    print(f"n_estimators={n}, RMSE={rmse:.3f}")


n_estimators=10, RMSE=0.460
n_estimators=20, RMSE=0.446
n_estimators=30, RMSE=0.440
n_estimators=40, RMSE=0.438
n_estimators=50, RMSE=0.437
n_estimators=60, RMSE=0.436
n_estimators=70, RMSE=0.436
n_estimators=80, RMSE=0.436
n_estimators=90, RMSE=0.435
n_estimators=100, RMSE=0.435
n_estimators=110, RMSE=0.435
n_estimators=120, RMSE=0.435
n_estimators=130, RMSE=0.435
n_estimators=140, RMSE=0.435
n_estimators=150, RMSE=0.435
n_estimators=160, RMSE=0.435
n_estimators=170, RMSE=0.435
n_estimators=180, RMSE=0.435
n_estimators=190, RMSE=0.435
n_estimators=200, RMSE=0.435


In [8]:
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

mean_rmse_results = {}

for depth in max_depth_values:
    rmse_list = []
    for n in n_estimators_values:
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    mean_rmse = np.mean(rmse_list)
    mean_rmse_results[depth] = mean_rmse
    print(f"max_depth={depth}, mean RMSE={mean_rmse:.3f}")

# Find the best max_depth
best_depth = min(mean_rmse_results, key=mean_rmse_results.get)
print("\nBest max_depth based on mean RMSE:", best_depth)


max_depth=10, mean RMSE=0.436
max_depth=15, mean RMSE=0.438
max_depth=20, mean RMSE=0.438
max_depth=25, mean RMSE=0.438

Best max_depth based on mean RMSE: 10


In [None]:
# Train the Random Forest model with specified parameters
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Map feature importances to feature names
feature_importance_dict = dict(zip(dv.get_feature_names_out(), importances))

# Sort features by importance
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the most important feature
most_important_feature = sorted_features[0][0]
print("Most important feature:", most_important_feature)


In [9]:
!pip install xgboost --quiet
import xgboost as xgb


In [10]:
# Convert data to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'val')]


In [11]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 0
}

bst_03 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)


[0]	train-rmse:1.83282	val-rmse:1.82567
[1]	train-rmse:1.33231	val-rmse:1.32771
[2]	train-rmse:0.99034	val-rmse:0.99257
[3]	train-rmse:0.76090	val-rmse:0.76897
[4]	train-rmse:0.61110	val-rmse:0.62742
[5]	train-rmse:0.51643	val-rmse:0.54010
[6]	train-rmse:0.45800	val-rmse:0.48954
[7]	train-rmse:0.42172	val-rmse:0.46026
[8]	train-rmse:0.39836	val-rmse:0.44332
[9]	train-rmse:0.38494	val-rmse:0.43456
[10]	train-rmse:0.37400	val-rmse:0.43004
[11]	train-rmse:0.36596	val-rmse:0.42696
[12]	train-rmse:0.36050	val-rmse:0.42569
[13]	train-rmse:0.35549	val-rmse:0.42519
[14]	train-rmse:0.35143	val-rmse:0.42455
[15]	train-rmse:0.34792	val-rmse:0.42450
[16]	train-rmse:0.34533	val-rmse:0.42478
[17]	train-rmse:0.34356	val-rmse:0.42472
[18]	train-rmse:0.34129	val-rmse:0.42472
[19]	train-rmse:0.33846	val-rmse:0.42502
[20]	train-rmse:0.33724	val-rmse:0.42509
[21]	train-rmse:0.33463	val-rmse:0.42538
[22]	train-rmse:0.33260	val-rmse:0.42543
[23]	train-rmse:0.33023	val-rmse:0.42611
[24]	train-rmse:0.32738	va

In [12]:
xgb_params['eta'] = 0.1

bst_01 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)


[0]	train-rmse:2.31334	val-rmse:2.30592
[1]	train-rmse:2.09552	val-rmse:2.08865
[2]	train-rmse:1.90001	val-rmse:1.89221
[3]	train-rmse:1.72438	val-rmse:1.71766
[4]	train-rmse:1.56719	val-rmse:1.56150
[5]	train-rmse:1.42645	val-rmse:1.42157
[6]	train-rmse:1.30047	val-rmse:1.29580
[7]	train-rmse:1.18786	val-rmse:1.18468
[8]	train-rmse:1.08744	val-rmse:1.08657
[9]	train-rmse:0.99801	val-rmse:0.99964
[10]	train-rmse:0.91846	val-rmse:0.92183
[11]	train-rmse:0.84797	val-rmse:0.85324
[12]	train-rmse:0.78540	val-rmse:0.79241
[13]	train-rmse:0.73026	val-rmse:0.73968
[14]	train-rmse:0.68164	val-rmse:0.69327
[15]	train-rmse:0.63889	val-rmse:0.65351
[16]	train-rmse:0.60130	val-rmse:0.61854
[17]	train-rmse:0.56852	val-rmse:0.58847
[18]	train-rmse:0.53982	val-rmse:0.56232
[19]	train-rmse:0.51488	val-rmse:0.53952
[20]	train-rmse:0.49316	val-rmse:0.52039
[21]	train-rmse:0.47428	val-rmse:0.50442
[22]	train-rmse:0.45775	val-rmse:0.49005
[23]	train-rmse:0.44362	val-rmse:0.47827
[24]	train-rmse:0.43128	va

In [13]:

y_pred_03 = bst_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))

y_pred_01 = bst_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))

print("RMSE with eta=0.3:", rmse_03)
print("RMSE with eta=0.1:", rmse_01)


RMSE with eta=0.3: 0.42667344498698967
RMSE with eta=0.1: 0.41651145559432473
