In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error


In [2]:
df = pd.read_csv('car_fuel_efficiency.csv')

In [3]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
df = df.fillna(0)
df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

In [5]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [6]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [7]:
y_train = df_train['fuel_efficiency_mpg']
y_val = df_val['fuel_efficiency_mpg']
y_test = df_test['fuel_efficiency_mpg']

In [8]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [9]:
train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [10]:
X_train

array([[1.39000000e+01, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 2.96667950e+03],
       [1.71000000e+01, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.95082212e+03],
       [1.74000000e+01, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 3.07822167e+03],
       ...,
       [1.67000000e+01, 1.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 3.04196459e+03],
       [1.52000000e+01, 1.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 2.45334143e+03],
       [1.41000000e+01, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 2.83389943e+03]], shape=(5822, 14))

In [11]:
#Question 1

In [12]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)
 
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = dt.predict(X_val)
y_pred

array([16.88218854, 16.88218854, 16.88218854, ..., 12.9383797 ,
       12.9383797 , 16.88218854], shape=(1941,))

In [13]:
names = dv.get_feature_names_out().tolist()
print(export_text(dt, feature_names=names))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [14]:
#Question 2

In [15]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
y_pred

array([18.62889858, 15.29598647, 18.22879442, ..., 14.80283652,
       13.49358341, 15.99288211], shape=(1941,))

In [16]:
root_mean_squared_error(y_val, y_pred)

0.4599777557336148

In [17]:
#Question 3

In [18]:
scores = []

for n in np.arange(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    score = round(root_mean_squared_error(y_val, y_pred),3)
    scores.append((n, score))
scores

[(np.int64(10), 0.46),
 (np.int64(20), 0.454),
 (np.int64(30), 0.451),
 (np.int64(40), 0.448),
 (np.int64(50), 0.446),
 (np.int64(60), 0.445),
 (np.int64(70), 0.445),
 (np.int64(80), 0.445),
 (np.int64(90), 0.445),
 (np.int64(100), 0.444),
 (np.int64(110), 0.443),
 (np.int64(120), 0.444),
 (np.int64(130), 0.443),
 (np.int64(140), 0.443),
 (np.int64(150), 0.443),
 (np.int64(160), 0.443),
 (np.int64(170), 0.443),
 (np.int64(180), 0.442),
 (np.int64(190), 0.443),
 (np.int64(200), 0.443)]

In [19]:
#Question 4

In [20]:
scores = []

for n in np.arange(10, 201, 10):
    for depth in [10, 15, 20, 25]:
        rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1, max_depth=depth)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        score = round(root_mean_squared_error(y_val, y_pred),3)
        scores.append((n, depth, score))
scores

[(np.int64(10), 10, 0.452),
 (np.int64(10), 15, 0.457),
 (np.int64(10), 20, 0.459),
 (np.int64(10), 25, 0.46),
 (np.int64(20), 10, 0.449),
 (np.int64(20), 15, 0.452),
 (np.int64(20), 20, 0.453),
 (np.int64(20), 25, 0.454),
 (np.int64(30), 10, 0.446),
 (np.int64(30), 15, 0.45),
 (np.int64(30), 20, 0.451),
 (np.int64(30), 25, 0.451),
 (np.int64(40), 10, 0.444),
 (np.int64(40), 15, 0.448),
 (np.int64(40), 20, 0.448),
 (np.int64(40), 25, 0.448),
 (np.int64(50), 10, 0.443),
 (np.int64(50), 15, 0.446),
 (np.int64(50), 20, 0.446),
 (np.int64(50), 25, 0.446),
 (np.int64(60), 10, 0.442),
 (np.int64(60), 15, 0.445),
 (np.int64(60), 20, 0.445),
 (np.int64(60), 25, 0.446),
 (np.int64(70), 10, 0.442),
 (np.int64(70), 15, 0.444),
 (np.int64(70), 20, 0.444),
 (np.int64(70), 25, 0.445),
 (np.int64(80), 10, 0.442),
 (np.int64(80), 15, 0.444),
 (np.int64(80), 20, 0.445),
 (np.int64(80), 25, 0.445),
 (np.int64(90), 10, 0.442),
 (np.int64(90), 15, 0.444),
 (np.int64(90), 20, 0.445),
 (np.int64(90), 25, 0.

In [21]:
df_scores = pd.DataFrame(scores, columns=['n_estimators','max_depth','rmse'])

In [22]:
df_scores.sort_values('rmse')

Unnamed: 0,n_estimators,max_depth,rmse
60,160,10,0.440
56,150,10,0.440
68,180,10,0.440
64,170,10,0.440
72,190,10,0.440
...,...,...,...
6,20,20,0.453
7,20,25,0.454
1,10,15,0.457
2,10,20,0.459


In [23]:
#Question 5

In [24]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, max_depth=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
round(root_mean_squared_error(y_val, y_pred),3)


0.459

In [25]:
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

In [26]:
feature_importance_df

Unnamed: 0,feature,importance
13,vehicle_weight,0.959162
6,horsepower,0.01604
0,acceleration,0.011471
3,engine_displacement,0.003269
7,model_year,0.003182
8,num_cylinders,0.002359
9,num_doors,0.001591
12,origin=USA,0.000555
11,origin=Europe,0.00052
10,origin=Asia,0.000476


In [27]:
#Question 6

In [28]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [29]:
import xgboost as xgb

In [45]:
features = dv.get_feature_names_out().tolist()
features

['acceleration',
 'drivetrain=All-wheel drive',
 'drivetrain=Front-wheel drive',
 'engine_displacement',
 'fuel_type=Diesel',
 'fuel_type=Gasoline',
 'horsepower',
 'model_year',
 'num_cylinders',
 'num_doors',
 'origin=Asia',
 'origin=Europe',
 'origin=USA',
 'vehicle_weight']

In [46]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [65]:
watchlist = [(dtrain, 'train'), (dval, 'val')]
scores = {}

In [66]:
def parse_xgb_output(output):
    results = []
 
    for line in output.stdout.strip().split('\n'):
        it_line, train_line, val_line = line.split('\t')
 
        it = int(it_line.strip('[]'))
        train = float(train_line.split(':')[1])
        val = float(val_line.split(':')[1])
 
        results.append((it, train, val))
     
    columns = ['num_iter', 'train_rmse', 'val_rmse']
    df_results = pd.DataFrame(results, columns=columns)
    return df_results

In [67]:
%%capture output

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'rmse',
    
    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params, dtrain, num_boost_round=100, verbose_eval=5, evals=watchlist)

In [68]:
key = 'eta=%s' % (xgb_params['eta'])
scores[key] = parse_xgb_output(output)
key

'eta=0.3'

In [69]:
%%capture output

xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'rmse',
    
    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params, dtrain, num_boost_round=100, verbose_eval=5, evals=watchlist)

In [70]:
key = 'eta=%s' % (xgb_params['eta'])
scores[key] = parse_xgb_output(output)
key

'eta=0.1'

In [71]:
scores['eta=0.3'].sort_values('val_rmse', ascending=True)

Unnamed: 0,num_iter,train_rmse,val_rmse
3,15,0.34666,0.43362
4,20,0.33553,0.43376
5,25,0.32268,0.43683
6,30,0.31475,0.43752
7,35,0.3096,0.43784
2,10,0.37115,0.43896
8,40,0.30202,0.43968
9,45,0.29126,0.44024
10,50,0.28456,0.4414
11,55,0.27618,0.44225


In [72]:
scores['eta=0.1'].sort_values('val_rmse', ascending=True)

Unnamed: 0,num_iter,train_rmse,val_rmse
11,55,0.3348,0.42449
12,60,0.33054,0.42456
13,65,0.32602,0.42493
10,50,0.33998,0.42498
14,70,0.32202,0.42503
15,75,0.31895,0.42526
16,80,0.31667,0.42563
17,85,0.3144,0.42574
18,90,0.31059,0.42586
9,45,0.34621,0.42595
