In [20]:
import pandas as pd

# Load the car fuel-efficiency dataset directly from GitHub.
df = pd.read_csv(
    'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
)

# Column names: lower-case, with underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-typed column.
strings = list(df.dtypes[df.dtypes == 'object'].index)
for col in strings:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

# Keep only the columns used in the rest of the notebook.
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = df[cols]

# Missing-value count per column (displayed as the cell output).
df.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [22]:
# Median horsepower over the whole dataset; pandas skips the missing values.
median_hp = df.loc[:, 'horsepower'].median()
print(median_hp)

149.0


In [30]:
import numpy as np
# 60/20/20 split sizes; the test part absorbs whatever the int() truncation leaves over.
n = len(df)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - (n_train + n_val)

# Shuffle the row positions reproducibly.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

# Cut the shuffled positions at the train/validation and validation/test boundaries.
idx_train, idx_val, idx_test = np.split(idx, [n_train, n_train + n_val])
df_train = df.iloc[idx_train]
df_val = df.iloc[idx_val]
df_test = df.iloc[idx_test]

print(n)
print(df_train.shape, df_val.shape, df_test.shape)

9704
(5822, 11) (1940, 11) (1942, 11)


In [39]:
# Name of the column being predicted and the columns used as model inputs.
target = 'fuel_efficiency_mpg'
features = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]


def prepare_X(df_):
    """Build the design matrix for ``df_``.

    Takes the ``features`` columns of ``df_`` (in that order) and prepends a
    column of ones, so the first weight of a fitted model acts as the bias.

    Returns a 2-D array with one row per row of ``df_`` and
    ``len(features) + 1`` columns.
    """
    feature_matrix = df_[features].values
    bias_column = np.ones((feature_matrix.shape[0], 1))
    return np.hstack([bias_column, feature_matrix])
def train_linear_regression(X, y):
    """Fit ordinary least squares and return the weight vector.

    Solves the normal equations ``(X^T X) w = X^T y`` for ``w``.

    Parameters
    ----------
    X : 2-D array, one row per sample (including any bias column).
    y : 1-D array of targets, one per row of ``X``.

    Returns
    -------
    1-D array with one weight per column of ``X``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    XTX = X.T.dot(X)
    XTy = X.T.dot(y)
    # Solve the linear system directly rather than forming the explicit
    # inverse and multiplying by it: fewer operations and less round-off.
    return np.linalg.solve(XTX, XTy)
def predict(X, w):
    """Return the linear model's predictions: the product of ``X`` and ``w``."""
    predictions = X.dot(w)
    return predictions
def rmse(y, y_pred):
    """Root mean squared error between targets ``y`` and predictions ``y_pred``."""
    residuals = y - y_pred
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)
    
# The mean is computed on the training split only and reused for validation.
hp_mean = df_train['horsepower'].mean()

# --- Strategy 1: fill missing horsepower with 0 ---
df_train_0 = df_train.assign(horsepower=df_train['horsepower'].fillna(0))
df_val_0 = df_val.assign(horsepower=df_val['horsepower'].fillna(0))

X_train_0, y_train_0 = prepare_X(df_train_0), df_train_0[target].values
X_val_0, y_val_0 = prepare_X(df_val_0), df_val_0[target].values

w_0 = train_linear_regression(X_train_0, y_train_0)
y_pred_0 = predict(X_val_0, w_0)
rmse_0 = rmse(y_val_0, y_pred_0)

# --- Strategy 2: fill missing horsepower with the training mean ---
df_train_m = df_train.assign(horsepower=df_train['horsepower'].fillna(hp_mean))
df_val_m = df_val.assign(horsepower=df_val['horsepower'].fillna(hp_mean))

X_train_m, y_train_m = prepare_X(df_train_m), df_train_m[target].values
X_val_m, y_val_m = prepare_X(df_val_m), df_val_m[target].values

w_m = train_linear_regression(X_train_m, y_train_m)
y_pred_m = predict(X_val_m, w_m)
rmse_m = rmse(y_val_m, y_pred_m)

# Validation RMSE for zero-fill, then for mean-fill.
print(round(rmse_0, 2))
print(round(rmse_m, 2))

0.52
0.46


In [41]:

# The ridge experiments use the zero-fill strategy for missing horsepower.
df_train_r = df_train.assign(horsepower=df_train['horsepower'].fillna(0))
df_val_r = df_val.assign(horsepower=df_val['horsepower'].fillna(0))

# Design matrices and targets for the train and validation splits.
X_tr, y_tr = prepare_X(df_train_r), df_train_r[target].values
X_va, y_va = prepare_X(df_val_r), df_val_r[target].values


def train_ridge(X, y, r):
    """Fit ridge regression with penalty strength ``r`` and return the weights.

    Solves ``(X^T X + r * P) w = X^T y`` where ``P`` is the identity matrix
    with its top-left entry zeroed, so the first coefficient is not penalised.
    """
    gram = X.T @ X
    penalty = np.eye(gram.shape[0])
    penalty[0, 0] = 0.0  # leave the first coefficient unregularised
    rhs = X.T @ y
    return np.linalg.solve(gram + r * penalty, rhs)

# Regularisation strengths to compare on the validation split.
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
results = []

# Record (r, validation RMSE rounded to 2 decimals) for every strength.
for r in r_list:
    w = train_ridge(X_tr, y_tr, r)
    y_hat = predict(X_va, w)
    score = rmse(y_va, y_hat)
    results.append((r, round(score, 2)))

for r, s in results:
    print(f"r={r:<6} RMSE={s:.2f}")

# Lowest rounded RMSE wins; ties are broken in favour of the smallest r.
best_r, best_rmse = sorted(results, key=lambda pair: (pair[1], pair[0]))[0]
print(best_r, best_rmse)

r=0      RMSE=0.52
r=0.01   RMSE=0.52
r=0.1    RMSE=0.52
r=1      RMSE=0.52
r=5      RMSE=0.52
r=10     RMSE=0.52
r=100    RMSE=0.52
0 0.52


In [43]:
def rmse_for_seed(seed):
    """Validation RMSE of the zero-filled linear model for one shuffle seed.

    Shuffles the rows of ``df`` with ``seed``, takes the first 60% as train
    and the next 20% as validation, fills missing horsepower with 0, fits
    the unregularised linear regression on train and scores it on validation.

    Every intermediate stays local to this function, so the notebook-level
    ``df_train`` / ``df_val`` / ``X_tr`` / ``w`` are not overwritten by the
    seed loop.
    """
    n = len(df)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)

    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    train = df.iloc[idx[:n_train]].copy()
    val = df.iloc[idx[n_train:n_train + n_val]].copy()

    train['horsepower'] = train['horsepower'].fillna(0)
    val['horsepower'] = val['horsepower'].fillna(0)

    weights = train_linear_regression(
        prepare_X(train), train['fuel_efficiency_mpg'].values
    )
    predictions = predict(prepare_X(val), weights)
    return rmse(val['fuel_efficiency_mpg'].values, predictions)


seeds = list(range(10))
rmse_scores = [rmse_for_seed(s) for s in seeds]

# Spread of the validation RMSE across seeds.
std_val = np.std(rmse_scores)
# Convert to plain floats so the list prints as numbers rather than np.float64(...) reprs.
print([round(float(x), 2) for x in rmse_scores])
print(round(std_val, 3))

[np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.52), np.float64(0.51), np.float64(0.53), np.float64(0.53), np.float64(0.51), np.float64(0.51), np.float64(0.51)]
0.007


In [44]:
seed = 9

# 60/20/20 split sizes; the test part absorbs whatever the int() truncation leaves over.
n = len(df)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - (n_train + n_val)

# Shuffle the row positions reproducibly with the chosen seed.
np.random.seed(seed)
idx = np.arange(n)
np.random.shuffle(idx)

idx_train, idx_val, idx_test = np.split(idx, [n_train, n_train + n_val])
df_train = df.iloc[idx_train].copy()
df_val = df.iloc[idx_val].copy()
df_test = df.iloc[idx_test].copy()

# Train and validation rows together form the final training set;
# missing horsepower is filled with 0 in both the training and test frames.
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.assign(
    horsepower=df_full_train['horsepower'].fillna(0)
)
df_test = df_test.assign(horsepower=df_test['horsepower'].fillna(0))

X_full_train, y_full_train = (
    prepare_X(df_full_train),
    df_full_train['fuel_efficiency_mpg'].values,
)
X_test, y_test = prepare_X(df_test), df_test['fuel_efficiency_mpg'].values

# NOTE: same definition as the train_ridge in the earlier ridge cell; it is
# repeated here so this cell does not depend on that one having been run.
def train_ridge(X, y, r):
    """Fit ridge regression with penalty strength ``r`` and return the weights.

    Solves ``(X^T X + r * P) w = X^T y`` where ``P`` is the identity matrix
    with its top-left entry zeroed, so the first coefficient is not penalised.
    """
    gram = X.T @ X
    penalty = np.eye(gram.shape[0])
    penalty[0, 0] = 0.0  # leave the first coefficient unregularised
    rhs = X.T @ y
    return np.linalg.solve(gram + r * penalty, rhs)

# Final model: ridge with r=0.001 fitted on train+validation,
# scored on the held-out test split.
w = train_ridge(X=X_full_train, y=y_full_train, r=0.001)

y_pred = predict(X=X_test, w=w)
score = rmse(y=y_test, y_pred=y_pred)

print(round(score, 3))

0.515
