In [26]:
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv('./car_fuel_efficiency.csv')

In [28]:
cols = ["engine_displacement", "horsepower", "vehicle_weight", "model_year", "fuel_efficiency_mpg"]

In [29]:
df = df[cols]

#### q1

In [30]:
df.isna().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

#### q2

In [31]:
df.horsepower.median()

np.float64(149.0)

#### q3

In [32]:
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
df_test = df_shuffled.iloc[n_train + n_val:].copy()

In [33]:
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [34]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [35]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [36]:
def prepare_X(df, fill_value=0):
    df_num = df.copy()
    df_num = df_num.fillna(fill_value)
    X = df_num.values
    return X

In [55]:
##### fill with 0

X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

In [56]:
X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)

In [57]:
rmse_with_0 = round(rmse(y_val, y_pred), 2)

In [58]:
##### fill with mean
fill_value = df_train.horsepower.mean()

X_train = prepare_X(df_train, fill_value)
w_0, w = train_linear_regression(X_train, y_train)

In [59]:
X_val = prepare_X(df_val, fill_value)
y_pred = w_0 + X_val.dot(w)

In [60]:
rmse_with_mean = round(rmse(y_val, y_pred), 2)

In [61]:
rmse_with_0 > rmse_with_mean

np.True_

In [62]:
rmse_with_0, rmse_with_mean

(np.float64(0.52), np.float64(0.46))

#### q4

In [63]:
r = [0, 0.01, 0.1, 1, 5, 10, 100]

In [64]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [69]:
best_r = 0
best_rmse = np.inf
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    if best_rmse < rmse(y_val, y_pred):
        best_r = r
    print('%6s' %r, round(rmse(y_val, y_pred), 2))

 1e-06 0.52
0.0001 0.52
 0.001 0.52
  0.01 0.52
   0.1 0.52
     1 0.52
     5 0.52
    10 0.52


In [72]:
best_r

0

#### q5

In [81]:
def split_data(df, seed):
    np.random.seed(seed)
    
    n = len(df)
    
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)
    
    idx = np.arange(n)
    np.random.shuffle(idx)
    
    df_shuffled = df.iloc[idx]
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()
    
    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values
    
    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']
    return df_train, df_val, y_train, y_val

In [82]:
rmses = []
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    df_train, df_val, y_train, y_val = split_data(df, seed)
    X_train = prepare_X(df_train)
    w_0, w = train_linear_regression(X_train, y_train)
    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)
    rmses.append(rmse(y_val, y_pred))

In [86]:
round(np.std(rmses), 3)

np.float64(0.007)

#### 6

In [88]:
def split_data(df, seed):
    np.random.seed(seed)
    
    n = len(df)
    
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)
    
    idx = np.arange(n)
    np.random.shuffle(idx)
    
    df_shuffled = df.iloc[idx]
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()
    
    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values
    
    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']
    return df_train, df_val, df_test, y_train, y_val, y_test

In [89]:
df_train, df_val, df_test, y_train, y_val, y_test = split_data(df, 9)

In [96]:
df_train = pd.concat([df_train, df_val])

In [97]:
y_train = np.concat([y_train, y_val])

In [98]:
X_train = prepare_X(df_train)
X_test = prepare_X(df_test)

In [99]:
w_0, w = train_linear_regression_reg(X_train, y_train, r=0.001)

In [100]:
y_pred = w_0 + X_test.dot(w)

In [101]:
rmse(y_test, y_pred)

np.float64(0.5156261299166541)