In [1]:
import pandas as pd
import numpy as np

# Preparing the dataset

In [2]:
car_fuel_efficiency_df = pd.read_csv("car_fuel_efficiency.csv")[['engine_displacement',
                                                                 'horsepower',
                                                                 'vehicle_weight',
                                                                 'model_year',
                                                                 'fuel_efficiency_mpg']]
car_fuel_efficiency_df

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


# EDA
fuel_efficiency_mpg does not have a long tail.

In [3]:
car_fuel_efficiency_df['fuel_efficiency_mpg'].skew()

-0.012062219273507958

# Question 1
There's one column with missing values. What is it?

In [4]:
car_fuel_efficiency_df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

# Question 2
What's the median (50% percentile) for variable 'horsepower'?

In [5]:
car_fuel_efficiency_df['horsepower'].median()

149.0

# Prepare and split the dataset

In [6]:
np.random.seed(42)

n = len(car_fuel_efficiency_df)

n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - (n_train + n_val)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = car_fuel_efficiency_df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [7]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Question 3

Fill missing values with 0 and train a linear regression model without regularization

In [8]:
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values
# no need to compute log since data isn't long tail

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [9]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [10]:
def prepare_X(df):
    df = df.fillna(0)
    X = df.values
    return X
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

Use the validation dataset to evaluate the models and compare the RMSE

In [11]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [12]:
X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)

In [13]:
round(rmse(y_val, y_pred),2)

0.52

Fill missing value with the mean value and train a linear regression model without regularization

In [14]:
horsepower_mean = df_train['horsepower'].mean()
def prepare_X(df):
    df = df.fillna(horsepower_mean)
    X = df.values
    return X
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

In [15]:
y_pred = w_0 + X_train.dot(w)

Use the validation dataset to evaluate the models and compare the RMSE

In [16]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [17]:
X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)

In [18]:
round(rmse(y_val, y_pred),2)

0.46

# Question 4

In [19]:
def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [20]:
def prepare_X(df):
    df = df.fillna(0)
    X = df.values
    return X
X_train = prepare_X(df_train)

In [21]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r)
    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)
    print(round(rmse(y_val, y_pred),2))

0.52
0.52
0.52
0.52
0.52
0.52
0.52


# Question 5

In [22]:
#copy the code from question 4 and plug the seed value accordingly
def evaluate_seed(seed_value):
    np.random.seed(seed_value)
    
    n = len(car_fuel_efficiency_df)
    
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    n_test = n - (n_train + n_val)
    
    idx = np.arange(n)
    np.random.shuffle(idx)
    
    df_shuffled = car_fuel_efficiency_df.iloc[idx]
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    y_train = df_train['fuel_efficiency_mpg'].values
    y_val = df_val['fuel_efficiency_mpg'].values
    y_test = df_test['fuel_efficiency_mpg'].values

    
    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']

    def train_linear_regression(X, y):
        ones = np.ones(X.shape[0])
        X = np.column_stack([ones, X])
    
        XTX = X.T.dot(X)
        XTX_inv = np.linalg.inv(XTX)
        w = XTX_inv.dot(X.T).dot(y)
        
        return w[0], w[1:]
        
    def prepare_X(df):
        df = df.fillna(0)
        X = df.values
        return X
    X_train = prepare_X(df_train)
    w_0, w = train_linear_regression(X_train, y_train)

    def rmse(y, y_pred):
        error = y_pred - y
        mse = (error ** 2).mean()
        return np.sqrt(mse)

    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)

    return rmse(y_val, y_pred)

seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_list = [evaluate_seed(seed_value) for seed_value in seed_values]
round(np.std(rmse_list), 3)

0.007

# Question 6

In [23]:
np.random.seed(9)

n = len(car_fuel_efficiency_df)

n_train_val = int(0.8 * n)
n_test = n - n_train_val

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = car_fuel_efficiency_df.iloc[idx]

df_train_val = df_shuffled.iloc[:n_train_val].copy()
df_test = df_shuffled.iloc[n_train_val:].copy()

df_train_val = df_train_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train_val = df_train_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values


del df_train_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

def train_linear_regression_reg(X, y, r):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]
    
def prepare_X(df):
    df = df.fillna(0)
    X = df.values
    return X
X_train_val = prepare_X(df_train_val)
w_0, w = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

X_test = prepare_X(df_test)
y_pred = w_0 + X_test.dot(w)

rmse(y_test, y_pred)

0.515636544184229