In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)


df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [9]:
df = df[['engine_displacement',
'horsepower',
'vehicle_weight',
'model_year',
'fuel_efficiency_mpg']]

In [9]:
df = df[['engine_displacement',
'horsepower',
'vehicle_weight',
'model_year',
'fuel_efficiency_mpg']]

In [12]:
#checking for the tail
df['fuel_efficiency_mpg'].iloc[-5:]

9699    15.101802
9700    17.962326
9701    17.186587
9702    15.331551
9703    14.884467
Name: fuel_efficiency_mpg, dtype: float64

In [16]:
#finding the missing value column
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [17]:
df.isna().sum()[df.isna().sum() > 0]


horsepower    708
dtype: int64

In [19]:
#median of the horsepower
df['horsepower'].median()

np.float64(149.0)

In [20]:
# setting up validation framework
n = len(df)

n_val = int(n*0.2)
n_test = int(n*0.2)
n_train = n - n_val -n_test


In [21]:
n

9704

In [22]:
n_val , n_test ,n_train

(1940, 1940, 5824)

In [27]:
df_test

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
7764,210,136.0,3001.227490,2004,14.272374
7765,290,142.0,3190.590332,2010,14.562889
7766,240,120.0,3518.444921,2008,11.872658
7767,220,141.0,1909.631801,2015,19.731391
7768,170,114.0,3825.442821,2009,10.344815
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [33]:
import numpy as np

In [34]:
idx = np.arange(n)


In [37]:
np.random.seed(2)
np.random.shuffle(idx)

In [38]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

In [43]:
from sklearn.model_selection import train_test_split

df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
246,170,164.0,2990.040917,2019,15.963019
8125,170,,2729.623741,2012,15.931964
1927,200,142.0,3126.513375,2019,14.284901
8235,200,148.0,3136.477901,2003,14.86521
424,230,141.0,3384.681613,2006,12.428822


In [44]:
len(df_train) , len(df_val) , len(df_test)

(5824, 1940, 1940)

In [45]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [47]:
y_train = np.log1p(df_train['fuel_efficiency_mpg'].values)

In [48]:
y_val = np.log1p(df_val['fuel_efficiency_mpg'].values)


In [None]:
# y_test = np.log1p(df_test['fuel_efficiency_mpg'].values) # Only for final testing

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
#del df_test['fuel_efficiency_mpg'] 


In [52]:
#listing the features 
base_features = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]


In [53]:
def prepare_X(df,fillna_value):
    """
    selects the fewatues and fills NAN values
    """
    df = df.copy()

    # Impute missing values for horsepower
    df['horsepower'] = df['horsepower'].fillna(fillna_value)

    #numpy matrix
    X = df[base_features].values
    return X
    

In [54]:
def train_linear_regression(X,y):
    # add columns of ones for bias term(w0)
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones,X])

    reg = 0.001
    XTX = X.T.dot(X)
    XTX = XTX +reg * np.eye(XTX.shape[0])

    # Calculate weights: w = (X^T X)^-1 X^T y
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    
    return w_full[0], w_full[1:] 

In [55]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [56]:
# SCENARIO 1: Fill missing 'horsepower' with 0

# 1. Prepare data and train model
X_train_0 = prepare_X(df_train, fillna_value=0)
w_0, w_array_0 = train_linear_regression(X_train_0, y_train)

# 2. Predict on validation set
X_val_0 = prepare_X(df_val, fillna_value=0)
y_pred_0 = w_0 + X_val_0.dot(w_array_0)

# 3. Calculate and round the RMSE
rmse_0 = rmse(y_val, y_pred_0)
rounded_rmse_0 = round(rmse_0, 2)
print(f"RMSE (Fill with 0): {rmse_0:.4f} -> Rounded: {rounded_rmse_0}")

RMSE (Fill with 0): 0.0395 -> Rounded: 0.04


In [57]:
# Calculate the mean ONLY from the training data for imputation
mean_horsepower = df_train['horsepower'].mean()
print(f"\nTraining Mean for 'horsepower': {mean_horsepower:.4f}")

# Prepare training data by filling missing data with mean
X_train_mean = prepare_X(df_train, fillna_value=mean_horsepower)
w_mean, w_array_mean = train_linear_regression(X_train_mean, y_train)

X_val_mean = prepare_X(df_val, fillna_value=mean_horsepower)
y_pred_mean = w_mean + X_val_mean.dot(w_array_mean)

rmse_mean = rmse(y_val, y_pred_mean)
rounded_rmse_mean = round(rmse_mean, 2)
print(f"RMSE (Fill with Mean): {rmse_mean:.4f} -> Rounded: {rounded_rmse_mean}")


Training Mean for 'horsepower': 149.5733
RMSE (Fill with Mean): 0.0364 -> Rounded: 0.04


In [60]:
# Question 3 : Both are equally good

In [62]:
X_train_q4 = prepare_X(df_train, fillna_value=0)
X_val_q4 = prepare_X(df_val, fillna_value=0)


array([[ 170.        ,  164.        , 2990.04091672, 2019.        ],
       [ 170.        ,    0.        , 2729.62374119, 2012.        ],
       [ 200.        ,  142.        , 3126.51337459, 2019.        ],
       ...,
       [ 260.        ,  169.        , 2701.62710304, 2002.        ],
       [ 240.        ,  137.        , 2383.8964603 , 2016.        ],
       [ 160.        ,    0.        , 2417.61065044, 2005.        ]])

In [64]:

def train_linear_regression_reg(X, y, r=0.0):
 
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    # 2. Calculate Gram matrix
    XTX = X.T.dot(X)
    
    reg_term = r * np.eye(XTX.shape[0])
    reg_term[0, 0] = 0 
    XTX = XTX + reg_term

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]


r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
best_rmse = float('inf')
best_r = -1

for r in r_values:
    w0, w = train_linear_regression_reg(X_train_q4, y_train, r=r)

    y_pred = w0 + X_val_q4.dot(w)
    score = rmse(y_val,y_pred)
    rounded_score = round(score,2)

    if rounded_score < best_rmse:
        best_rmse = rounded_score
        best_r = r
    elif rounded_score == best_rmse and r < best_r:
        best_r = r

    print(f"r={r:<6} | RMSE: {score:.4f} | Rounded: {rounded_score}")


r=0      | RMSE: 0.0394 | Rounded: 0.04
r=0.01   | RMSE: 0.0394 | Rounded: 0.04
r=0.1    | RMSE: 0.0394 | Rounded: 0.04
r=1      | RMSE: 0.0394 | Rounded: 0.04
r=5      | RMSE: 0.0394 | Rounded: 0.04
r=10     | RMSE: 0.0394 | Rounded: 0.04
r=100    | RMSE: 0.0394 | Rounded: 0.04


In [65]:
#question 4 answer is 0

In [66]:
seeds = range(10)
scores = []
for seed in seeds:
    # Shuffle and split using the current seed
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)
    
    # Prepare targets
    y_train = np.log1p(df_train['fuel_efficiency_mpg'].values)
    y_val = np.log1p(df_val['fuel_efficiency_mpg'].values)

    # Prepare feature matrices (fill missing horsepower with 0)
    X_train = prepare_X(df_train, fillna_value=0)
    X_val = prepare_X(df_val, fillna_value=0)
    
    # Train model (no regularization)
    w0, w = train_linear_regression(X_train, y_train)
    
    # Predict and evaluate on validation set
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    scores.append(score)

std = np.std(scores)
print("RMSE scores:", np.round(scores, 4))
print("Standard deviation of RMSEs:", round(std, 3))

RMSE scores: [0.038  0.0393 0.0395 0.0387 0.0373 0.0394 0.0389 0.0384 0.0402 0.0386]
Standard deviation of RMSEs: 0.001


In [70]:

np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

df_train_full = pd.concat([df_train, df_val]).reset_index(drop=True)

y_train_full = np.log1p(df_train_full['fuel_efficiency_mpg'].values)
y_test = np.log1p(df_test['fuel_efficiency_mpg'].values)

del df_train_full['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

X_train_full = prepare_X(df_train_full, fillna_value=0)
X_test = prepare_X(df_test, fillna_value=0)

r = 0.001
w0, w = train_linear_regression_reg(X_train_full, y_train_full, r=r)

y_pred = w0 + X_test.dot(w)

y_test_orig = np.expm1(y_test)
y_pred_orig = np.expm1(y_pred)

rmse_test_orig = rmse(y_test_orig, y_pred_orig)
print(round(rmse_test_orig, 3))



0.607
