In [1]:
import pandas as pd

# Load the raw car fuel-efficiency dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv("car_fuel_efficiency.csv")

# A bare last expression lets the notebook render the preview through its
# rich display instead of the plain-text dump produced by print().
df.head()

   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3         

In [3]:
# Keep only the columns used by the rest of the analysis: the four
# predictors and the target (fuel_efficiency_mpg).
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = df[cols]

In [4]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()


engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [5]:
# Median horsepower; pandas skips missing values by default.
df.loc[:, 'horsepower'].median()


149.0

In [6]:
from sklearn.model_selection import train_test_split

# Shuffle all rows once with a fixed seed so the split is reproducible.
# NOTE: this rebinds `df`, so re-running the cell shuffles an already
# shuffled frame; later cells that call df.sample(...) start from this order.
df = df.sample(frac=1, random_state=42)

# 60% / 20% / 20% partition sizes; the test part absorbs any rounding remainder.
n = len(df)
n_train, n_val = int(0.6 * n), int(0.2 * n)
n_test = n - n_train - n_val

# Consecutive positional slices of the shuffled frame.
df_train, df_val, df_test = (
    df.iloc[:n_train],
    df.iloc[n_train:n_train + n_val],
    df.iloc[n_train + n_val:],
)



In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Predictor columns passed to every model; the target column is not included.
features = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]

def train_linear_regression(X, y):
    """Fit an ordinary least-squares model on (X, y) and return it.

    Parameters
    ----------
    X : feature matrix accepted by ``LinearRegression.fit``.
    y : target values aligned with the rows of ``X``.

    Returns
    -------
    The fitted ``LinearRegression`` estimator.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return LinearRegression().fit(X, y)

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Parameters
    ----------
    y : true target values.
    y_pred : predicted values, same length as ``y``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Feature matrix and target vector for the training and validation partitions.
X_train, y_train = df_train[features], df_train['fuel_efficiency_mpg']
X_val, y_val = df_val[features], df_val['fuel_efficiency_mpg']

# Strategy 1: replace every missing value with 0.
X_train_0, X_val_0 = X_train.fillna(0), X_val.fillna(0)
model_0 = train_linear_regression(X_train_0, y_train)
rmse_0 = rmse(y_val, model_0.predict(X_val_0))

# Strategy 2: replace missing horsepower with its mean. The mean comes from
# the training partition only, so no validation information leaks into it.
mean_hp = X_train['horsepower'].mean()
X_train_mean, X_val_mean = (
    part.fillna({'horsepower': mean_hp}) for part in (X_train, X_val)
)
model_mean = train_linear_regression(X_train_mean, y_train)
rmse_mean = rmse(y_val, model_mean.predict(X_val_mean))

# Validation RMSE of both strategies, rounded to two decimals.
print(round(rmse_0, 2), round(rmse_mean, 2))


0.52 0.46


In [8]:
from sklearn.linear_model import Ridge

# Regularization strengths to compare. alpha=0 is plain least squares;
# scikit-learn's docs advise LinearRegression for that case, but it is kept
# here as the unregularized baseline.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# These inputs do not depend on the regularization strength, so build them
# once instead of re-selecting and re-filling the frames on every iteration.
X_train_0 = df_train[features].fillna(0)
X_val_0 = df_val[features].fillna(0)
y_train = df_train['fuel_efficiency_mpg']
y_val = df_val['fuel_efficiency_mpg']

# Maps each alpha to its validation RMSE, rounded to two decimals.
rmse_scores = {}

for r in r_values:
    model = Ridge(alpha=r)
    model.fit(X_train_0, y_train)
    y_pred = model.predict(X_val_0)
    rmse_scores[r] = round(rmse(y_val, y_pred), 2)

rmse_scores


{0: 0.52, 0.01: 0.52, 0.1: 0.52, 1: 0.52, 5: 0.52, 10: 0.52, 100: 0.52}

In [9]:
# Measure how much the validation RMSE moves when only the shuffle seed changes.
scores = []

# The split sizes depend only on the row count, so compute them once rather
# than on every iteration.
n = len(df)
n_train = int(0.6 * n)
n_val = int(0.2 * n)

# NOTE: this loop rebinds df_train / df_val / X_train / X_val / y_train / y_val;
# after it finishes they hold the seed=9 split.
for seed in range(10):
    # New seed-dependent shuffle, then the same 60% / 20% positional split.
    df_shuffled = df.sample(frac=1, random_state=seed)
    df_train = df_shuffled.iloc[:n_train]
    df_val = df_shuffled.iloc[n_train:n_train+n_val]

    # Missing values are filled with 0 for every seed.
    X_train = df_train[features].fillna(0)
    X_val = df_val[features].fillna(0)
    y_train = df_train['fuel_efficiency_mpg']
    y_val = df_val['fuel_efficiency_mpg']

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    scores.append(rmse(y_val, y_pred))

# np.std uses ddof=0 by default, i.e. the population standard deviation.
round(np.std(scores), 3)


0.008

In [10]:
# Rebuild the 60/20/20 split with seed 9, this time keeping the test part.
seed = 9
df_shuffled = df.sample(frac=1, random_state=seed)

n = len(df)
n_train, n_val = int(0.6 * n), int(0.2 * n)
df_train, df_val, df_test = (
    df_shuffled.iloc[:n_train],
    df_shuffled.iloc[n_train:n_train + n_val],
    df_shuffled.iloc[n_train + n_val:],
)

# The final model is trained on the training and validation rows together.
df_full_train = pd.concat([df_train, df_val])

# Missing values are filled with 0 in both the fitting and the test data.
X_full_train, y_full_train = (
    df_full_train[features].fillna(0),
    df_full_train['fuel_efficiency_mpg'],
)
X_test, y_test = (
    df_test[features].fillna(0),
    df_test['fuel_efficiency_mpg'],
)

# fit() returns the estimator, so construction and fitting chain.
model = Ridge(alpha=0.001).fit(X_full_train, y_full_train)
y_pred = model.predict(X_test)

# RMSE on the held-out test rows, rounded to three decimals.
round(rmse(y_test, y_pred), 3)


0.529