In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

**Get the Data**

In [3]:
# Load the car fuel-efficiency dataset directly from its public GitHub location
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
data = pd.read_csv(DATA_URL)
data

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [5]:
# Restrict the frame to the four model inputs plus the prediction target
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
# .copy() detaches the selection from the originally loaded frame
data = data.loc[:, cols].copy()

data.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


**EDA**

**Question 1:**
**There's one column with missing values. What is it?**

In [19]:
# Count of NaN entries in every column, shown together with a label
("Missing values per column:", data.isna().sum())

('Missing values per column:',
 engine_displacement      0
 horsepower             708
 vehicle_weight           0
 model_year               0
 fuel_efficiency_mpg      0
 dtype: int64)

In [16]:
# Names of the columns that contain at least one NaN
missing_col = [name for name in data.columns if data[name].isna().any()]
f"Column with missing values: {missing_col}"

"Column with missing values: ['horsepower']"

**Question 2:**
**What's the median (50th percentile) for variable 'horsepower'?**

In [20]:
# Median of the horsepower column (NaN entries are skipped by pandas)
horsepower = data['horsepower']
median_hp = horsepower.median()
("Median horsepower =", median_hp)

('Median horsepower =', np.float64(149.0))

In [22]:
# Function to shuffle and split manually (60/20/20 by default)
def split_dataset(data, seed=42, train_frac=0.6, val_frac=0.2):
    """Shuffle ``data`` and cut it into train, validation and test frames.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to split. It is not modified.
    seed : int, default 42
        Passed as ``random_state`` to ``DataFrame.sample`` so the shuffle is
        reproducible.
    train_frac : float, default 0.6
        Fraction of the rows that goes to the training part.
    val_frac : float, default 0.2
        Fraction of the rows that goes to the validation part.

    Returns
    -------
    tuple of pd.DataFrame
        ``(df_train, df_val, df_test)``. The test part receives every row left
        after the first two parts, so the three lengths always add up to
        ``len(data)``. All three keep the 0..n-1 index of the shuffled frame.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions add up to more than 1.
    """
    # Small tolerance so that float sums such as 0.7 + 0.3 are not rejected.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1"
        )

    n = len(data)
    # int() truncates, so any rounding remainder ends up in the test part.
    n_train = int(train_frac * n)
    n_val = int(val_frac * n)

    # frac=1 draws every row once, i.e. a full shuffle; the index is reset so
    # the positional slices below line up with the labels.
    data_shuffled = data.sample(frac=1, random_state=seed).reset_index(drop=True)

    df_train = data_shuffled.iloc[:n_train]
    df_val = data_shuffled.iloc[n_train:n_train + n_val]
    df_test = data_shuffled.iloc[n_train + n_val:]
    return df_train, df_val, df_test

# Baseline 60/20/20 split with the reference seed
df_train, df_val, df_test = split_dataset(data, seed=42)

# Show the size of each part as a sanity check
tuple(len(part) for part in (df_train, df_val, df_test))


(5822, 1940, 1942)

In [32]:
# Model inputs (kept as a list so it can be used directly for column selection)
features = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]
# Column the models are trained to predict
target = 'fuel_efficiency_mpg'

def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Separate inputs and target for the training and validation parts
X_train, y_train = df_train[features], df_train[target]
X_val, y_val = df_val[features], df_val[target]

# Option A – replace every missing value with 0
X_train_0, X_val_0 = X_train.fillna(0), X_val.fillna(0)
model_0 = LinearRegression().fit(X_train_0, y_train)
rmse_0 = round(rmse(y_val, model_0.predict(X_val_0)), 2)

# Option B – replace missing values with the column means.
# The means come from the training part only and are reused for validation.
mean_values = X_train.mean()
X_train_mean, X_val_mean = X_train.fillna(mean_values), X_val.fillna(mean_values)
model_mean = LinearRegression().fit(X_train_mean, y_train)
rmse_mean = round(rmse(y_val, model_mean.predict(X_val_mean)), 2)

print(f"RMSE (fill 0): {rmse_0}")
print(f"RMSE (fill mean): {rmse_mean}")

# Compare the two rounded scores; a tie is reported explicitly
if rmse_0 < rmse_mean:
    verdict = "With 0"
elif rmse_mean < rmse_0:
    verdict = "With mean"
else:
    verdict = "Both equally good"
print("Better option:", verdict)


RMSE (fill 0): 0.51
RMSE (fill mean): 0.46
Better option: With mean


In [31]:
# Regularisation strengths to compare; 0 means plain (unregularised) regression
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Missing values are filled with 0 for this experiment
X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)

results = {}
for r in r_values:
    # r == 0 uses LinearRegression, every other value uses Ridge(alpha=r)
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train_0, y_train)
    y_pred = model.predict(X_val_0)
    results[r] = round(rmse(y_val, y_pred), 2)

print("RMSE per r:")
for r_value, r_score in results.items():
    print(f"r={r_value}: RMSE={r_score}")

# Lowest rounded RMSE; ties are broken in favour of the smallest r
best_rmse = min(results.values())
best_r = min(r for r, score in results.items() if score == best_rmse)
print("Best r:", best_r)


RMSE per r:
r=0: RMSE=0.51
r=0.01: RMSE=0.51
r=0.1: RMSE=0.51
r=1: RMSE=0.51
r=5: RMSE=0.51
r=10: RMSE=0.51
r=100: RMSE=0.51
Best r: 0


In [30]:
# Validation RMSE for ten different shuffles, to see how much the split matters.
# NOTE: the loop rebinds df_train / df_val / df_test and X_* / y_*; after this
# cell they hold the split for the last seed, with missing values filled with 0.
rmse_list = []

for s in range(10):
    df_train, df_val, df_test = split_dataset(data, seed=s)
    X_train, y_train = df_train[features].fillna(0), df_train[target]
    X_val, y_val = df_val[features].fillna(0), df_val[target]

    model = LinearRegression().fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse_list.append(rmse(y_val, preds))

# np.std with its default settings, rounded to three decimals
std_rmse = round(np.std(rmse_list), 3)
print("Standard deviation of RMSE across seeds:", std_rmse)
print("All scores:", [round(score, 3) for score in rmse_list])


Standard deviation of RMSE across seeds: 0.007
All scores: [np.float64(0.521), np.float64(0.522), np.float64(0.523), np.float64(0.516), np.float64(0.511), np.float64(0.529), np.float64(0.532), np.float64(0.51), np.float64(0.515), np.float64(0.513)]


In [29]:
# Re-split with seed 9 and merge train + validation into one training frame
df_train, df_val, df_test = split_dataset(data, seed=9)
df_train_val = pd.concat([df_train, df_val])

# Missing values are filled with 0 in both the training and the test inputs
X_train_val, y_train_val = df_train_val[features].fillna(0), df_train_val[target]
X_test, y_test = df_test[features].fillna(0), df_test[target]

# Ridge regression with alpha=0.001, fitted on train + validation
ridge_model = Ridge(alpha=0.001).fit(X_train_val, y_train_val)
y_pred_test = ridge_model.predict(X_test)

# Score on the held-out test part, rounded to three decimals
rmse_test = round(rmse(y_test, y_pred_test), 3)
print("Test RMSE:", rmse_test)


Test RMSE: 0.515
