In [1]:
import pandas as pd 
import numpy as np


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [3]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [5]:
df = df[['engine_displacement', 
         'horsepower', 
         'vehicle_weight', 
         'model_year', 
         'fuel_efficiency_mpg']]


In [6]:
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [7]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [8]:
mediane_hp = df['horsepower'].median()
print (mediane_hp)


149.0


In [9]:
np.random.seed(2)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

In [10]:
y_train_orig = df_train.fuel_efficiency_mpg.values
y_val_orig = df_val.fuel_efficiency_mpg.values
y_test_orig = df_test.fuel_efficiency_mpg.values

y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [11]:
df_train_0 = df_train.fillna(0)
df_val_0 = df_val.fillna(0)
df_test_0 = df_test.fillna(0)






In [12]:
mediane_hp = df_train.horsepower.median()
df_train_median = df_train.fillna(mediane_hp)
df_val_median = df_val.fillna(mediane_hp)
df_test_median = df_test.fillna(mediane_hp)


In [13]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [14]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [15]:
w_0_nul, w_nul = train_linear_regression(df_train_0, y_train)

y_pred_0 = w_0_nul + df_train_0.dot(w_nul)

rmse(y_train, y_pred_0)

np.float64(0.03902570026567232)

In [16]:
w_0_med, w_med = train_linear_regression(df_train_median, y_train)

y_pred_median = w_0_med + df_train_median.dot(w_med)

rmse(y_train, y_pred_median)

np.float64(0.03579177175935631)

In [18]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

In [19]:
def prepare_X(df):
    df = df.copy()
    features = base.copy()


    df_num = df[features]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [20]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [21]:
df_train = df_train.fillna(0)
df_train.isnull().sum()

engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
dtype: int64

In [22]:
df_val = df_val.fillna(0)
df_val.isnull().sum()

engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
dtype: int64

In [23]:
df_test = df_test.fillna(0)
df_test.isnull().sum()

engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
dtype: int64

In [24]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [25]:
X_train = prepare_X(df_train)

In [26]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(df_train, y_train, r=r)
    # print(f"r={r}, w0={w_0:.2f}, weights={w}")
    y_pred = w_0 + X_train.dot(w)
    # print('train', rmse(y_train, y_pred))

    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)
    print('r=', r, 'rmse=', rmse(y_val, y_pred))

r= 0 rmse= 0.03944653052537959
r= 0.01 rmse= 0.03951510491799508
r= 0.1 rmse= 0.04024128518108494
r= 1 rmse= 0.04116443303932604
r= 5 rmse= 0.04132350427401614
r= 10 rmse= 0.04134500010719854
r= 100 rmse= 0.041364677695047376


In [27]:
def evaluate_seed(seed):
    np.random.seed(seed)
    n = len(df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
    y_val = np.log1p(df_val.fuel_efficiency_mpg.values)

    del df_train['fuel_efficiency_mpg']
    del df_val['fuel_efficiency_mpg']

    X_train = df_train.fillna(0).values
    X_val = df_val.fillna(0).values

    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_val.dot(w)
    return rmse(y_val, y_pred)

rmse_scores = [evaluate_seed(s) for s in range(10)]
print("RMSEs:", rmse_scores)
std = np.std(rmse_scores)
print("Standard deviation =", round(std, 3))


RMSEs: [np.float64(0.03801775537080148), np.float64(0.03927885333921255), np.float64(0.03944653052537959), np.float64(0.038727637037731855), np.float64(0.03727535850054727), np.float64(0.03938438834004637), np.float64(0.03890763931360936), np.float64(0.03837971626977783), np.float64(0.04018986975230982), np.float64(0.03860764644229973)]
Standard deviation = 0.001


In [28]:
np.random.seed(9)
n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

df_full_train = pd.concat([df_train, df_val])
y_full_train = np.concatenate([y_train, y_val])

X_full_train = df_full_train.fillna(0).values
X_test = df_test.fillna(0).values

w_0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)
y_pred = w_0 + X_test.dot(w)
final_rmse = rmse(y_test, y_pred)
print("Final RMSE (test):", round(final_rmse, 3))

Final RMSE (test): 0.039


In [30]:
np.random.seed(9)

n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

# Variables cibles (log-transformées)
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

# Supprimer la cible du DataFrame
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

# --- Combine train + val ---
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)

y_full_train = np.concatenate([y_train, y_val])

# --- Préparer les features ---
def prepare_X(df):
    df_num = df.copy()
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

# --- Fonction d'entraînement avec régularisation ---
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

# --- Fonction RMSE ---
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

# --- Entraîner le modèle ---
X_full_train = prepare_X(df_full_train)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# --- Évaluer sur le test set ---
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)

# Revenir à l’échelle originale
y_pred_final = np.expm1(y_pred)
y_test_final = np.expm1(y_test)

# Calcul du RMSE final
final_rmse = rmse(y_test_final, y_pred_final)
print("Final RMSE on test:", round(final_rmse, 3))


Final RMSE on test: 0.607
