# Code using numpy

## Import libraries

In [None]:
import numpy as np
import pandas as pd

## Import data

In [None]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/train.csv')
print(df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   v.id  on road old  on road now  years      km  rating  condition  economy  \
0     1       535651       798186      3   78945       1          2       14   
1     2       591911       861056      6  117220       5          9        9   
2     3       686990       770762      2  132538       2          8       15   
3     4       573999       722381      4  101065       4          3       11   
4     5       691388       811335      6   61559       3          9       12   

   top speed  hp  torque  current price  
0        177  73     123       351318.0  
1        148  74      95       285001.5  
2        181  53      97       215386.0  
3        197  54     116       244295.5  
4        160  53     105       531114.5  


In [None]:
data = df.drop(df.columns[0], axis = 1).to_numpy()
X = data[:, :-1]
y = data[:, -1]
y = np.expand_dims(y, axis = 1)
X.shape, y.shape

((1000, 10), (1000, 1))

## Prepare the data

In [None]:
TRAIN_RATIO, VAL_RATIO = 0.8, 0.1
BATCH_SIZE, SEED = 64, 123

n = len(X)
idx = np.arange(n)
np.random.shuffle(idx)
n_train = int(n * TRAIN_RATIO)
n_val   = int(n * VAL_RATIO)
n_test  = n - n_train - n_val

idx_train = idx[:n_train]
idx_val = idx[n_train:n_train + n_val]
idx_test = idx[n_train + n_val:]

X_train, y_train = X[idx_train], y[idx_train]
X_val, y_val = X[idx_val], y[idx_val]
X_test, y_test = X[idx_test], y[idx_test]

In [None]:
class StandardScaler:
  def __init__(self):
    self.mean_ = None
    self.scale_ = None

  def fit(self, X):
    X = np.asarray(X, dtype = np.float64)
    self.mean_ = np.mean(X, axis = 0)
    self.scale_ = np.std(X, axis = 0, ddof = 0)
    self.scale_ = np.where(self.scale_ == 0, 1.0, self.scale_)
    return self

  def transform(self, X):
    if self.mean_ is None or self.scale_ is None:
      raise ValueError("Bạn phải gọi fit() trước khi transform().")

    X = np.asarray(X, dtype=np.float64)
    return (X - self.mean_) / self.scale_

  def fit_transform(self, X):
    return self.fit(X).transform(X)

  def inverse_transform(self, X_scaled):
    if self.mean_ is None or self.scale_ is None:
      raise ValueError("Bạn phải gọi fit() trước khi inverse_transform().")

    X_scaled = np.asarray(X_scaled, dtype=np.float64)
    return X_scaled * self.scale_ + self.mean_

In [None]:
scaler_X = StandardScaler()
X_train_s = scaler_X.fit_transform(X_train)
X_val_s = scaler_X.transform(X_val)
X_test_s = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train_s = scaler_y.fit_transform(y_train)
y_val_s = scaler_y.transform(y_val)
y_test_s = scaler_y.transform(y_test)

## Training

### Initialize parameters

In [None]:
# Thiết lập seed cho reproducibility
np.random.seed(SEED)

n_features = X_train_s.shape[1]
h1, h2 = 64, 32  # hidden sizes
out = 1          # regression

# Khởi tạo He (tốt cho ReLU)
def he_init(fan_in, fan_out):
    return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in)

W1 = he_init(n_features, h1); b1 = np.zeros((1, h1))
W2 = he_init(h1, h2);        b2 = np.zeros((1, h2))
W3 = he_init(h2, out);       b3 = np.zeros((1, out))

# Dropout rates
p_drop1 = 0.001   # sau ReLU1
p_drop2 = 0.0005   # sau ReLU2
USING_DROPOUT = False

### Activation function and forward function

In [None]:
def relu(x):        return np.maximum(0.0, x)
def relu_grad(x):   return (x > 0).astype(x.dtype)  # d/dx ReLU

def forward(Xb, using_dropout = False):
    # Layer 1
    Z1 = Xb @ W1 + b1
    A1 = relu(Z1)
    if using_dropout and p_drop1 > 0.0:
        D1 = (np.random.rand(*A1.shape) > p_drop1).astype(A1.dtype)  # Bernoulli(1-p)
        A1 = (A1 * D1) / (1.0 - p_drop1)
    else:
        D1 = None

    # Layer 2
    Z2 = A1 @ W2 + b2
    A2 = relu(Z2)
    if using_dropout and p_drop2 > 0.0:
        D2 = (np.random.rand(*A2.shape) > p_drop2).astype(A2.dtype)
        A2 = (A2 * D2) / (1.0 - p_drop2)
    else:
        D2 = None

    # Output (linear)
    Z3 = A2 @ W3 + b3
    return Z1, A1, D1, Z2, A2, D2, Z3  # Z3 = y_hat (scaled)

### Loss and Metrics

In [None]:
def mse(y_true, y_pred):
    return np.mean((y_pred - y_true) ** 2)

def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

def mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

def r2_score(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1.0 - (ss_res / (ss_tot + 1e-12))

### Adam optimizer helpers

In [None]:
# Adam hyperparams
lr = 1e-3
beta1, beta2, eps = 0.9, 0.999, 1e-8

# moments
mW1 = np.zeros_like(W1); vW1 = np.zeros_like(W1)
mW2 = np.zeros_like(W2); vW2 = np.zeros_like(W2)
mW3 = np.zeros_like(W3); vW3 = np.zeros_like(W3)
mb1 = np.zeros_like(b1); vb1 = np.zeros_like(b1)
mb2 = np.zeros_like(b2); vb2 = np.zeros_like(b2)
mb3 = np.zeros_like(b3); vb3 = np.zeros_like(b3)

t_adam = 0  # timestep
def adam_update(W, dW, mW, vW):
    global t_adam
    t_adam += 1
    mW[:] = beta1 * mW + (1 - beta1) * dW
    vW[:] = beta2 * vW + (1 - beta2) * (dW ** 2)
    m_hat = mW / (1 - beta1 ** t_adam)
    v_hat = vW / (1 - beta2 ** t_adam)
    W -= lr * m_hat / (np.sqrt(v_hat) + eps)

### Backprop

In [None]:
weight_decay = 1e-3  # đặt >0 (vd 1e-4) nếu muốn regularize

def backward(Xb, yb, Z1, A1, D1, Z2, A2, D2, yhat):
    n_b = Xb.shape[0]

    # Output layer (MSE derivative)
    dZ3 = (2.0 / n_b) * (yhat - yb)             # shape (B,1)
    dW3 = A2.T @ dZ3 + weight_decay * W3        # (h2,1)
    db3 = np.sum(dZ3, axis=0, keepdims=True)    # (1,1)

    dA2 = dZ3 @ W3.T                             # (B,h2)
    if D2 is not None:                           # dropout backward
        dA2 = (dA2 * D2) / (1.0 - p_drop2)

    dZ2 = dA2 * relu_grad(Z2)                    # (B,h2)
    dW2 = A1.T @ dZ2 + weight_decay * W2         # (h1,h2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)     # (1,h2)

    dA1 = dZ2 @ W2.T                             # (B,h1)
    if D1 is not None:
        dA1 = (dA1 * D1) / (1.0 - p_drop1)

    dZ1 = dA1 * relu_grad(Z1)                    # (B,h1)
    dW1 = Xb.T @ dZ1 + weight_decay * W1         # (d,h1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)     # (1,h1)

    return dW1, db1, dW2, db2, dW3, db3

### Mini-batch + Early stopping

In [None]:
epochs = 1000
patience = 20
best_val = np.inf
best_params = None
pat = 0

def get_minibatches(X, y, batch_size, seed=None):
    rng = np.random.default_rng(seed)
    idx = rng.permutation(X.shape[0])
    for i in range(0, len(idx), batch_size):
        sel = idx[i:i+batch_size]
        yield X[sel], y[sel]

for epoch in range(1, epochs+1):
    # ----- Train -----
    train_losses = []
    for Xb, yb in get_minibatches(X_train_s, y_train_s, BATCH_SIZE, seed=epoch+SEED):
        Z1, A1, D1, Z2, A2, D2, yhat = forward(Xb, USING_DROPOUT)
        loss = mse(yb, yhat)
        train_losses.append(loss)

        dW1, db1, dW2, db2, dW3, db3 = backward(Xb, yb, Z1, A1, D1, Z2, A2, D2, yhat)

        # Adam updates
        adam_update(W1, dW1, mW1, vW1); adam_update(b1, db1, mb1, vb1)
        adam_update(W2, dW2, mW2, vW2); adam_update(b2, db2, mb2, vb2)
        adam_update(W3, dW3, mW3, vW3); adam_update(b3, db3, mb3, vb3)

    # ----- Validation -----
    _, _, _, _, _, _, yhat_val_s = forward(X_val_s)
    val_rmse_scaled = rmse(y_val_s, yhat_val_s)

    # In thêm metric ở thang đo gốc (dễ hiểu hơn)
    yhat_val_orig = scaler_y.inverse_transform(yhat_val_s)
    val_rmse = rmse(y_val, yhat_val_orig)
    val_mae  = mae(y_val, yhat_val_orig)
    val_r2   = r2_score(y_val, yhat_val_orig)

    print(f"Epoch {epoch:3d} | Train MSE(scaled)={np.mean(train_losses):.6f} | "
          f"Val RMSE={val_rmse:.4f} | MAE={val_mae:.4f} | R2={val_r2:.4f}")

    # Early stopping theo RMSE (scaled để nhất quán với loss)
    if val_rmse_scaled + 1e-8 < best_val:
        best_val = val_rmse_scaled
        best_params = (W1.copy(), b1.copy(), W2.copy(), b2.copy(), W3.copy(), b3.copy())
        pat = 0
    else:
        pat += 1
        if pat >= patience:
            print(f"Early stopping at epoch {epoch}. Best scaled RMSE={best_val:.6f}")
            break

# Khôi phục tham số tốt nhất (nếu early stop)
if best_params is not None:
    W1, b1, W2, b2, W3, b3 = best_params

Epoch   1 | Train MSE(scaled)=4.022803 | Val RMSE=139553.8972 | MAE=111101.1563 | R2=-0.2073
Epoch   2 | Train MSE(scaled)=1.190200 | Val RMSE=124805.7470 | MAE=98256.1235 | R2=0.0344
Epoch   3 | Train MSE(scaled)=0.544028 | Val RMSE=82236.2928 | MAE=66215.8162 | R2=0.5808
Epoch   4 | Train MSE(scaled)=0.297201 | Val RMSE=66147.1980 | MAE=52957.9149 | R2=0.7288
Epoch   5 | Train MSE(scaled)=0.194304 | Val RMSE=58701.5907 | MAE=47010.3569 | R2=0.7864
Epoch   6 | Train MSE(scaled)=0.144642 | Val RMSE=52204.8380 | MAE=42472.4714 | R2=0.8311
Epoch   7 | Train MSE(scaled)=0.122279 | Val RMSE=48541.3128 | MAE=39358.4402 | R2=0.8539
Epoch   8 | Train MSE(scaled)=0.105540 | Val RMSE=45471.8993 | MAE=37019.6612 | R2=0.8718
Epoch   9 | Train MSE(scaled)=0.092314 | Val RMSE=42783.0285 | MAE=34974.7808 | R2=0.8865
Epoch  10 | Train MSE(scaled)=0.082246 | Val RMSE=41344.9299 | MAE=33556.6736 | R2=0.8940
Epoch  11 | Train MSE(scaled)=0.072821 | Val RMSE=39078.2897 | MAE=31800.3080 | R2=0.9053
Epoch 

### Evaluate

In [None]:
# Forward test
_, _, _, _, _, _, yhat_test_s = forward(X_test_s)

# Thang gốc
yhat_test_orig = scaler_y.inverse_transform(yhat_test_s)
test_rmse = rmse(y_test, yhat_test_orig)
test_mae  = mae(y_test, yhat_test_orig)
test_r2   = r2_score(y_test, yhat_test_orig)

print(f"\nTest — RMSE: {test_rmse:.4f} | MAE: {test_mae:.4f} | R2: {test_r2:.4f}")


Test — RMSE: 1525.3599 | MAE: 1158.3417 | R2: 0.9999


# Code using Tensorflow

## Import Libraries


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, regularizers
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Import Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/train.csv')
print(df.head())

Mounted at /content/drive
   v.id  on road old  on road now  years      km  rating  condition  economy  \
0     1       535651       798186      3   78945       1          2       14   
1     2       591911       861056      6  117220       5          9        9   
2     3       686990       770762      2  132538       2          8       15   
3     4       573999       722381      4  101065       4          3       11   
4     5       691388       811335      6   61559       3          9       12   

   top speed  hp  torque  current price  
0        177  73     123       351318.0  
1        148  74      95       285001.5  
2        181  53      97       215386.0  
3        197  54     116       244295.5  
4        160  53     105       531114.5  


In [None]:
data = tf.convert_to_tensor(df.drop(df.columns[0], axis = 1))
X = data[:, :-1].numpy()
y = data[:, -1]
y = tf.expand_dims(y, 1).numpy()
X.shape, y.shape

((1000, 10), (1000, 1))

## Prepare the data

In [None]:
TRAIN_RATIO, VAL_RATIO = 0.8, 0.1
BATCH_SIZE, SEED = 64, 123

n = len(X)
idx = tf.random.shuffle(tf.range(n), seed=42)
n_train = int(n * TRAIN_RATIO)
n_val   = int(n * VAL_RATIO)
n_test  = n - n_train - n_val

idx_train = idx[:n_train].numpy()
idx_val   = idx[n_train:n_train+n_val].numpy()
idx_test  = idx[n_train+n_val:].numpy()

X_train, y_train = X[idx_train], y[idx_train]
X_val,   y_val   = X[idx_val],   y[idx_val]
X_test,  y_test  = X[idx_test],  y[idx_test]

scaler_X = StandardScaler()
X_train_s = scaler_X.fit_transform(X_train)
X_val_s = scaler_X.transform(X_val)
X_test_s = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train_s = scaler_y.fit_transform(y_train)
y_val_s = scaler_y.transform(y_val)
y_test_s = scaler_y.transform(y_test)

train_ds = (tf.data.Dataset.from_tensor_slices((X_train_s, y_train_s))
            .shuffle(buffer_size=n_train, seed=SEED, reshuffle_each_iteration=True)
            .batch(BATCH_SIZE)
            .prefetch(tf.data.AUTOTUNE))

val_ds  = (tf.data.Dataset.from_tensor_slices((X_val_s, y_val_s))
           .batch(BATCH_SIZE)
           .prefetch(tf.data.AUTOTUNE))

test_ds = (tf.data.Dataset.from_tensor_slices((X_test_s, y_test_s))
           .batch(BATCH_SIZE)
           .prefetch(tf.data.AUTOTUNE))


## Training

In [None]:
# Weight decay thông qua L2 regularization (kernel_regularizer)
l2_lambda = 1e-3
drop1, drop2 = 0.1, 0.05
in_dim = X_train_s.shape[1]

model = Sequential([
    layers.Input(shape=(in_dim,)),
    layers.Dense(64, activation="relu",
                 kernel_regularizer=regularizers.l2(l2_lambda)),
    layers.Dropout(drop1),
    layers.Dense(32, activation="relu",
                 kernel_regularizer=regularizers.l2(l2_lambda)),
    layers.Dropout(drop2),
    layers.Dense(1)
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=[keras.metrics.RootMeanSquaredError(name="rmse")]
)

# Early stopping: dừng khi val_loss không cải thiện, khôi phục trọng số tốt nhất
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=20,
    restore_best_weights=True
)

In [None]:
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=500,            # đặt lớn, early stopping sẽ dừng sớm
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - loss: 0.9235 - rmse: 0.9275 - val_loss: 0.5636 - val_rmse: 0.7098
Epoch 2/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.5497 - rmse: 0.6986 - val_loss: 0.2803 - val_rmse: 0.4697
Epoch 3/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.2896 - rmse: 0.4789 - val_loss: 0.1383 - val_rmse: 0.2801
Epoch 4/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.2096 - rmse: 0.3864 - val_loss: 0.1029 - val_rmse: 0.2078
Epoch 5/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1550 - rmse: 0.3089 - val_loss: 0.0918 - val_rmse: 0.1810
Epoch 6/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1505 - rmse: 0.3027 - val_loss: 0.0828 - val_rmse: 0.1564
Epoch 7/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/s

## Evaluate

In [None]:
print("Eval (scaled y):", model.evaluate(test_ds, verbose=0))

# Dự đoán trên TEST (đang ở thang đã scale)
y_pred_s = model.predict(X_test_s, batch_size=BATCH_SIZE)
# Đảo về thang đo thật
y_pred = scaler_y.inverse_transform(y_pred_s)

# R² và RMSE trên thang đo thật
r2 = r2_score(y_test, y_pred)  # CHÚ Ý: (y_true, y_pred)
rmse = np.sqrt(np.mean((y_pred - y_test) ** 2))
mae  = np.mean(np.abs(y_pred - y_test))

print(f"Test R2 (original scale):  {r2:.4f}")
print(f"Test RMSE (original scale): {rmse:.4f}")
print(f"Test MAE  (original scale): {mae:.4f}")

Eval (scaled y): [0.005159873515367508, 0.03217161446809769]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Test R2 (original scale):  0.9989
Test RMSE (original scale): 4056.6466
Test MAE  (original scale): 3145.0366


# Code using Pytorch

## Import libraries

In [1]:
import os
import random
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

## Import Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/train.csv')

# Drop the first column (as in your TF code) and split X/y
data_np = df.drop(df.columns[0], axis=1).to_numpy()
X = data_np[:, :-1].astype(np.float32)
y = data_np[:, -1].astype(np.float32).reshape(-1, 1)

Mounted at /content/drive


## Prepare the data

In [3]:
TRAIN_RATIO, VAL_RATIO = 0.8, 0.1
BATCH_SIZE, SEED = 64, 123
LR = 1e-3
L2_LAMBDA = 1e-3
DROP1, DROP2 = 0.1, 0.05
EPOCHS = 500
PATIENCE = 20

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [4]:
n = len(X)
idx = np.random.RandomState(42).permutation(n)  # similar to tf.random.shuffle(seed=42)

n_train = int(n * TRAIN_RATIO)
n_val = int(n * VAL_RATIO)
n_test = n - n_train - n_val

idx_train = idx[:n_train]
idx_val   = idx[n_train:n_train + n_val]
idx_test  = idx[n_train + n_val:]

X_train, y_train = X[idx_train], y[idx_train]
X_val,   y_val   = X[idx_val],   y[idx_val]
X_test,  y_test  = X[idx_test],  y[idx_test]

scaler_X = StandardScaler()
X_train_s = scaler_X.fit_transform(X_train)
X_val_s   = scaler_X.transform(X_val)
X_test_s  = scaler_X.transform(X_test)

scaler_y = StandardScaler()
y_train_s = scaler_y.fit_transform(y_train)
y_val_s   = scaler_y.transform(y_val)
y_test_s  = scaler_y.transform(y_test)

def to_tensor(x): return torch.tensor(x, dtype=torch.float32)
train_ds = TensorDataset(to_tensor(X_train_s), to_tensor(y_train_s))
val_ds   = TensorDataset(to_tensor(X_val_s),   to_tensor(y_val_s))
test_ds  = TensorDataset(to_tensor(X_test_s),  to_tensor(y_test_s))

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  drop_last=False)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, drop_last=False)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

## Model

In [5]:
in_dim = X_train_s.shape[1]

model = nn.Sequential(
    nn.Linear(in_dim, 64),
    nn.ReLU(),
    nn.Dropout(DROP1),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Dropout(DROP2),
    nn.Linear(32, 1),
).to(device)

# Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=L2_LAMBDA)

## Loss and Optimizer

In [6]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=L2_LAMBDA)

## Early Stopping

In [7]:
class EarlyStopping:
    def __init__(self, patience=10, mode="min", restore_best=True):
        self.patience = patience
        self.mode = mode
        self.restore_best = restore_best
        self.best_score = None
        self.best_state = None
        self.counter = 0

    def step(self, current_score, model):
        # For "min" mode, improvement means current_score < best_score
        if self.best_score is None or \
           (self.mode == "min" and current_score < self.best_score - 1e-12) or \
           (self.mode == "max" and current_score > self.best_score + 1e-12):
            self.best_score = current_score
            self.counter = 0
            if self.restore_best:
                # Keep a deep copy of the weights
                self.best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            return False  # do not stop
        else:
            self.counter += 1
            return self.counter > self.patience  # stop if patience exceeded

    def restore(self, model):
        if self.restore_best and self.best_state is not None:
            model.load_state_dict(self.best_state)

early_stopper = EarlyStopping(patience=PATIENCE, mode="min", restore_best=True)

## Training

In [8]:
def run_epoch(loader, model, optimizer=None):
    is_train = optimizer is not None
    model.train(is_train)

    running_loss = 0.0
    n_samples = 0

    for xb, yb in loader:
        xb = xb.to(device)
        yb = yb.to(device)

        if is_train:
            optimizer.zero_grad()

        preds = model(xb)
        loss = criterion(preds, yb)

        if is_train:
            loss.backward()
            optimizer.step()

        batch_size = xb.size(0)
        running_loss += loss.item() * batch_size
        n_samples += batch_size

    avg_loss = running_loss / max(1, n_samples)
    rmse = np.sqrt(avg_loss)  # since loss is MSE on scaled targets
    return avg_loss, rmse

best_val = np.inf
for epoch in range(1, EPOCHS + 1):
    train_loss, train_rmse = run_epoch(train_loader, model, optimizer)
    with torch.no_grad():
        val_loss, val_rmse = run_epoch(val_loader, model, optimizer=None)

    if epoch % 1 == 0:
        print(f"Epoch {epoch:03d} | "
              f"train_loss={train_loss:.6f} train_rmse={train_rmse:.6f} | "
              f"val_loss={val_loss:.6f} val_rmse={val_rmse:.6f}")

    # Early stopping check on val_loss
    stop = early_stopper.step(val_loss, model)
    if stop:
        print(f"Early stopping at epoch {epoch}. Best val_loss: {early_stopper.best_score:.6f}")
        break

# Restore best weights
early_stopper.restore(model)

Epoch 001 | train_loss=0.969683 train_rmse=0.984725 | val_loss=0.825438 val_rmse=0.908536
Epoch 002 | train_loss=0.780927 train_rmse=0.883701 | val_loss=0.622591 val_rmse=0.789045
Epoch 003 | train_loss=0.541797 train_rmse=0.736069 | val_loss=0.341676 val_rmse=0.584531
Epoch 004 | train_loss=0.250711 train_rmse=0.500710 | val_loss=0.094362 val_rmse=0.307185
Epoch 005 | train_loss=0.097847 train_rmse=0.312805 | val_loss=0.033861 val_rmse=0.184013
Epoch 006 | train_loss=0.070889 train_rmse=0.266250 | val_loss=0.026022 val_rmse=0.161314
Epoch 007 | train_loss=0.055770 train_rmse=0.236156 | val_loss=0.019219 val_rmse=0.138633
Epoch 008 | train_loss=0.044647 train_rmse=0.211298 | val_loss=0.016056 val_rmse=0.126711
Epoch 009 | train_loss=0.045503 train_rmse=0.213314 | val_loss=0.013918 val_rmse=0.117976
Epoch 010 | train_loss=0.043569 train_rmse=0.208732 | val_loss=0.011813 val_rmse=0.108688
Epoch 011 | train_loss=0.035594 train_rmse=0.188663 | val_loss=0.012568 val_rmse=0.112105
Epoch 012 

## Evaluate

In [9]:
model.eval()
with torch.no_grad():
    test_loss, test_rmse = run_epoch(test_loader, model, optimizer=None)
print(f"Eval (scaled y): loss={test_loss:.6f}, rmse={test_rmse:.6f}")

Eval (scaled y): loss=0.001187, rmse=0.034447


In [10]:
model.eval()
y_pred_s_list = []
with torch.no_grad():
    for xb, _ in test_loader:
        xb = xb.to(device)
        preds = model(xb).cpu().numpy()
        y_pred_s_list.append(preds)

y_pred_s = np.vstack(y_pred_s_list)           # scaled predictions
y_pred = scaler_y.inverse_transform(y_pred_s) # back to original scale

r2   = r2_score(y_test, y_pred)  # (y_true, y_pred)
rmse = np.sqrt(np.mean((y_pred - y_test) ** 2))
mae  = np.mean(np.abs(y_pred - y_test))

print(f"Test R2 (original scale):   {r2:.4f}")
print(f"Test RMSE (original scale): {rmse:.4f}")
print(f"Test MAE  (original scale): {mae:.4f}")

Test R2 (original scale):   0.9989
Test RMSE (original scale): 4324.0210
Test MAE  (original scale): 3587.2085
