In [None]:
import pandas as pd
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

from utils.decorators import show_elapsed_time
from utils.decorators import send_notification
from utils.metrics import compute_metrics
from utils.plots import plot_loss_and_metrics, print_metrics_table
import matplotlib.pyplot as plt
from constants import TRAIN_DATA_CSV, TEST_DATA_CSV, SIMPLE_MODEL_PATH

In [None]:
# Read the train and test splits from the CSV paths configured in `constants`.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_CSV, TEST_DATA_CSV))

In [None]:
# Quick sanity check: (rows, columns) of each split, one per line.
print(
    f"Train shape: {df_train.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

# CONSTANTS

In [None]:
# Mini-batch size shared by the train and test DataLoaders.
BATCH_SIZE = 2048

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [None]:
class CarDataset(Dataset):
    """In-memory dataset pairing float32 feature rows with float32 labels.

    Both inputs are converted to tensors once, at construction time, so
    ``__getitem__`` is a plain tensor index.

    Args:
        features: Object exposing ``.values`` (e.g. a pandas DataFrame), one row per sample.
        labels: Object exposing ``.values`` (e.g. a pandas Series), one entry per sample.

    Raises:
        ValueError: If ``features`` and ``labels`` hold a different number of samples.
    """

    def __init__(self, features, labels):
        self.features = torch.tensor(features.values, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.float32)
        # Fail fast: a mismatch would otherwise only show up as an IndexError
        # deep inside a DataLoader iteration, or as silently ignored labels.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` tensors stored at position ``idx``."""
        return self.features[idx], self.labels[idx]


# The label column, and the full list of columns that are dropped before the
# remaining ones are used as model inputs.
TARGET_COLUMN = "price"
NON_FEATURE_COLUMNS = [TARGET_COLUMN, "unique_id", "input"]


def _to_dataset(frame):
    """Wrap a dataframe as a CarDataset: kept columns are features, ``price`` is the label."""
    return CarDataset(frame.drop(columns=NON_FEATURE_COLUMNS), frame[TARGET_COLUMN])


train_dataset = _to_dataset(df_train)
test_dataset = _to_dataset(df_test)

print(f"Train dataset length: {len(train_dataset)}")
print(f"Test dataset length: {len(test_dataset)}")

# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class Net(nn.Module):
    """Three-layer fully connected regressor producing one value per sample.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Dropout -> Linear.
    The defaults reproduce the original fixed 10 -> 1024 -> 256 -> 1 network,
    and the layer attribute names are unchanged so existing state_dicts load.

    Args:
        in_features: Number of input features per sample.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        dropout_p: Dropout probability applied before the output layer.
    """

    def __init__(self, in_features: int = 10, hidden1: int = 1024, hidden2: int = 256, dropout_p: float = 0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` predictions."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # Dropout sits only between the last hidden layer and the output layer.
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [None]:
model = Net().to(DEVICE)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Halve the learning rate (down to min_lr) once the value passed to
# scheduler.step() has gone more than `patience` consecutive calls without improving.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, min_lr=1e-9, patience=50)

# Number of batches each loader yields per pass. len(loader) includes the final
# partial batch, whereas len(dataset) // BATCH_SIZE drops it and evaluates to 0
# (a ZeroDivisionError when averaging) for a split smaller than one batch.
trainSteps = len(train_loader)
testSteps = len(test_loader)

# Per-epoch histories, appended to by train().
history = {"train_loss": [], "test_loss": []}
metrics_history = {
    "train_mae": [],
    "test_mae": [],
    "train_rmse": [],
    "test_rmse": [],
    "train_r2": [],
    "test_r2": [],
    "train_mse": [],
    "test_mse": [],
}

In [None]:
def _run_epoch(loader, training):
    """Run one full pass over ``loader`` with the global model.

    Args:
        loader: DataLoader yielding ``(features, labels)`` batches.
        training: When True the model is put in train mode and the optimizer
            is stepped on every batch; when False the model is put in eval
            mode and gradients are disabled.

    Returns:
        Tuple ``(mean_loss, predictions, ground_truths)``. ``mean_loss`` is the
        per-sample average of the criterion over the pass (NaN if the loader
        yielded nothing); the two lists hold the numpy outputs and labels of
        every batch, in iteration order.
    """
    model.train(training)
    loss_sum = 0.0
    n_samples = 0
    predictions = []
    ground_truths = []
    with torch.set_grad_enabled(training):
        for features, labels in loader:
            features, labels = features.to(DEVICE), labels.to(DEVICE)
            if training:
                optimizer.zero_grad()
            outputs = model(features)
            # Labels are 1-D; add a trailing dim so they match the (batch, 1) outputs.
            loss = criterion(outputs, labels.unsqueeze(1))
            if training:
                loss.backward()
                optimizer.step()
            # criterion is nn.MSELoss() with its default mean reduction, so weight
            # each batch mean by its size; the last batch may be smaller.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            n_samples += batch_size
            # NOTE(review): predictions are appended as length-1 rows while labels are
            # scalars — confirm compute_metrics accepts that pairing.
            predictions.extend(outputs.detach().cpu().numpy())
            ground_truths.extend(labels.detach().cpu().numpy())
    mean_loss = loss_sum / n_samples if n_samples else float("nan")
    return mean_loss, predictions, ground_truths


def _record_metrics(split, metrics):
    """Append one epoch's MAE/MSE/RMSE/R2 for ``split`` ("train" or "test") to ``metrics_history``."""
    for name in ("MAE", "MSE", "RMSE", "R2"):
        metrics_history[f"{split}_{name.lower()}"].append(metrics[name])


@show_elapsed_time
@send_notification
def train(num_epochs: int = 100):
    """Train the global model and track losses, metrics and the best checkpoint.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``test_loader``, appends the results to ``history``
    and ``metrics_history``, steps the LR scheduler on the test loss, and saves
    the model's state_dict to ``SIMPLE_MODEL_PATH`` whenever the test loss
    reaches a new minimum.

    Losses are averaged per sample. Dividing the summed batch losses by
    ``len(dataset) // BATCH_SIZE`` ignores the final partial batch, which
    inflates the average and divides by zero when a split is smaller than
    one batch.

    NOTE(review): checkpoint selection and LR scheduling both use the test
    split, so the reported test metrics are not an unbiased estimate.

    Args:
        num_epochs: Number of epochs to run.
    """
    best_val_loss = float("inf")
    for epoch in range(num_epochs):
        train_loss, train_predictions, train_ground_truths = _run_epoch(train_loader, training=True)
        train_metrics = compute_metrics(train_predictions, train_ground_truths)
        _record_metrics("train", train_metrics)

        test_loss, test_predictions, test_ground_truths = _run_epoch(test_loader, training=False)
        test_metrics = compute_metrics(test_predictions, test_ground_truths)
        _record_metrics("test", test_metrics)

        history["train_loss"].append(train_loss)
        history["test_loss"].append(test_loss)

        scheduler.step(test_loss)
        print(scheduler.get_last_lr())

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
        print("Train Metrics: ", train_metrics)
        print("Test Metrics: ", test_metrics)

        # Keep only the weights with the lowest test loss seen so far.
        if test_loss < best_val_loss:
            best_val_loss = test_loss
            torch.save(model.state_dict(), SIMPLE_MODEL_PATH)
            print(f"Epoch {epoch + 1}: New best test loss: {best_val_loss}")

In [None]:
# Epoch budget for this run, passed positionally exactly as before.
NUM_EPOCHS = 5000
train(NUM_EPOCHS)

In [None]:
# Visualise and tabulate the histories filled in by train(); both helpers come from utils.plots.
# NOTE(review): SLICE_START=20 presumably skips the first 20 epochs in the plots — confirm in utils.plots.
plot_loss_and_metrics(history, metrics_history, SLICE_START=20)
print_metrics_table(metrics_history)

In [None]:
# Signed (prediction - target) and absolute errors for every sample in the test set.
differences = []
abs_differences = []

# NOTE(review): this evaluates the weights currently held in `model`, which are not
# necessarily the best checkpoint that train() wrote to SIMPLE_MODEL_PATH — confirm intended.
# Switch to inference mode explicitly so dropout is off even if training was
# interrupted mid-epoch and left the model in train mode.
model.eval()
with torch.no_grad():
    for inputs, targets in test_loader:  # Loop through the entire test set
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        # Column 0 of the (batch, 1) output is the prediction for each sample.
        batch_diff = (model(inputs)[:, 0] - targets).cpu()
        differences.extend(batch_diff.tolist())
        abs_differences.extend(batch_diff.abs().tolist())

# Plotting the differences
plt.figure(figsize=(10, 5))
plt.plot(differences, marker="o", linestyle="-", markersize=4)
plt.title("Differences between Predictions and Ground Truth")
plt.xlabel("Sample Index")
plt.ylabel("Prediction - Ground Truth")
plt.grid(True)
plt.show()

# Average absolute difference; NaN when the test set yielded no samples.
average_abs_diff = sum(abs_differences) / len(abs_differences) if abs_differences else float("nan")
print(f"Average absolute difference: {average_abs_diff:.2f}")