In [None]:
import pandas as pd
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

from utils.decorators import show_elapsed_time
from utils.decorators import send_notification

In [None]:
# Load the pre-split train and test tables from CSV files located in the
# current working directory.
df_train = pd.read_csv(filepath_or_buffer="rocar_train.csv")
df_test = pd.read_csv(filepath_or_buffer="rocar_test.csv")

In [None]:
# Quick sanity check after loading: report the (rows, columns) shape of each split.
print(
    f"Train shape: {df_train.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

# CONSTANTS

In [None]:
# Number of samples per mini-batch.
BATCH_SIZE = 32
# Use a CUDA device when torch reports one as available; otherwise use the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class CarDataset(Dataset):
    """In-memory dataset pairing each feature row with its label.

    Both inputs are read through their ``.values`` attribute and copied into
    ``float32`` tensors once, at construction time.
    """

    def __init__(self, features, labels):
        """Convert the inputs to tensors and print the resulting shapes.

        Args:
            features: Object exposing ``.values`` (e.g. a DataFrame), one row per sample.
            labels: Object exposing ``.values`` (e.g. a Series), one entry per sample.
        """
        feature_array, label_array = features.values, labels.values
        self.features = torch.tensor(feature_array, dtype=torch.float32)
        self.labels = torch.tensor(label_array, dtype=torch.float32)

        for title, tensor in (("Features", self.features), ("Labels", self.labels)):
            print(f"{title} shape: {tensor.shape}")

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps the price-bin labels to integer class ids.
encoder = LabelEncoder()

# Bin edges 0, 5000, ..., 60000: twelve consecutive intervals of width 5000.
bins = list(range(0, 13 * 5000, 5000))
print(bins)

# NOTE(review): with pd.cut's default right-closed intervals, a price of exactly 0
# or above 60000 falls outside every bin and becomes NaN — confirm the data never
# contains such values.
# NOTE(review): the "price" column is overwritten in place, so re-running this cell
# without reloading the CSVs would bin the already-encoded class ids.

# Train split: bin the raw prices, then fit the encoder on the bin labels.
df_train["price"] = encoder.fit_transform(
    pd.cut(df_train["price"], bins=bins, labels=list(range(12)))
)

# Test split: same bin edges, encoded with the mapping fitted on the train split.
df_test["price"] = encoder.transform(
    pd.cut(df_test["price"], bins=bins, labels=list(range(12)))
)

# Optional diagnostics: class balance of each split.
# print(df_train["price"].value_counts(normalize=True))
# print(df_test["price"].value_counts(normalize=True))

In [None]:
# Wrap each split in a CarDataset: every column except "price" is a feature and
# "price" itself is the label.
train_dataset = CarDataset(
    features=df_train.drop(columns=["price"]),
    labels=df_train["price"],
)
test_dataset = CarDataset(
    features=df_test.drop(columns=["price"]),
    labels=df_test["price"],
)

print(
    f"Train dataset length: {len(train_dataset)}",
    f"Test dataset length: {len(test_dataset)}",
    sep="\n",
)

# Only the training loader shuffles; the test loader keeps the row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class Net(nn.Module):
    """Fully connected classifier: ``in_features -> 128 -> 256 -> num_classes``.

    ``forward`` returns raw, unnormalised logits. ``nn.CrossEntropyLoss`` applies
    log-softmax internally, so feeding it softmax probabilities would normalise
    twice and flatten the gradients. Use ``predict_proba`` when probabilities are
    needed; ``argmax`` over logits and over probabilities selects the same class.

    Args:
        in_features: Number of input features per sample (default 11).
        num_classes: Number of output classes / logits (default 20).
    """

    def __init__(self, in_features: int = 11, num_classes: int = 20):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, num_classes)
        # Kept as an attribute for explicit probability queries; deliberately
        # NOT applied inside forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for input ``x``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

    def predict_proba(self, x):
        """Return per-class probabilities (softmax over the logits along dim 1)."""
        return self.softmax(self.forward(x))

In [None]:
# Model, loss, optimiser and learning-rate schedule.
model = Net().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Multiply the learning rate by 0.1 once the monitored loss has not improved
# for 5 consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.1, patience=5)

# Number of batches per epoch. len(DataLoader) also counts a final partial batch,
# so the summed batch losses are divided by the number of batches actually seen.
# The floor of 1 avoids a division by zero when a split is empty.
trainSteps = max(len(train_loader), 1)
testSteps = max(len(test_loader), 1)
# Per-epoch metrics, appended to by the training loop.
history = {"train_loss": [], "test_loss": [], "train_accuracy": [], "test_accuracy": []}

In [None]:
@show_elapsed_time
@send_notification
def train(num_epochs: int = 100):
    """Train the module-level ``model`` and evaluate it after every epoch.

    Relies on the notebook globals ``model``, ``criterion``, ``optimizer``,
    ``scheduler``, ``train_loader``, ``test_loader``, ``DEVICE`` and ``history``.

    Each epoch performs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``test_loader``, then:

    * appends mean batch loss and accuracy for both splits to ``history``;
    * saves the weights to ``best_model.pth`` only when the test loss improves;
    * steps the LR scheduler with the test loss.

    NOTE(review): the test split doubles as the validation set for checkpoint
    selection and LR scheduling — confirm this is intended.

    Args:
        num_epochs: Number of passes over the training data.
    """
    # len(DataLoader) includes a final partial batch, so the summed batch losses
    # are averaged over the batches actually seen. The floors of 1 avoid a
    # division by zero on an empty split.
    train_batches = max(len(train_loader), 1)
    test_batches = max(len(test_loader), 1)
    train_samples = max(len(train_loader.dataset), 1)
    test_samples = max(len(test_loader.dataset), 1)

    best_val_loss = float("inf")
    for epoch in range(num_epochs):
        # ---- optimisation pass ----
        model.train()
        train_loss = 0.0
        train_correct = 0
        for features, labels in train_loader:
            features, labels = features.to(DEVICE), labels.to(DEVICE)
            # The loss takes integer class indices as targets.
            labels = labels.long()
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            pred = torch.argmax(outputs, dim=1)
            train_correct += (pred == labels).sum().item()

        # ---- evaluation pass (no gradients) ----
        model.eval()
        test_loss = 0.0
        test_correct = 0
        with torch.no_grad():
            for features, labels in test_loader:
                features, labels = features.to(DEVICE), labels.to(DEVICE)
                labels = labels.long()
                outputs = model(features)
                loss = criterion(outputs, labels)
                test_loss += loss.item()
                pred = torch.argmax(outputs, dim=1)
                test_correct += (pred == labels).sum().item()

        train_loss /= train_batches
        test_loss /= test_batches

        train_accuracy = train_correct / train_samples
        test_accuracy = test_correct / test_samples

        history["train_loss"].append(train_loss)
        history["test_loss"].append(test_loss)
        history["train_accuracy"].append(train_accuracy)
        history["test_accuracy"].append(test_accuracy)

        # Checkpoint and announce ONLY on improvement, so best_model.pth always
        # holds the weights with the lowest test loss seen so far.
        if test_loss < best_val_loss:
            best_val_loss = test_loss
            torch.save(model.state_dict(), "best_model.pth")
            print(f"Epoch {epoch + 1}: New best test loss: {best_val_loss}")

        print(
            f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}"
        )

        scheduler.step(test_loss)

In [None]:
# Run the training loop for 100 epochs.
train(100)

In [None]:
import matplotlib.pyplot as plt


def _plot_curves(y_label, curves):
    """Draw each ``(history key, legend label)`` pair on one figure and show it."""
    for key, legend in curves:
        plt.plot(history[key], label=legend)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


# Loss per epoch, train vs. test.
_plot_curves("Loss", (("train_loss", "train loss"), ("test_loss", "test loss")))

# Accuracy per epoch, train vs. test.
_plot_curves(
    "Accuracy",
    (("train_accuracy", "train accuracy"), ("test_accuracy", "test accuracy")),
)