In [None]:
import pandas as pd
import timm
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModelForMaskedLM

from core.src.constants import IMAGES_PATH, TRAIN_DATA_CSV, TEST_DATA_CSV, FINE_TUNED_BERT_MODEL_PATH

In [None]:
# Read both splits with identical parsing options; `unique_id` is forced to
# `str` so pandas does not reinterpret the identifiers as numbers.
df_train, df_test = (
    pd.read_csv(csv_path, dtype={"unique_id": str})
    for csv_path in (TRAIN_DATA_CSV, TEST_DATA_CSV)
)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Preview the first rows of the training frame to sanity-check the parsed columns.
df_train.head()

In [None]:
# Print the training frame's per-column summary.
df_train.info()

In [None]:
# Image encoder: pretrained FastViT requested with `num_classes=0`, moved to the
# selected device and switched to evaluation mode in one chain.
fastvit_model = (
    timm.create_model("fastvit_t8.apple_in1k", pretrained=True, num_classes=0)
    .to(device)
    .eval()
)

# Inference-time preprocessing built from the encoder's own data configuration.
data_config = timm.data.resolve_model_data_config(fastvit_model)
transforms = timm.data.create_transform(is_training=False, **data_config)

# Text encoder: tokenizer from the base Romanian BERT checkpoint, weights from
# the fine-tuned checkpoint. Hidden states are requested so the training loop
# can read `outputs.hidden_states`.
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")
bert_model = AutoModelForMaskedLM.from_pretrained(FINE_TUNED_BERT_MODEL_PATH).to(device)
bert_model.config.output_hidden_states = True

# Release cached GPU memory left over from loading the models.
torch.cuda.empty_cache()

In [None]:
# Tokenize each split's own text column. The test encodings must come from
# `df_test`: the dataset indexes encodings by row position, so building them
# from `df_train` would pair test images/prices with training text.
train_encodings = tokenizer(df_train["input"].tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(df_test["input"].tolist(), padding=True, truncation=True, max_length=512)

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image


class MultimodalDataset(Dataset):
    """Per-listing samples combining an image, tokenized text and tabular features.

    Each item is the tuple
    ``(image, input_ids, attention_mask, structured_data, target)`` where:

    * ``image`` is ``<unique_id>/00.png`` under ``IMAGES_PATH``, converted to RGB
      and passed through the notebook-level ``transforms`` pipeline;
    * ``input_ids`` / ``attention_mask`` come from ``encodings`` at the same
      row position;
    * ``structured_data`` is a float32 vector with one entry per name in
      ``STRUCTURED_COLUMNS``;
    * ``target`` is the row's ``price`` as a float32 scalar tensor.

    Args:
        df: Frame holding ``unique_id``, ``price`` and every structured column.
        encodings: Mapping with ``"input_ids"`` and ``"attention_mask"``
            sequences aligned with the rows of ``df`` by position.
        categories: Optional ``{column: categories}`` mapping used to encode
            non-numeric columns. Pass ``train_dataset.categories`` when
            building another split so both share the same integer codes.
            When omitted, categories are derived from ``df`` itself.

    Attributes:
        categories: ``{column: pandas.Index}`` actually used for encoding.
        structured_features: ``(len(df), len(STRUCTURED_COLUMNS))`` float32 tensor.
    """

    STRUCTURED_COLUMNS = [
        "km",
        "putere",
        "capacitate cilindrica",
        "anul producției",
        "marca",
        "model",
        "combustibil",
        "tip caroserie",
        "firma",
        "is_automatic",
    ]

    def __init__(self, df: pd.DataFrame, encodings, categories=None):
        self.df = df
        self.encodings = encodings
        self.categories = {}

        # Encode the tabular features once for the whole frame. Category codes
        # only make sense relative to a column-wide vocabulary, so they cannot
        # be computed from a single row inside __getitem__.
        encoded_columns = []
        for col in self.STRUCTURED_COLUMNS:
            column = df[col]
            if pd.api.types.is_numeric_dtype(column):
                # Numeric (and boolean) columns are used as-is.
                values = column.astype("float32").to_numpy()
            else:
                known = None if categories is None else categories.get(col)
                if known is None:
                    categorical = pd.Categorical(column)
                else:
                    categorical = pd.Categorical(column, categories=known)
                self.categories[col] = categorical.categories
                # Missing or unknown values receive code -1.
                values = categorical.codes
            encoded_columns.append(torch.as_tensor(values, dtype=torch.float32))
        self.structured_features = torch.stack(encoded_columns, dim=1)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional access keeps rows aligned with `encodings` even when the
        # frame does not carry a default 0..n-1 index.
        img_name = self.df["unique_id"].iloc[idx] + "/00.png"
        with Image.open(IMAGES_PATH / img_name) as raw_image:
            image = transforms(raw_image.convert("RGB"))

        input_ids = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])

        structured_data = self.structured_features[idx]
        target = torch.tensor(self.df["price"].iloc[idx], dtype=torch.float32)

        return image, input_ids, attention_mask, structured_data, target


# Wrap each split (frame + matching encodings) in a dataset.
train_dataset = MultimodalDataset(df=df_train, encodings=train_encodings)
test_dataset = MultimodalDataset(df=df_test, encodings=test_encodings)

# Batches of 8; only the training split is reshuffled every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [None]:
class Net(nn.Module):
    """Three-layer MLP regression head producing one value per sample.

    Args:
        in_features: Width of the input feature vector. The default (1546) is
            the width this notebook originally hard-coded; presumably image
            embedding + text embedding + 10 structured columns — confirm if
            either encoder changes.
        hidden_features: Output width of the first linear layer.
        bottleneck_features: Output width of the second linear layer.
    """

    def __init__(self, in_features: int = 1546, hidden_features: int = 512, bottleneck_features: int = 256):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, bottleneck_features)
        self.fc3 = nn.Linear(bottleneck_features, 1)

    def forward(self, x):
        """Map ``(batch, in_features)`` inputs to ``(batch, 1)`` predictions."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No activation on the output: this is an unbounded regression target.
        return self.fc3(x)


# Regression head and its optimisation machinery: L1 (mean absolute error)
# loss, Adam, and a scheduler that halves the learning rate after 5 epochs
# without improvement of the monitored value.
model = Net().to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=5, min_lr=1e-9)

# Number of full batches of 8 in each split (a trailing partial batch is not counted).
trainSteps, testSteps = (len(loader.dataset) // 8 for loader in (train_loader, test_loader))

# Per-epoch loss curves, filled in by the training loop.
history = dict(train_loss=[], test_loss=[])

In [None]:
NUM_EPOCHS = 10


def build_combined_features(images, input_ids, attention_masks, structured_data):
    """Run the frozen encoders and concatenate their outputs with the tabular features.

    The text embedding is the mean of the last hidden layer taken over real
    tokens only: positions where ``attention_masks`` is 0 (padding) are
    excluded, so the embedding does not depend on how much padding a batch has.

    Args:
        images: Batch of preprocessed images, already on ``device``.
        input_ids: Token ids, one row per sample.
        attention_masks: 1 for real tokens, 0 for padding; same shape as ``input_ids``.
        structured_data: Float tabular features, one row per sample.

    Returns:
        A 2-D tensor ``[image_embedding | text_embedding | structured_data]``
        per sample, detached from the encoders' graphs.
    """
    with torch.no_grad():
        image_embeddings = fastvit_model(images)
        encoder_outputs = bert_model(input_ids=input_ids, attention_mask=attention_masks)
        token_embeddings = encoder_outputs.hidden_states[-1]

        token_weights = attention_masks.unsqueeze(-1).to(token_embeddings.dtype)
        # clamp guards against a division by zero for an all-padding row.
        token_counts = token_weights.sum(dim=1).clamp(min=1.0)
        mean_text_embeddings = (token_embeddings * token_weights).sum(dim=1) / token_counts

    return torch.cat([image_embeddings, mean_text_embeddings, structured_data], dim=1)


for epoch in range(NUM_EPOCHS):
    # ---- Training pass: only the regression head `model` is updated. ----
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        images, input_ids, attention_masks, structured_data, target = (t.to(device) for t in batch)
        combined_features = build_combined_features(images, input_ids, attention_masks, structured_data)

        optimizer.zero_grad()
        outputs = model(combined_features)
        # The head emits (batch, 1); reshape the targets to match.
        loss = criterion(outputs, target.view(-1, 1))
        loss.backward()
        optimizer.step()

        # criterion averages over the batch; re-weight by batch size so the
        # epoch figure is a true per-sample mean even with a short last batch.
        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    history["train_loss"].append(epoch_loss)
    print(f"Epoch {epoch + 1} Train Loss: {epoch_loss:.4f}")

    # The plateau scheduler monitors the training loss of the epoch just finished.
    scheduler.step(epoch_loss)

    # ---- Evaluation pass on the held-out split (no gradient tracking). ----
    model.eval()
    with torch.no_grad():
        valid_loss = 0.0
        for batch in test_loader:
            images, input_ids, attention_masks, structured_data, target = (t.to(device) for t in batch)
            combined_features = build_combined_features(images, input_ids, attention_masks, structured_data)

            outputs = model(combined_features)
            loss = criterion(outputs, target.view(-1, 1))

            valid_loss += loss.item() * images.size(0)

        valid_loss /= len(test_loader.dataset)
        history["test_loss"].append(valid_loss)
        print(f"Epoch {epoch + 1} Validation Loss: {valid_loss:.4f}")

print("Training completed.")

In [ ]:
import matplotlib.pyplot as plt

# Draw both per-epoch loss curves on one labelled set of axes.
fig, ax = plt.subplots()
ax.plot(history["train_loss"], label="train loss")
ax.plot(history["test_loss"], label="test loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
plt.show()