In [None]:
import pandas as pd
import timm
import torch
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import AutoTokenizer, AutoModelForMaskedLM

from core.src.constants import IMAGES_PATH

In [None]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load both splits, forcing `unique_id` to be parsed as text instead of a number.
df_train = pd.read_csv("rocar_train.csv", dtype=dict(unique_id=str))
df_test = pd.read_csv("rocar_test.csv", dtype=dict(unique_id=str))

In [None]:
# Preview the first rows of the training frame (shown as this cell's output).
df_train.head()

In [None]:
# Print the training frame's column dtypes and non-null counts to spot missing values.
df_train.info()

In [None]:
# Image branch: pretrained FastViT-T8 created with num_classes=0, moved to the
# selected device and switched to evaluation mode.
fastvit_model = timm.create_model("fastvit_t8.apple_in1k", pretrained=True, num_classes=0)
fastvit_model = fastvit_model.to(device).eval()

# Inference-time preprocessing (resize / normalisation) resolved from the model itself.
data_config = timm.data.resolve_model_data_config(fastvit_model)
transforms = timm.data.create_transform(is_training=False, **data_config)

# Text branch: tokenizer from the public Romanian BERT, weights from the local
# "checkpoint-2250" directory. Hidden states are requested so they can be read
# from the model outputs later on.
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
bert_model = AutoModelForMaskedLM.from_pretrained("checkpoint-2250").to(device)
bert_model.config.output_hidden_states = True

# Hand cached-but-unused GPU memory back before the heavier cells run.
torch.cuda.empty_cache()

In [None]:
import emoji
import re


# Compiled once at definition time; `replace_patterns` is called for every row.
# The TLD class is `[A-Za-z]`: a `|` inside a character class is a literal pipe,
# not alternation, and would let the match run across pipe-separated addresses.
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
_PHONE_RE = re.compile(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b")
_HTML_TAG_RE = re.compile(r"<.*?>")


def replace_patterns(text: str):
    """Mask e-mail addresses, phone numbers and HTML tags with placeholder tokens.

    Substitutions run in a fixed order: e-mails become ``[EMAIL]``, then
    10-digit phone numbers (optionally separated by ``-`` or ``.`` as 3-3-4)
    become ``[TEL]``, then anything between ``<`` and the next ``>`` on the
    same line becomes ``[HTML]``.

    Args:
        text: Raw text to clean.

    Returns:
        The text with every match replaced by its placeholder.
    """
    text = _EMAIL_RE.sub("[EMAIL]", text)
    text = _PHONE_RE.sub("[TEL]", text)
    text = _HTML_TAG_RE.sub("[HTML]", text)
    return text


def replace_emojis(text: str):
    """Rewrite emojis in ``text`` as their names wrapped in square brackets.

    Delegates to ``emoji.demojize`` with ``[`` and ``]`` as the delimiters.
    """
    square_brackets = ("[", "]")
    demojized = emoji.demojize(text, delimiters=square_brackets)
    return demojized


def preprocess_text(text: str):
    """Clean one text: mask e-mail/phone/HTML patterns first, then rewrite emojis."""
    return replace_emojis(replace_patterns(text))


# Clean the "input" text column of both splits with the same preprocessing.
for frame in (df_train, df_test):
    frame["input"] = frame["input"].apply(preprocess_text)

In [None]:
# Tokenise each split from its OWN text column. The test encodings were
# previously built from df_train, which paired test images and features with
# training text in the test dataset.
train_encodings = tokenizer(df_train["input"].tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(df_test["input"].tolist(), padding=True, truncation=True, max_length=512)

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image


class MultimodalDataset(Dataset):
    """Per-listing samples: image tensor, token ids, attention mask, structured features.

    The structured columns are converted to one float32 matrix up front, in
    ``__init__``. Non-numeric columns are replaced by integer category codes
    computed over the whole frame; encoding a single row at a time cannot
    produce meaningful codes. Missing or unknown categories get code ``-1``.

    Args:
        df: Frame holding ``unique_id`` plus every column in ``STRUCTURED_COLUMNS``.
        encodings: Mapping with ``"input_ids"`` and ``"attention_mask"`` sequences,
            aligned positionally with the rows of ``df``.
        categories: Optional ``{column: categories}`` mapping that fixes the code
            assignment. Pass another dataset's ``categories`` attribute (e.g. the
            training set's) so both datasets share the same codes. When omitted,
            categories are inferred from ``df`` alone, so two datasets built
            independently may assign different codes to the same value.

    Attributes:
        categories: ``{column: pandas.Index}`` of the categories used per encoded column.
        structured: float32 tensor with one row per sample and one column per
            entry of ``STRUCTURED_COLUMNS``.
    """

    STRUCTURED_COLUMNS = [
        "km",
        "putere",
        "capacitate cilindrica",
        "anul producției",
        "marca",
        "model",
        "combustibil",
        "tip caroserie",
        "firma",
        "is_automatic",
    ]

    def __init__(self, df: pd.DataFrame, encodings, categories=None):
        self.df = df
        self.encodings = encodings

        # Work on a copy so the caller's frame is never modified.
        features = df[self.STRUCTURED_COLUMNS].copy()
        self.categories = {}
        for col in self.STRUCTURED_COLUMNS:
            # Numeric (and boolean) columns are cast to float32 as they are.
            if pd.api.types.is_numeric_dtype(features[col]):
                continue
            known = None if categories is None else categories.get(col)
            encoded = pd.Categorical(features[col], categories=known)
            self.categories[col] = encoded.categories
            features[col] = encoded.codes

        self.structured = torch.tensor(features.to_numpy(dtype="float32"))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, input_ids, attention_mask, structured_features)`` for row ``idx``.

        ``idx`` is positional, so the dataset also works when ``df`` does not
        carry a default 0..n-1 index. The image is read from
        ``IMAGES_PATH / "<unique_id>/00.png"`` and passed through the
        module-level ``transforms``.
        """
        img_name = self.df["unique_id"].iloc[idx] + "/00.png"
        # The context manager closes the file handle once the pixels are decoded.
        with Image.open(IMAGES_PATH / img_name) as raw_image:
            image = transforms(raw_image.convert("RGB"))

        input_ids = torch.tensor(self.encodings["input_ids"][idx])
        attention_mask = torch.tensor(self.encodings["attention_mask"][idx])

        return image, input_ids, attention_mask, self.structured[idx]


# Wrap each split in a dataset, then in a loader. Only the training loader
# shuffles; the test loader keeps the frame's row order.
train_dataset = MultimodalDataset(df=df_train, encodings=train_encodings)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

test_dataset = MultimodalDataset(df=df_test, encodings=test_encodings)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)

In [None]:
# Sanity check: embed every training batch with both frozen encoders and print
# the shape of the concatenated feature matrix.
for images, input_ids, attention_masks, structured_data in train_loader:
    images = images.to(device)
    input_ids = input_ids.to(device)
    attention_masks = attention_masks.to(device)
    structured_data = structured_data.to(device)

    # Both encoders are used purely for feature extraction, so no gradients are tracked.
    with torch.no_grad():
        image_embeddings = fastvit_model(images)

        outputs = bert_model(input_ids=input_ids, attention_mask=attention_masks)
        text_embeddings = outputs.hidden_states[-1]

        # Average over real tokens only. The texts are padded to a common
        # length, so a plain mean over dim=1 would be diluted by padding.
        token_weights = attention_masks.unsqueeze(-1).to(text_embeddings.dtype)
        # clamp guards against division by zero for an all-padding row.
        token_counts = token_weights.sum(dim=1).clamp(min=1)
        mean_text_embeddings = (text_embeddings * token_weights).sum(dim=1) / token_counts

    # Combine embeddings: one row per sample, features side by side.
    combined_features = torch.cat([image_embeddings, mean_text_embeddings, structured_data], dim=1)

    print(combined_features.shape)