In [None]:
import pandas as pd
import timm
import torch
from PIL import Image
from torch import nn
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import numpy as np

from core.src.constants import (
    IMAGES_PATH,
    TRAIN_DATA_CSV,
    TEST_DATA_CSV,
    TRAIN_IMAGE_FEATURES_PATH,
    TEST_IMAGE_FEATURES_PATH,
    TRAIN_TEXT_FEATURES_PATH,
    TEST_TEXT_FEATURES_PATH,
    FINE_TUNED_BERT_MODEL_PATH,
    FINE_TUNED_FASTVIT_MODEL_PATH,
)

In [None]:
# Read both CSV splits the same way, parsing `unique_id` as a string column.
df_train, df_test = (
    pd.read_csv(csv_path, dtype={"unique_id": str})
    for csv_path in (TRAIN_DATA_CSV, TEST_DATA_CSV)
)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Release cached GPU memory before the models are created in the cells below.
torch.cuda.empty_cache()

# MODELS

## FASTVIT

In [None]:
# Build the FastViT-T8 backbone; num_classes=0 asks timm for a model without
# its classifier layer.
fastvit = timm.create_model("fastvit_t8.apple_in1k", pretrained=True, num_classes=0)

# Replace the head with a single-output stack (pool -> flatten -> linear).
# NOTE(review): presumably this mirrors the head used during fine-tuning so the
# checkpoint keys line up with the module — confirm against the training code.
fastvit.head = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(fastvit.num_features, 1))

# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
fastvit.load_state_dict(torch.load(FINE_TUNED_FASTVIT_MODEL_PATH, map_location=device))

# Inference only: switch to eval mode and move the weights to the target device.
fastvit = fastvit.eval()
fastvit.to(device)

# Build the evaluation-time preprocessing pipeline from the model's data config.
data_config = timm.data.resolve_model_data_config(fastvit)
transforms = timm.data.create_transform(**data_config, is_training=False)


class FastViTEmbedding(nn.Module):
    """Expose a model's pre-head features as one flat vector per input.

    The wrapped ``model`` must provide ``forward_features``. Its output is
    average-pooled to a 1x1 spatial size and then flattened per sample, so the
    model's own head is never applied.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """Return the pooled, flattened features for the batch ``x``."""
        feature_maps = self.model.forward_features(x)
        pooled = self.pool(feature_maps)
        # Keep the batch dimension and collapse every remaining dimension.
        batch_size = pooled.size(0)
        return pooled.reshape(batch_size, -1)


fastvit_model = FastViTEmbedding(fastvit).to(device)
fastvit_model.eval()

## BERT

In [None]:
# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "dumitrescustefan/bert-base-romanian-uncased-v1"

tokenizer = AutoTokenizer.from_pretrained(
    BERT_CHECKPOINT,
    do_lower_case=True,
    add_special_tokens=True,
    max_length=512,
    padding=True,
    truncation=True,
)

# Load the bare encoder and place it on the selected device.
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)
bert_model.to(device)


class BERTRegressor(nn.Module):
    """BERT encoder with a linear regression layer, used to extract embeddings.

    ``fc`` is registered so that this module has the same parameters as a
    checkpoint containing a regression layer (presumably the fine-tuned
    regressor — confirm against the training code). ``forward`` deliberately
    does not apply ``fc``: it returns the encoder's pooled output.

    Args:
        bert: Encoder module to wrap. When omitted, the notebook-level
            ``bert_model`` is used, which keeps ``BERTRegressor()`` working.
            Pass the encoder explicitly if ``bert_model`` has since been
            rebound to something other than the bare encoder.
    """

    def __init__(self, bert=None):
        super().__init__()
        self.bert = bert if bert is not None else bert_model
        # Size the layer from the encoder's config when available; 768 is the
        # value that was previously hard-coded here.
        hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the encoder's second output for the given token batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            Element 1 of the encoder output. For a Hugging Face BERT model this
            is the pooled output (derived from the first token), not the raw
            [CLS] hidden state.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs[1]


# From here on `bert_model` refers to the regressor wrapper, not the bare encoder.
bert_model = BERTRegressor().to(device)

# map_location remaps the stored tensors onto the selected device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
bert_model.load_state_dict(torch.load(FINE_TUNED_BERT_MODEL_PATH, map_location=device))
bert_model.eval()

## Prepare images and text inputs

In [None]:
# One PNG path per row, named after that row's `unique_id`.
train_images = [IMAGES_PATH / f"{image_id}.png" for image_id in df_train["unique_id"].values]
test_images = [IMAGES_PATH / f"{image_id}.png" for image_id in df_test["unique_id"].values]

In [None]:
# Tokenize the `input` column of each split with identical settings
# (padding, truncation, and a 512-token cap).
train_encodings, test_encodings = (
    tokenizer(frame["input"].tolist(), padding=True, truncation=True, max_length=512)
    for frame in (df_train, df_test)
)

In [None]:
def compute_image_features(images, model, transforms, batch_size=16):
    """Run ``model`` over image files in batches and return the stacked outputs.

    Args:
        images: Sequence of image file paths; must support ``len`` and slicing.
            An empty sequence is not supported, because ``torch.cat`` needs at
            least one batch.
        model: Module called on each stacked batch. It is put into eval mode.
        transforms: Callable that turns a PIL image into a tensor.
        batch_size: Number of images per forward pass (default 16).

    Returns:
        NumPy array containing the model outputs for all images, concatenated
        along the first axis in input order. Its shape is also printed.
    """
    # Switch the model that was passed in (not a notebook global) to eval mode.
    model.eval()
    features = []
    for start in tqdm(range(0, len(images), batch_size)):
        inputs = []
        for image_path in images[start : start + batch_size]:
            # The context manager closes the file handle once the image has
            # been transformed. convert("RGB") gives every image three
            # channels regardless of how the PNG was stored.
            with Image.open(image_path) as image:
                inputs.append(transforms(image.convert("RGB")))
        inputs = torch.stack(inputs).to(device)
        with torch.no_grad():
            # Move results to the CPU right away so GPU memory is not held
            # across batches.
            features.append(model(inputs).cpu())
    features_numpy = torch.cat(features, dim=0).numpy()
    print(features_numpy.shape)
    return features_numpy


def compute_text_features(encodings, model, batch_size=16):
    """Run ``model`` over tokenized text in batches and return the stacked outputs.

    Args:
        encodings: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries that support ``len`` and slicing. Each slice is passed to
            ``torch.tensor``, so rows are assumed to be padded to equal length
            — confirm against the tokenizer call.
        model: Module called as ``model(input_ids, attention_mask=...)``. It is
            put into eval mode, matching ``compute_image_features``.
        batch_size: Number of rows per forward pass (default 16).

    Returns:
        NumPy array containing the model outputs for all rows, concatenated
        along the first axis in input order. Its shape is also printed.
    """
    # Feature extraction must not depend on the caller having set eval mode.
    model.eval()
    all_input_ids = encodings["input_ids"]
    all_attention_masks = encodings["attention_mask"]
    features = []
    for start in tqdm(range(0, len(all_input_ids), batch_size)):
        stop = start + batch_size
        input_ids = torch.tensor(all_input_ids[start:stop]).to(device)
        attention_mask = torch.tensor(all_attention_masks[start:stop]).to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            # Move results to the CPU right away so GPU memory is not held
            # across batches.
            features.append(outputs.cpu())
    features_numpy = torch.cat(features, dim=0).numpy()
    print(features_numpy.shape)
    return features_numpy

In [None]:
# Extract and persist the image features for the training split first...
train_images_features = compute_image_features(
    images=train_images,
    model=fastvit_model,
    transforms=transforms,
)
np.save(file=TRAIN_IMAGE_FEATURES_PATH, arr=train_images_features)

# ...then do the same for the test split.
test_image_features = compute_image_features(
    images=test_images,
    model=fastvit_model,
    transforms=transforms,
)
np.save(file=TEST_IMAGE_FEATURES_PATH, arr=test_image_features)

In [None]:
# Extract and persist the text features for the training split first...
train_text_features = compute_text_features(encodings=train_encodings, model=bert_model)
np.save(file=TRAIN_TEXT_FEATURES_PATH, arr=train_text_features)

# ...then do the same for the test split.
test_text_features = compute_text_features(encodings=test_encodings, model=bert_model)
np.save(file=TEST_TEXT_FEATURES_PATH, arr=test_text_features)