In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
import json
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pickle
from transformers import VisualBertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer

#inicializácia BiLSTM modelu
class BiLSTM_Model(nn.Module):
    def __init__(self):
        super(BiLSTM_Model, self).__init__()
        self.embedding = nn.Embedding(10000, 100)
        self.lstm1 = nn.LSTM(100, 128, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(256, 64, batch_first=True, bidirectional=True)
        self.bn = nn.BatchNorm1d(128)
        self.fc = nn.Linear(128, 64)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = x[:, -1, :]
        x = self.bn(x)
        return torch.relu(self.fc(x))

#inicializácia ResNet extraktora
class ResNetFeatureExtractor(nn.Module):
    def __init__(self):
        super().__init__()
        resnet = models.resnet50(pretrained=False)
        self.base = nn.Sequential(*list(resnet.children())[:-1])
        self.fc = nn.Linear(2048, 128)

    def forward(self, x):
        x = self.base(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

#trieda dataset
class FusionJSONLDataset(Dataset):
    def __init__(self, jsonl_path, img_root, vocab):
        self.data = [json.loads(l.strip()) for l in open(jsonl_path)]
        self.img_root = img_root
        self.vocab = vocab
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)), transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
        ])

    def text_to_seq(self, text, max_len=100):
        seq = [self.vocab.get(w, 1) for w in text.split()]
        return seq[:max_len] + [0]*(max_len - len(seq))

    def __getitem__(self, idx):
        item = self.data[idx]
        img = Image.open(os.path.join(self.img_root, item["img"])).convert("RGB")
        text = torch.tensor(self.text_to_seq(item["text"]), dtype=torch.long)
        label = torch.tensor(item["label"], dtype=torch.float32)
        return self.transform(img), text, label

    def __len__(self):
        return len(self.data)

#trénovanie s VisualBERT
def train_fusion():
    jsonl = "/content/drive/MyDrive/data/train.jsonl"
    img_root = "/content/drive/MyDrive"
    vocab_path = "/content/drive/MyDrive/tokenizer.pickle"
    resnet_path = "/content/drive/MyDrive/resnet_own.pth"
    lstm_path = "/content/drive/MyDrive/bilstm_model.pth"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #načítanie tokenizera
    with open(vocab_path, "rb") as f:
        tokenizer = pickle.load(f)
    vocab = tokenizer if isinstance(tokenizer, dict) else tokenizer.word_index

    dataset = FusionJSONLDataset(jsonl, img_root, vocab)
    loader = DataLoader(dataset, batch_size=16, shuffle=True)

    #modely obrazok a text
    resnet = ResNetFeatureExtractor().to(device)
    state = torch.load(resnet_path, map_location=device)
    state.pop("resnet.fc.weight", None)
    state.pop("resnet.fc.bias", None)
    resnet.load_state_dict(state, strict=False)
    for p in resnet.parameters():
        p.requires_grad = False
    resnet.eval()

    bilstm = BiLSTM_Model().to(device)
    bilstm.load_state_dict(torch.load(lstm_path, map_location=device), strict=False)
    for p in bilstm.parameters():
        p.requires_grad = False
    bilstm.eval()

    visualbert = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre").to(device)
    projection = nn.Linear(192, 768).to(device)

    #optimizer a stratová funkcia
    optimizer = torch.optim.Adam(
        list(projection.parameters()) + list(visualbert.parameters()), lr=2e-5
    )
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([2.0]).to(device))

    #trénovanie modelu VisualBERT
    num_epochs = 5
    for epoch in range(num_epochs):
        visualbert.train()
        running_loss = 0

        for imgs, txts, labels in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            imgs, txts, labels = imgs.to(device), txts.to(device), labels.to(device).unsqueeze(1)

            with torch.no_grad():
                img_feat = resnet(imgs)
                txt_feat = bilstm(txts)

            fused = torch.cat([img_feat, txt_feat], dim=1)
            embeds = projection(fused).unsqueeze(1)

            token_types = torch.zeros(embeds.shape[:2], dtype=torch.long).to(device)

            out = visualbert(inputs_embeds=embeds, token_type_ids=token_types)
            logits = out.last_hidden_state[:, 0, :].mean(dim=1, keepdim=True)

            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f" Epoch {epoch+1}, Loss: {running_loss / len(loader):.4f}")

    #uloženie modelu
    torch.save(visualbert.state_dict(), "/content/drive/MyDrive/visualbert_finetuned.pth")
    print("VisualBERT bol dotrénovaný a uložený.")

if __name__ == "__main__":
    train_fusion()


In [None]:
def test_fusion():
    jsonl = "/content/drive/MyDrive/data/test_seen.jsonl"
    img_root = "/content/drive/MyDrive"
    vocab_path = "/content/drive/MyDrive/tokenizer.pickle"
    resnet_path = "/content/drive/MyDrive/resnet_own.pth"
    lstm_path = "/content/drive/MyDrive/bilstm_model.pth"
    visualbert_path = "/content/drive/MyDrive/visualbert_finetuned.pth"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with open(vocab_path, "rb") as f:
        tokenizer = pickle.load(f)
    vocab = tokenizer if isinstance(tokenizer, dict) else tokenizer.word_index

    dataset = FusionJSONLDataset(jsonl, img_root, vocab)
    loader = DataLoader(dataset, batch_size=16, shuffle=False)

    resnet = ResNetFeatureExtractor().to(device)
    state = torch.load(resnet_path, map_location=device)
    state.pop("resnet.fc.weight", None)
    state.pop("resnet.fc.bias", None)
    resnet.load_state_dict(state, strict=False)
    resnet.eval()

    bilstm = BiLSTM_Model().to(device)
    bilstm.load_state_dict(torch.load(lstm_path, map_location=device), strict=False)
    bilstm.eval()

    visualbert = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre").to(device)
    visualbert.load_state_dict(torch.load(visualbert_path, map_location=device))
    visualbert.eval()

    projection = nn.Linear(192, 768).to(device)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for imgs, txts, labels in tqdm(loader, desc="Testing"):
            imgs, txts = imgs.to(device), txts.to(device)
            labels = labels.to(device).unsqueeze(1)

            img_feat = resnet(imgs)
            txt_feat = bilstm(txts)
            fused = torch.cat([img_feat, txt_feat], dim=1)
            embeds = projection(fused).unsqueeze(1)
            token_types = torch.zeros(embeds.shape[:2], dtype=torch.long).to(device)

            out = visualbert(inputs_embeds=embeds, token_type_ids=token_types)
            logits = out.last_hidden_state[:, 0, :].mean(dim=1, keepdim=True)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")

In [None]:
test_fusion()

Testing: 100%|██████████| 63/63 [04:23<00:00,  4.19s/it]

Accuracy:  0.4900
Precision: 0.4900
Recall:    1.0000
F1 Score:  0.6577





In [None]:
def test_fusion():
    jsonl = "/content/drive/MyDrive/data/test_unseen.jsonl"
    img_root = "/content/drive/MyDrive"
    vocab_path = "/content/drive/MyDrive/tokenizer.pickle"
    resnet_path = "/content/drive/MyDrive/resnet_own.pth"
    lstm_path = "/content/drive/MyDrive/bilstm_model.pth"
    visualbert_path = "/content/drive/MyDrive/visualbert_finetuned.pth"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with open(vocab_path, "rb") as f:
        tokenizer = pickle.load(f)
    vocab = tokenizer if isinstance(tokenizer, dict) else tokenizer.word_index

    dataset = FusionJSONLDataset(jsonl, img_root, vocab)
    loader = DataLoader(dataset, batch_size=16, shuffle=False)

    resnet = ResNetFeatureExtractor().to(device)
    state = torch.load(resnet_path, map_location=device)
    state.pop("resnet.fc.weight", None)
    state.pop("resnet.fc.bias", None)
    resnet.load_state_dict(state, strict=False)
    resnet.eval()

    bilstm = BiLSTM_Model().to(device)
    bilstm.load_state_dict(torch.load(lstm_path, map_location=device), strict=False)
    bilstm.eval()

    visualbert = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre").to(device)
    visualbert.load_state_dict(torch.load(visualbert_path, map_location=device))
    visualbert.eval()

    projection = nn.Linear(192, 768).to(device)

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for imgs, txts, labels in tqdm(loader, desc="Testing"):
            imgs, txts = imgs.to(device), txts.to(device)
            labels = labels.to(device).unsqueeze(1)

            img_feat = resnet(imgs)
            txt_feat = bilstm(txts)
            fused = torch.cat([img_feat, txt_feat], dim=1)
            embeds = projection(fused).unsqueeze(1)
            token_types = torch.zeros(embeds.shape[:2], dtype=torch.long).to(device)

            out = visualbert(inputs_embeds=embeds, token_type_ids=token_types)
            logits = out.last_hidden_state[:, 0, :].mean(dim=1, keepdim=True)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")

test_fusion()

Testing: 100%|██████████| 125/125 [08:50<00:00,  4.24s/it]

Accuracy:  0.3750
Precision: 0.3750
Recall:    1.0000
F1 Score:  0.5455





Jednoduchý fúzny model

In [None]:
#zamrznutie ResNet modelu
for param in resnet_model.parameters():
    param.requires_grad = False

#zamrznutie LSTM modelu
for param in lstm_model.parameters():
    param.requires_grad = False

#trénovanie iba na fúznej vrstve
for param in fusion_model.fc1.parameters():
    param.requires_grad = True
for param in fusion_model.fc2.parameters():
    param.requires_grad = True

print("ResNet a LSTM sú zamrznuté. Trénujeme iba fúznu vrstvu.")

import torch.optim as optim

#nastavenie optimalizátora
optimizer = optim.Adam([
    {"params": fusion_model.fc1.parameters()},
    {"params": fusion_model.fc2.parameters()}
], lr=0.0005)

#stratova funkcia
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([2.0]).to("cpu"))

#parametre tréningu
num_epochs = 5
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import os
from PIL import Image
import torchvision.transforms as transforms

class FusionDataset(Dataset):
    def __init__(self, csv_path, img_folder, vocab, max_length=100, transform=None):
        self.data = pd.read_csv(csv_path)
        self.img_folder = img_folder
        self.vocab = vocab
        self.max_length = max_length
        self.transform = transform if transform else transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def text_to_sequence(self, text):
        if not isinstance(text, str):
            text = ""
        sequence = [self.vocab.get(word, 1) for word in text.split()]
        return sequence[:self.max_length] + [0] * (self.max_length - len(sequence))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        #načítanie obrázka
        img_path = os.path.join(self.img_folder, row['img'])
        if not os.path.exists(img_path):
            print(f" Chýbajúci súbor: {img_path}")
            img_path = os.path.join(self.img_folder, "default.jpg")

        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)

        #tokenizácia textu
        text_sequence = self.text_to_sequence(row['text'])

        #štítky
        label = torch.tensor(row['label'], dtype=torch.float32)

        return image, torch.tensor(text_sequence, dtype=torch.long), label

#načítanie datasetu
csv_path = "/content/drive/MyDrive/data/new_merged.csv"
img_folder = "/content/drive/MyDrive/"
vocab = {}

dataset = FusionDataset(csv_path, img_folder, vocab)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f"Dataset načítaný: {len(train_dataset)} tréningových, {len(val_dataset)} validačných vzoriek")

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from collections import Counter
import pandas as pd
import os
from PIL import Image


import torch
import torch.nn as nn
import torchvision.models as models

#BiLSTM model s výstupnou črtou (return_feature=True)
class BiLSTM_Model(nn.Module):
    def __init__(self, vocab_size=10000, embedding_dim=100, max_length=100, lstm_units=64, dropout_rate=0.5):
        super(BiLSTM_Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, 128, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(256, lstm_units, batch_first=True, bidirectional=True)
        self.batch_norm = nn.BatchNorm1d(2 * lstm_units)
        self.fc1 = nn.Linear(2 * lstm_units, 64)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x, return_feature=False):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = x[:, -1, :]
        x = self.batch_norm(x)
        feat = torch.relu(self.fc1(x))
        x = self.dropout(feat)
        out = self.fc2(x)
        return feat if return_feature else out

#ResNet model bez FC vrstvy
class ResNetFeatureExtractor(nn.Module):
    def __init__(self):
        super(ResNetFeatureExtractor, self).__init__()
        resnet = models.resnet50(pretrained=False)
        self.resnet_features = nn.Sequential(*list(resnet.children())[:-1])
        self.fc = nn.Linear(2048, 128)

    def forward(self, x):
        x = self.resnet_features(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x


from torch.utils.data import Dataset
import pandas as pd
import os
from PIL import Image
import torch
import torchvision.transforms as transforms

#opravený dataset, ktorý vracia (image, text, label)
from torch.utils.data import Dataset
import os
from PIL import Image
import torch
import torchvision.transforms as transforms

class FusionDataset(Dataset):
    def __init__(self, csv_path, img_folder, vocab, max_length=100, transform=None):
        self.data = pd.read_csv(csv_path)
        self.img_folder = img_folder
        self.vocab = vocab
        self.max_length = max_length
        self.transform = transform if transform else transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def text_to_sequence(self, text):
        if not isinstance(text, str):
            text = ""
        sequence = [self.vocab.get(word, 1) for word in text.split()]
        return sequence[:self.max_length] + [0] * (self.max_length - len(sequence))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        #načítanie obrázka
        img_path = os.path.join(self.img_folder, row['img'])
        if not os.path.exists(img_path):
            print(f" Chýbajúci súbor: {img_path}")
            img_path = os.path.join(self.img_folder, "default.jpg")

        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)

        #tokenizácia textu
        text_sequence = self.text_to_sequence(row['text'])

        #štítky
        label = torch.tensor(row['label'], dtype=torch.float32)

        return image, torch.tensor(text_sequence, dtype=torch.long), label



#inicializácia modelov
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#načítanie ResNet modelu
resnet_model_path = "/content/drive/MyDrive/resnet_own.pth"
resnet_model = ResNetFeatureExtractor()
state_dict = torch.load(resnet_model_path, map_location=device)
del state_dict['resnet.fc.weight']
del state_dict['resnet.fc.bias']
resnet_model.load_state_dict(state_dict, strict=False)
resnet_model.eval()

#načítanie CNN-LSTM modelu
lstm_model_path = "/content/drive/MyDrive/bilstm_model.pth"
lstm_model = BiLSTM_Model(vocab_size=10000, embedding_dim=100, max_length=100)
lstm_model.load_state_dict(torch.load(lstm_model_path, map_location=device))
lstm_model.eval()

#zamrznutie parametrov
for param in resnet_model.parameters():
    param.requires_grad = False
for param in lstm_model.parameters():
    param.requires_grad = False

#definícia fúzneho modelu (vážený priemer)
class ResNetLSTM_Fusion(nn.Module):
    def __init__(self, resnet_model, lstm_model):
        super(ResNetLSTM_Fusion, self).__init__()
        self.resnet = resnet_model
        self.lstm = lstm_model

        #trénovateľné logit váhy pre fúziu
        self.logits = nn.Parameter(torch.tensor([0.0, 0.0]))

        self.text_proj = nn.Linear(64, 128)
        self.batch_norm = nn.BatchNorm1d(128)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, img_input, text_input):
        with torch.no_grad():
            img_feat = self.resnet(img_input)
            text_feat = self.lstm(text_input, return_feature=True)

        text_feat = self.text_proj(text_feat)

        #softmax z logitov, čiže normované váhy (0, 1), alpha + beta = 1
        weights = torch.softmax(self.logits, dim=0)
        alpha, beta = weights[0], weights[1]

        fusion_feat = alpha * img_feat + beta * text_feat
        fusion_feat = self.batch_norm(fusion_feat)
        fusion_feat = torch.relu(self.fc1(fusion_feat))
        output = self.fc2(fusion_feat)
        return output

#inicializácia fúzneho modelu
fusion_model = ResNetLSTM_Fusion(resnet_model, lstm_model).to(device)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, fusion_model.parameters()), lr=0.0005)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([2.0]).to(device))

#trénovanie iba fúznej vrstvy
num_epochs = 10
for epoch in range(num_epochs):
    fusion_model.train()
    running_loss = 0.0
    correct_train, total_train = 0, 0

    for images, text_inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
        images, text_inputs, labels = images.to(device), text_inputs.to(device), labels.to(device).float().unsqueeze(1)
        optimizer.zero_grad()
        outputs = fusion_model(images, text_inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f" Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

#uloženie modelu
torch.save(fusion_model.state_dict(), "/content/drive/MyDrive/fusion_model.pth")
print(" Fúzny model bol uložený.")

Training Epoch 1/10: 100%|██████████| 304/304 [05:42<00:00,  1.13s/it]


 Epoch [1/10], Loss: 0.9468


Training Epoch 2/10: 100%|██████████| 304/304 [03:25<00:00,  1.48it/s]


 Epoch [2/10], Loss: 0.9415


Training Epoch 3/10: 100%|██████████| 304/304 [03:21<00:00,  1.51it/s]


 Epoch [3/10], Loss: 0.9422


Training Epoch 4/10: 100%|██████████| 304/304 [03:21<00:00,  1.51it/s]


 Epoch [4/10], Loss: 0.9410


Training Epoch 5/10: 100%|██████████| 304/304 [03:23<00:00,  1.50it/s]


 Epoch [5/10], Loss: 0.9402


Training Epoch 6/10: 100%|██████████| 304/304 [03:27<00:00,  1.46it/s]


 Epoch [6/10], Loss: 0.9393


Training Epoch 7/10: 100%|██████████| 304/304 [03:29<00:00,  1.45it/s]


 Epoch [7/10], Loss: 0.9387


Training Epoch 8/10: 100%|██████████| 304/304 [03:25<00:00,  1.48it/s]


 Epoch [8/10], Loss: 0.9399


Training Epoch 9/10: 100%|██████████| 304/304 [03:24<00:00,  1.48it/s]


 Epoch [9/10], Loss: 0.9380


Training Epoch 10/10: 100%|██████████| 304/304 [03:25<00:00,  1.48it/s]


 Epoch [10/10], Loss: 0.9373
 Fúzny model bol uložený.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def test_fusion_model(fusion_model, test_loader, device):
    fusion_model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, text_inputs, labels in tqdm(test_loader, desc="Testing"):
            images = images.to(device)
            text_inputs = text_inputs.to(device)
            labels = labels.to(device).unsqueeze(1)

            outputs = fusion_model(images, text_inputs)
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).float()

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Výpočty metrík
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds)
    rec = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"\n--- Evaluation Metrics ---")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 Score : {f1:.4f}")

    # Zobrazenie naučených váh alpha a beta
    weights = torch.softmax(fusion_model.logits, dim=0)
    print(f"\nLearned weights -> alpha (image): {weights[0].item():.4f}, beta (text): {weights[1].item():.4f}")

In [None]:
class FusionJSONLDataset(Dataset):
    def __init__(self, jsonl_path, img_root, vocab, max_length=100, transform=None):
        self.data = [json.loads(line.strip()) for line in open(jsonl_path)]
        self.img_root = img_root
        self.vocab = vocab
        self.max_length = max_length
        self.transform = transform if transform else transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def text_to_sequence(self, text):
        if not isinstance(text, str):
            text = ""
        sequence = [self.vocab.get(word, 1) for word in text.split()]
        return sequence[:self.max_length] + [0] * (self.max_length - len(sequence))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        img_path = os.path.join(self.img_root, item["img"])
        if not os.path.exists(img_path):
            print(f"Missing image: {img_path}")
            img_path = os.path.join(self.img_root, "default.jpg")

        image = Image.open(img_path).convert("RGB")
        image = self.transform(image)

        text_sequence = self.text_to_sequence(item["text"])
        label = torch.tensor(item["label"], dtype=torch.float32)

        return image, torch.tensor(text_sequence, dtype=torch.long), label

In [None]:
#načítanie vocab
import pickle
with open("/content/drive/MyDrive/tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)
vocab = tokenizer if isinstance(tokenizer, dict) else tokenizer.word_index

#dataloader
test_jsonl = "/content/drive/MyDrive/data/test_unseen.jsonl"
img_root = "/content/drive/MyDrive"

test_dataset = FusionJSONLDataset(test_jsonl, img_root, vocab)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
#načítanie modelu
fusion_model = ResNetLSTM_Fusion(resnet_model, lstm_model).to(device)
fusion_model.load_state_dict(torch.load("/content/drive/MyDrive/fusion_model.pth", map_location=device))

#spustenie testovania
test_fusion_model(fusion_model, test_loader, device)

Testing: 100%|██████████| 125/125 [00:40<00:00,  3.09it/s]


--- Evaluation Metrics ---
Accuracy : 0.4980
Precision: 0.4072
Recall   : 0.7427
F1 Score : 0.5260

Learned weights -> alpha (image): 0.5500, beta (text): 0.4500





In [None]:
#načítanie modelu
fusion_model = ResNetLSTM_Fusion(resnet_model, lstm_model).to(device)
fusion_model.load_state_dict(torch.load("/content/drive/MyDrive/fusion_model.pth", map_location=device))

#spustenie testovania
test_fusion_model(fusion_model, test_loader, device)

Testing: 100%|██████████| 63/63 [00:23<00:00,  2.68it/s]


--- Evaluation Metrics ---
Accuracy : 0.5140
Precision: 0.5027
Recall   : 0.7551
F1 Score : 0.6036

Learned weights -> alpha (image): 0.5500, beta (text): 0.4500





In [None]:

print(fusion_model)

ResNetLSTM_Fusion(
  (resnet): ResNetFeatureExtractor(
    (resnet_features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu

In [None]:
!pip install torchsummary

