In [None]:
from google.colab import drive
# Flush and unmount Google Drive first, then mount it at /content/drive.
drive.flush_and_unmount()
drive.mount('/content/drive')


Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
import pandas as pd
# Load the merged dataset from Google Drive.
csv_path = "/content/drive/MyDrive/data/new_merged.csv"

df = pd.read_csv(csv_path)

# Preview the first rows of the data.
print("Prvých 5 riadkov dát:")
print(df.head())

# These module-level lists are consumed by the training cell further down.
texts = df["text"].tolist()          # text data
image_paths = df["img"].tolist()     # image paths
labels = df["label"].tolist()        # labels (0 = non-toxic, 1 = toxic)

# Report how many values were loaded for each column.
print("Načítaných textov:", len(texts))
print("Načítaných ciest k obrázkom:", len(image_paths))
print("Načítaných labelov:", len(labels))


Prvých 5 riadkov dát:
   Unnamed: 0.1  Unnamed: 0     id  \
0             0           0  42953   
1             1           1  23058   
2             2           2  13894   
3             3           3  37408   
4             4           4  82403   

                                            Fulltext  \
0  its their character not their color that matte...   
1  don't be afraid to love again everyone is not ...   
2  putting bows on your pet  [[], ['Korat', 'Russ...   
3  i love everything and everybody! except for sq...   
4  everybody loves chocolate chip cookies, even h...   

                                                text            img  label  \
0   its their character not their color that matters  img/42953.png      0   
1  don't be afraid to love again everyone is not ...  img/23058.png      0   
2                           putting bows on your pet  img/13894.png      0   
3  i love everything and everybody! except for sq...  img/37408.png      0   
4  everybody loves cho

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
import os
import cv2
import numpy as np
from datetime import datetime

# Training configuration.
target_size = (224, 224)  # size passed to cv2.resize in ImageDataset
batch_size = 16
num_classes = 2  # NOTE(review): not referenced anywhere in the visible code
epochs = 10  # upper bound of the epoch loop in train_model
checkpoint_dir = "/content/drive/MyDrive/checkp_eff/"
# NOTE(review): the file name says "resnet" although create_model builds EfficientNet-B0.
saved_model_path = "/content/drive/MyDrive/resnet_trained_model.pth"


# Make sure the checkpoint directory exists before any checkpoint is written.
os.makedirs(checkpoint_dir, exist_ok=True)

# Image loading
class ImageDataset(Dataset):
    """Dataset that reads images from Google Drive with OpenCV.

    Args:
        image_paths: Sequence of image paths; each one is appended to
            "/content/drive/MyDrive/" to form the file path that is read.
        labels: Sequence of numeric labels, index-aligned with ``image_paths``.
        transform: Optional callable applied to the image tensor after it has
            been resized, scaled to [0, 1] and rearranged to (C, H, W).
            Previously this argument was stored but silently ignored.

    Each item is a tuple ``(image, label)`` of float32 tensors.

    Raises:
        FileNotFoundError: from ``__getitem__`` when OpenCV cannot read the file.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = "/content/drive/MyDrive/" + self.image_paths[idx]
        # cv2.imread returns None instead of raising when the file is missing or unreadable.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Image not found: {img_path}")

        # NOTE(review): OpenCV loads channels in BGR order and no conversion is done
        # here; left unchanged so training and evaluation keep seeing the same input.
        img = cv2.resize(img, target_size)
        img = img.astype(np.float32) / 255.0  # scale pixel values to [0, 1]
        img = np.transpose(img, (2, 0, 1))  # (H, W, C) -> (C, H, W)
        img = torch.tensor(img, dtype=torch.float32)

        # Apply the caller-supplied transform (e.g. augmentation or normalisation), if any.
        if self.transform is not None:
            img = self.transform(img)

        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return img, label

# EfficientNet-B0 model definition (training variant)
def create_model():
    """Build an EfficientNet-B0 with a single-logit binary classification head.

    The head deliberately has NO final ``nn.Sigmoid``: training in this cell uses
    ``nn.BCEWithLogitsLoss``, which applies the sigmoid itself. With a Sigmoid in
    the model the activation was applied twice, which squashes the values fed to
    the loss into a narrow range and weakens the gradients.

    ``nn.Sigmoid`` has no parameters, so the ``state_dict`` keys are unchanged.
    Checkpoints therefore still load into a model whose head ends in
    ``nn.Sigmoid``. At inference time the raw outputs must be passed through a
    sigmoid before being compared against a probability threshold.

    Returns:
        torch.nn.Module: the model, producing one raw logit per sample.
    """
    model = models.efficientnet_b0(pretrained=True)
    # Reuse the input width of the original final linear layer for the new head.
    num_features = model.classifier[1].in_features
    model.classifier = nn.Sequential(
        nn.Linear(num_features, 128),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(128, 1),  # raw logit; the sigmoid lives in BCEWithLogitsLoss
    )
    return model

def generate_checkpoint_name():
    """Return a checkpoint file path inside ``checkpoint_dir`` stamped with the current time."""
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = "resnet_checkpoint_{}.pth".format(stamp)
    return os.path.join(checkpoint_dir, filename)

# Locate the most recent checkpoint
def get_latest_checkpoint():
    """Return the path of the newest ``.pth`` file in ``checkpoint_dir``.

    Files are ranked by ``os.path.getctime``. Returns ``None`` when the
    directory does not exist or holds no ``.pth`` files.
    """
    if os.path.exists(checkpoint_dir):
        candidates = []
        for entry in os.listdir(checkpoint_dir):
            if entry.endswith(".pth"):
                candidates.append(os.path.join(checkpoint_dir, entry))

        if candidates:
            newest = max(candidates, key=os.path.getctime)
            print(f"Latest checkpoint found: {newest}")
            return newest

    return None

# Training loop
def train_model(model, dataloader, criterion, optimizer, device, start_epoch=0):
    """Train ``model`` from ``start_epoch`` up to the module-level ``epochs``.

    After every epoch the mean batch loss is printed and a checkpoint holding the
    1-based epoch number plus the model and optimizer state dicts is written to
    the path returned by ``generate_checkpoint_name()``.

    Args:
        model: Network to optimise; it is moved to ``device``.
        dataloader: Iterable of ``(images, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer bound to the model parameters.
        device: Device the model and batches are moved to.
        start_epoch: Zero-based epoch index to start from.

    Returns:
        The same ``model`` object after training.
    """
    model.to(device)

    for epoch_idx in range(start_epoch, epochs):
        epoch_number = epoch_idx + 1
        model.train()
        loss_total = 0.0

        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            # Labels arrive as (batch,); add a trailing axis to line up with the model output.
            batch_labels = batch_labels.to(device).unsqueeze(1)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_images), batch_labels)
            batch_loss.backward()
            optimizer.step()

            loss_total += batch_loss.item()

        mean_loss = loss_total / len(dataloader)
        print(f"Epoch {epoch_number}/{epochs}, Loss: {mean_loss}")

        # Save a checkpoint at the end of every epoch.
        checkpoint_path = generate_checkpoint_name()
        state = {
            'epoch': epoch_number,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }
        torch.save(state, checkpoint_path)
        print(f"Checkpoint saved: {checkpoint_path}")

    return model

# Resume from the latest checkpoint (if any), train, and save the final weights
def load_or_train_model(image_paths, labels):
    """Train the image classifier, resuming from the newest checkpoint if one exists.

    Args:
        image_paths: Image paths handed to ``ImageDataset``.
        labels: Binary labels (0 / 1), index-aligned with ``image_paths``.

    Returns:
        The trained model. Its ``state_dict`` is also written to ``saved_model_path``.

    Raises:
        ValueError: if ``labels`` contains no sample with label 1. The positive-class
            weight ``count_0 / count_1`` would otherwise fail with a bare
            ``ZeroDivisionError``, and only after the dataset and model were built.
    """
    # Validate the class balance first so a bad label list fails fast,
    # before any dataset, model or optimizer is created.
    count_0 = sum(1 for lab in labels if lab == 0)
    count_1 = sum(1 for lab in labels if lab == 1)
    if count_1 == 0:
        raise ValueError(
            "Cannot compute pos_weight: no samples with label 1 were found in labels."
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataset = ImageDataset(image_paths, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            num_workers=2, pin_memory=True)
    model = create_model().to(device)

    # Weight for the positive class in BCEWithLogitsLoss: ratio of negatives to positives.
    pos_weight = torch.tensor([count_0 / count_1], dtype=torch.float32).to(device)

    # NOTE: BCEWithLogitsLoss applies the sigmoid internally, so it expects the
    # model to emit raw logits rather than probabilities.
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    latest_checkpoint = get_latest_checkpoint()
    start_epoch = 0

    if latest_checkpoint:
        print(f"Loading latest checkpoint: {latest_checkpoint}")
        checkpoint = torch.load(latest_checkpoint, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # The stored value is the number of completed epochs, i.e. the next zero-based epoch index.
        start_epoch = checkpoint['epoch']
        print(f"Resuming training from epoch {start_epoch}")
    else:
        print("No checkpoint found. Training new model from scratch.")

    model = train_model(model, dataloader, criterion, optimizer, device, start_epoch=start_epoch)
    torch.save(model.state_dict(), saved_model_path)
    print("Final model saved!")
    return model


# Train (or resume training) using the notebook-level image_paths / labels lists.
model = load_or_train_model(image_paths, labels)


No checkpoint found. Training new model from scratch.
Epoch 1/10, Loss: 0.6935769949968045
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-204906.pth
Epoch 2/10, Loss: 0.6765595571352885
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-205932.pth
Epoch 3/10, Loss: 0.6927596063185961
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-210953.pth
Epoch 4/10, Loss: 0.7025669389046155
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-212013.pth
Epoch 5/10, Loss: 0.6955604969691007
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-213034.pth
Epoch 6/10, Loss: 0.6952146498056558
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-214053.pth
Epoch 7/10, Loss: 0.6916491251725417
Checkpoint saved: /content/drive/MyDrive/checkp_eff/resnet_checkpoint_20250323-215105.pth
Epoch 8/10, Loss: 0.6784631907939911
Checkpoint saved: /c

In [None]:
import torch
import torch.nn as nn
from torchvision import models
from torch.utils.data import Dataset, DataLoader
import os
import cv2
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import json
from collections import Counter
import random

# Evaluation configuration.
# NOTE(review): the training cell writes checkpoints under .../checkp_eff/, while this
# path points to .../eff/ — confirm the checkpoint file really lives there.
model_path = "/content/drive/MyDrive/eff/resnet_checkpoint_20250323-222153.pth"
batch_size = 16
target_size = (224, 224)  # size passed to cv2.resize in ImageDataset

# Dataset class
# NOTE(review): this duplicates the ImageDataset defined in the training cell; keep the
# preprocessing of both copies identical so training and evaluation inputs match.
class ImageDataset(Dataset):
    """Dataset that reads images from Google Drive with OpenCV.

    Args:
        image_paths: Sequence of image paths; each one is appended to
            "/content/drive/MyDrive/" to form the file path that is read.
        labels: Sequence of numeric labels, index-aligned with ``image_paths``.
        transform: Optional callable applied to the image tensor after it has
            been resized, scaled to [0, 1] and rearranged to (C, H, W).
            Previously this argument was stored but silently ignored.

    Each item is a tuple ``(image, label)`` of float32 tensors.

    Raises:
        FileNotFoundError: from ``__getitem__`` when OpenCV cannot read the file.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = "/content/drive/MyDrive/" + self.image_paths[idx]
        # cv2.imread returns None instead of raising when the file is missing or unreadable.
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Image not found: {img_path}")
        # NOTE(review): OpenCV loads channels in BGR order and no conversion is done here.
        img = cv2.resize(img, target_size)
        img = img.astype(np.float32) / 255.0  # scale pixel values to [0, 1]
        img = np.transpose(img, (2, 0, 1))  # (H, W, C) -> (C, H, W)
        img = torch.tensor(img, dtype=torch.float32)
        # Apply the caller-supplied transform, if any.
        if self.transform is not None:
            img = self.transform(img)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return img, label

# EfficientNet-B0 model definition (evaluation variant)
def create_model():
    """Build an EfficientNet-B0 (no pretrained weights) with a sigmoid binary head.

    The replacement classifier is Linear -> ReLU -> Dropout(0.5) -> Linear -> Sigmoid,
    so the model returns one value in [0, 1] per sample.
    """
    net = models.efficientnet_b0(pretrained=False)
    # Input width of the stock final linear layer, reused for the new head.
    in_features = net.classifier[1].in_features
    head_layers = [
        nn.Linear(in_features, 128),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(128, 1),
        nn.Sigmoid(),
    ]
    net.classifier = nn.Sequential(*head_layers)
    return net

# Evaluation routine
def test_model(image_paths, labels):
    """Evaluate the checkpoint at ``model_path`` on the given images and print metrics.

    The model from ``create_model()`` is loaded with the checkpoint's
    ``model_state_dict``, run in eval mode without gradients, and its outputs are
    thresholded at 0.5. Accuracy, precision, recall and F1 are printed, followed
    by the per-class counts of true and predicted labels. Nothing is returned.

    Args:
        image_paths: Image paths handed to ``ImageDataset``.
        labels: Ground-truth labels, index-aligned with ``image_paths``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    eval_set = ImageDataset(image_paths, labels)
    eval_loader = DataLoader(eval_set, batch_size=batch_size, shuffle=False)

    net = create_model().to(device)

    print(f"Načítava sa checkpoint: {model_path}")
    state = torch.load(model_path, map_location=device)
    net.load_state_dict(state["model_state_dict"])
    net.eval()

    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch_images, batch_targets in tqdm(eval_loader, desc="Testovanie"):
            batch_images = batch_images.to(device)
            batch_targets = batch_targets.to(device).unsqueeze(1)

            # Outputs above 0.5 count as class 1.
            batch_preds = (net(batch_images) > 0.5).float()

            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(batch_targets.cpu().numpy())

    # Compute every metric before printing anything, in the original order.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    acc, prec, rec, f1 = [scorer(all_labels, all_preds) for scorer in scorers]

    print("\n Výsledky testovania:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    # Each collected row is a one-element array; tally its single value as an int class id.
    pred_counts = Counter(int(row[0]) for row in all_preds)
    label_counts = Counter(int(row[0]) for row in all_labels)

    print("\n Počet vzoriek podľa tried:")
    sections = (
        (" Skutočné triedy:", label_counts),
        (" Predikované triedy:", pred_counts),
    )
    for heading, counts in sections:
        print(heading)
        for cls in sorted(counts):
            print(f"  Trieda {cls}: {counts[cls]}")

# Load the unseen test split (no class balancing is performed here)
def load_unseen_test_data(unseen_path="/content/drive/MyDrive/data/test_unseen.jsonl"):
    """Read a JSON-Lines test file and return its image paths and labels.

    Every non-blank line must be a JSON object with at least the keys ``"img"``
    and ``"label"``. The per-class label counts are printed as a sanity check.

    Args:
        unseen_path: Path of the ``.jsonl`` file. Defaults to the unseen test
            split on Google Drive, so existing zero-argument calls are unchanged.

    Returns:
        tuple[list, list]: ``(image_paths, labels)`` in file order.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default locale.
    with open(unseen_path, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which would otherwise make json.loads raise.
        records = [json.loads(line) for line in f if line.strip()]

    image_paths = [rec["img"] for rec in records]
    labels = [rec["label"] for rec in records]

    counts = Counter(labels)
    print(" Triedy v test_unseen:")
    for k in sorted(counts):
        print(f"  Trieda {k}: {counts[k]}")

    return image_paths, labels

# NOTE: this rebinds the notebook-level image_paths / labels (set from the CSV in an
# earlier cell) to the unseen test split; re-run the CSV cell before training again.
image_paths, labels = load_unseen_test_data()
test_model(image_paths, labels)

