In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder for this assignment. Later cells append 'best_model/' to it to find
# the tokenizer files, the text-model config and the trained checkpoint.
drive_dir = '/content/drive/MyDrive/deeplearning asm2/'

In [None]:
import os
import json
import subprocess
import zipfile

# Kaggle API credentials as a JSON object, e.g. {"username": "...", "key": "..."}.
# Leave the object empty to rely on credentials that are already configured for
# the Kaggle CLI (for instance an existing ~/.kaggle/kaggle.json) instead of
# storing a secret in the notebook.
kaggle_json_str = """
{

}
"""

# Parse first: this rejects a malformed paste immediately, and lets us skip the
# write when the placeholder is empty so it never clobbers a working kaggle.json.
kaggle_credentials = json.loads(kaggle_json_str)
if kaggle_credentials:
    kaggle_dir = os.path.expanduser('~/.kaggle')
    kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
    os.makedirs(kaggle_dir, exist_ok=True)
    with open(kaggle_json_path, 'w') as f:
        f.write(kaggle_json_str.strip())
    os.chmod(kaggle_json_path, 0o600)  # owner read/write only

# download
os.makedirs('./data', exist_ok=True)
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# download stops the cell instead of silently continuing to the unzip step.
subprocess.run(
    ['kaggle', 'competitions', 'download',
     '-c', 'multi-label-classification-competition-2025',
     '-p', './data'],
    check=True,
)

# unzip every archive that landed in ./data, extracting next to it
for file in os.listdir('./data'):
    if file.endswith('.zip'):
        with zipfile.ZipFile(os.path.join('./data', file), 'r') as zip_ref:
            zip_ref.extractall('./data')


In [None]:
import re
import pandas as pd
import numpy as np
from io import StringIO
import os

import torch
import torchvision.transforms as T
import torch.nn.functional as F
from PIL import Image
from transformers import BertTokenizer, AutoTokenizer
import os
import seaborn as sns
import matplotlib.pyplot as plt
import random
from tqdm.notebook import tqdm
# Global seaborn theme for any plots drawn in this notebook.
sns.set(style="whitegrid")
import os
# Set before any tokenizer is used; presumably meant to silence the Hugging Face
# tokenizers parallelism warning — TODO confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
# Hide FutureWarning messages only; other warning categories are still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

# Root of the extracted competition dataset; later cells read DIR + 'test.csv'
# and the images under DIR + 'data'.
# NOTE(review): assumes the download cell ran with /content as the working
# directory, so that './data' resolves to '/content/data' — confirm.
DIR = '/content/data/COMP5329S1A2Dataset/'


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the files saved next to the trained model on Drive;
# local_files_only=True keeps this offline (no Hugging Face Hub lookup).
tokenizer = BertTokenizer.from_pretrained(drive_dir + 'best_model/', local_files_only=True)

# Caption length in tokens. It equals the default `max_len` of
# encode_captions_to_df below, which is what the inference cell relies on.
MAX_LEN = 32

# tokenizer for caption input
def encode_captions_to_df(df, tokenizer, max_len=32):
    """Tokenize the "Caption" column and store the encodings on the frame.

    Args:
        df: DataFrame with a "Caption" column of strings. It is modified in
            place: "input_ids" and "attention_mask" columns are added or
            overwritten, one array per row.
        tokenizer: Callable tokenizer, invoked once on the full caption list.
        max_len: Length every caption is padded or truncated to.

    Returns:
        The same ``df`` object, for call chaining.
    """
    captions = df["Caption"].tolist()
    batch = tokenizer(
        captions,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    # One array per row, so each cell holds that caption's full sequence.
    for column in ("input_ids", "attention_mask"):
        df[column] = list(batch[column].numpy())
    return df


In [None]:
from torch.utils.data import Dataset, random_split, DataLoader
from torch.utils.data import Dataset
from PIL import Image
import torch
import os
import numpy as np
class ImgCapData(Dataset):
    """Dataset yielding an image, its tokenized caption and its label vector.

    Args:
        df: DataFrame with "ImageID", "Caption", "input_ids" and
            "attention_mask" columns. Its index is reset so that positional
            indices 0..len-1 address the rows.
        label_matrix: Array-like of per-sample label vectors, converted to a
            float tensor; row ``i`` belongs to row ``i`` of ``df``.
        image_dir: Directory that the "ImageID" file names are relative to.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, df, label_matrix, image_dir='data', transform=None):
        self.df = df.reset_index(drop=True)
        self.labels = torch.FloatTensor(label_matrix)
        self.image_dir = image_dir
        self.transform = transform

    def __getitem__(self, idx):
        """Return a dict with keys image, input_ids, attention_mask, label, caption."""
        row = self.df.loc[idx]
        image_path = os.path.join(self.image_dir, row["ImageID"])
        # Image.open is lazy and keeps the file open; convert() produces an
        # independent in-memory copy, so the handle can be closed right away
        # rather than being left for the garbage collector.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)

        input_ids = torch.tensor(row["input_ids"])
        attention_mask = torch.tensor(row["attention_mask"])
        label = self.labels[idx]

        return {
            "image": image,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": label,
            "caption": row["Caption"]
        }

    def __len__(self):
        """Number of samples (rows of the frame)."""
        return len(self.df)

In [None]:

# Deterministic evaluation-time preprocessing (no random augmentation):
# resize, center-crop to a 384x384 square, convert to a tensor, then normalize
# each channel with the mean/std constants below.
# NOTE(review): 384 looks matched to feature_map_size=12 in EfficientNetEncoder
# (assuming a 32x backbone downsampling) — confirm before changing either value.
val_transform = T.Compose([
    T.Resize(400),                 # Resize shorter side to 400
    T.CenterCrop(384),          # Center crop to 384
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


In [None]:
import torch
import torch.nn as nn
from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights
from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights
from torchvision import models

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class AttentionPool2d(nn.Module):
    """
    Multi-head self-attention over the flattened cells of a 2D feature map.

    The mean over all spatial positions is prepended as an extra token, a learned
    positional embedding is added, and every token (the mean token plus each
    spatial position) is returned after attention.

    ref: https://github.com/mlfoundations/open_clip/blob/main/src/open_clip/modified_resnet.py

    Args:
        spacial_dim: Side length of the (square) feature map; the positional
            embedding has ``spacial_dim**2 + 1`` rows.
        embed_dim: Channel count of the feature map, also the attention width.
        num_heads: Number of attention heads.
    """
    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int):
        super().__init__()
        num_positions = spacial_dim ** 2 + 1  # one row per cell plus the mean token
        self.positional_embedding = nn.Parameter(
            torch.randn(num_positions, embed_dim) / embed_dim ** 0.5
        )
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Map a [B, C, H, W] feature map to attended tokens of shape [B, HW+1, C]."""
        batch, channels = x.shape[0], x.shape[1]
        # Flatten the spatial grid and go sequence-first: [B, C, H, W] -> [HW, B, C]
        grid_tokens = x.reshape(batch, channels, -1).permute(2, 0, 1)
        mean_token = grid_tokens.mean(dim=0, keepdim=True)  # [1, B, C]
        tokens = torch.cat([mean_token, grid_tokens], dim=0)  # [HW+1, B, C]
        # Broadcast the positional embedding across the batch dimension.
        tokens = tokens + self.positional_embedding[:, None, :].to(tokens.dtype)

        # The functional API takes q/k/v biases as one concatenated vector.
        packed_bias = torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        attended, _ = F.multi_head_attention_forward(
            query=tokens,
            key=tokens,
            value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=packed_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0.0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False,
        )
        return attended.permute(1, 0, 2)  # back to batch-first: [B, HW+1, D]



from transformers import AutoImageProcessor, AutoModel
import torch.nn.functional as F


class EfficientNetEncoder(nn.Module):
    """Image encoder: EfficientNetV2-S features, attention pooling, projection.

    Args:
        out_dim: Width of the returned image tokens.
        num_heads: Number of heads in the attention pooling layer.
        feature_map_size: Side length of the backbone feature map that the
            pooling layer's positional embedding is sized for.
    """
    def __init__(self, out_dim=384, num_heads=4, feature_map_size=12):
        super().__init__()
        backbone_channels = 1280  # channel count produced by the feature extractor
        # Keep only the feature extractor; weights=None builds the architecture
        # without loading pretrained weights.
        self.features = models.efficientnet_v2_s(weights=None).features
        self.attnpool = AttentionPool2d(
            feature_map_size,
            embed_dim=backbone_channels,
            num_heads=num_heads,
        )
        self.linear_proj = nn.Sequential(
            nn.LayerNorm(backbone_channels),
            nn.Linear(backbone_channels, out_dim),
        )

    def forward(self, x):
        """Encode an image batch into tokens of shape [B, N_img+1, out_dim]."""
        feature_map = self.features(x)             # [B, 1280, H', W']
        pooled_tokens = self.attnpool(feature_map)  # [B, N_img+1, 1280]
        return self.linear_proj(pooled_tokens)      # [B, N_img+1, D]

from transformers import AutoConfig, AutoModel


class TextEmbeddingEncoder(nn.Module):
    """Caption encoder: a transformer built from a local config, then a projection.

    Args:
        model_path: Directory holding the model config; read with
            ``local_files_only=True``, so nothing is downloaded.
        out_dim: Width of the returned text tokens.
    """
    def __init__(self, model_path=drive_dir + 'best_model/', out_dim=384):
        super().__init__()

        # from_config builds the architecture only; no pretrained weights are
        # loaded here.
        config = AutoConfig.from_pretrained(model_path, local_files_only=True)
        self.bert = AutoModel.from_config(config)

        self.out_dim = out_dim
        hidden_size = config.hidden_size
        self.project = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, out_dim),
        )

    def forward(self, input_ids, attention_mask=None):
        """Return the projected ``last_hidden_state`` of the text model."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.project(encoded.last_hidden_state)


class MultiModalTransformer(nn.Module):
    """Fuses image and text tokens with a transformer encoder.

    A learned classification token is prepended to the concatenated image and
    text tokens, a learned positional embedding is added, and the encoder output
    at the classification position is returned.

    Args:
        embed_dim: Token width shared by both modalities.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        dim_ff: Hidden width of each layer's feed-forward network.
        max_len: Number of rows in the positional embedding.
    """
    def __init__(self, embed_dim, num_heads=4, num_layers=2, dim_ff=666, max_len=666):
        super().__init__()
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_dim))

        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=dim_ff,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, img_tokens, text_tokens):
        """Return the fused [B, D] representation taken at the class-token slot."""
        batch_size = img_tokens.size(0)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        sequence = torch.cat([cls_tokens, img_tokens, text_tokens], dim=1)  # [B, N+T+1, D]
        seq_len = sequence.size(1)
        # Use only as many positional rows as there are tokens in the sequence.
        sequence = sequence + self.pos_embedding[:, :seq_len, :]
        encoded = self.transformer(sequence)
        return encoded[:, 0]  # class-token output, used for classification

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

class MiniViTex_EfficientNetv2s(nn.Module):
    """Image + caption multi-label classifier.

    Image tokens (EfficientNetEncoder) and text tokens (TextEmbeddingEncoder)
    are projected to the same width, fused by a MultiModalTransformer, and the
    fused vector is mapped to one logit per class.

    Args:
        model_config_path: Directory with the text-model config.
        image_out_dim: Token width shared by both encoders and the fusion
            transformer.
        num_heads: Attention heads for the image pooling and the fusion layers.
        num_classes: Number of output logits.
    """
    def __init__(self,
                 model_config_path=drive_dir + 'best_model/',
                 image_out_dim=384,
                 num_heads=4,
                 num_classes=18):
        super().__init__()
        shared_dim = image_out_dim  # both modalities must share one token width

        self.image_encoder = EfficientNetEncoder(out_dim=shared_dim, num_heads=num_heads)
        self.text_encoder = TextEmbeddingEncoder(model_config_path, out_dim=shared_dim)
        self.fusion_transformer = MultiModalTransformer(embed_dim=shared_dim,
                                                        num_heads=num_heads)
        self.classifier = nn.Sequential(
            nn.LayerNorm(shared_dim),
            nn.Linear(shared_dim, num_classes),
        )

    def forward(self, image, input_ids, attention_mask=None):
        """Return raw logits of shape [B, num_classes] (no sigmoid applied).

        ``attention_mask`` is passed to the text encoder only; the fusion
        transformer receives all text tokens unmasked.
        """
        image_tokens = self.image_encoder(image)                     # [B, N_img+1, D]
        caption_tokens = self.text_encoder(input_ids, attention_mask)  # [B, T, D]
        fused = self.fusion_transformer(image_tokens, caption_tokens)  # [B, D]
        return self.classifier(fused)                                # [B, num_classes]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch import amp


In [None]:
# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [None]:
import torch

# Step 1: Instantiate model (architecture only; trained weights are loaded below)
model = MiniViTex_EfficientNetv2s()

# Step 2: Load the state dict onto the CPU first, so the checkpoint loads
# regardless of the device it was saved from.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading.
checkpoint_path = drive_dir + 'best_model/MiniViTex_EfficientNetV2s_93_4.pth'
state_dict = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
# Strict loading (the default) raises if any key is missing or unexpected, so
# reaching the print below means every parameter was restored.
model.load_state_dict(state_dict)

# Step 3: Move to device
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
print('load success')


load success


In [None]:

# Read the CSV as text and patch it first. The regex targets a double quote that
# follows a non-comma character and is followed by more text on the same line,
# and inserts '/' before it; read_csv then treats '/' as the escape character.
with open(DIR + 'test.csv') as file:
    lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
testdf = pd.read_csv(StringIO(''.join(lines)), escapechar="/")

# 1. Encode test captions (lower-cased, padded/truncated to MAX_LEN tokens)
testdf['Caption'] = testdf['Caption'].str.lower()
testdf = encode_captions_to_df(testdf, tokenizer, max_len=MAX_LEN)

# 2. create test dataset
test_dataset = ImgCapData(
    df=testdf,
    label_matrix=np.zeros((len(testdf), 18)),  # Dummy labels for compatibility
    image_dir=DIR + 'data',  # the path to test data
    transform=val_transform
)

# 3. create dataloader (shuffle=False keeps predictions aligned with testdf rows)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Decision rule: a label is predicted when its probability exceeds BASE_THRESHOLD.
# If no label qualifies, the threshold is lowered in THRESHOLD_STEP increments
# until at least one does, so no row is left without a prediction.
BASE_THRESHOLD = 0.5
THRESHOLD_STEP = 0.01

model.eval()
all_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(images, input_ids, attention_mask)
        # Convert logits to probabilities and move the whole batch to the CPU
        # once; the per-row checks below would otherwise force a device-to-host
        # sync for every sample.
        probs = torch.sigmoid(logits).cpu()
        for row in probs:
            threshold = BASE_THRESHOLD
            pred = (row > threshold)
            while not pred.any() and threshold > 0:  # at least one prediction
                threshold -= THRESHOLD_STEP
                pred = (row > threshold)
            all_preds.append(pred.numpy())


  0%|          | 0/313 [00:00<?, ?it/s]

In [None]:
# Label index mapping: label 12 does not exist in the label set, so model
# output indices from 11 upward map to labels shifted by one extra position.
def map_label_indices(pred_row):
    """Turn a 0/1 (or boolean) prediction row into a space-separated label string.

    Output index ``i`` maps to label ``i + 1`` for ``i < 11`` and to label
    ``i + 2`` otherwise (labels are 1-based and label 12 is skipped). Returns
    an empty string when nothing is selected.
    """
    selected = np.nonzero(pred_row == 1)[0]
    labels = []
    for idx in selected:
        # +1 for the 1-based labels, and one more past the gap left by label 12
        offset = 2 if idx >= 11 else 1
        labels.append(str(idx + offset))
    return ' '.join(labels)

# Add predictions to testdf: all_preds is in loader order (shuffle=False), which
# matches the row order of testdf, so the values can be assigned positionally.
testdf["Labels"] = [map_label_indices(pred) for pred in all_preds]
# Write the submission as CSV with a header row and only ImageID and Labels.
testdf[["ImageID", "Labels"]].to_csv("Predicted_labels.txt", index=False)


In [None]:
# Display the test frame to eyeball the newly added "Labels" column.
testdf

Unnamed: 0,ImageID,Caption,input_ids,attention_mask,Labels
0,30000.jpg,a little girl waring a krispy kreme hat holdin...,"[101, 1037, 2210, 2611, 2162, 2075, 1037, 1903...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,30001.jpg,a beautiful young woman holding an orange fris...,"[101, 1037, 3376, 2402, 2450, 3173, 2019, 4589...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",1
2,30002.jpg,a group of people sitting on couch next to a c...,"[101, 1037, 2177, 1997, 2111, 3564, 2006, 6411...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,30003.jpg,a person on a snowboard rides on the hill.,"[101, 1037, 2711, 2006, 1037, 4586, 6277, 1227...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",1
4,30004.jpg,a man riding a skateboard with a helmet on in ...,"[101, 1037, 2158, 5559, 1037, 17260, 6277, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
...,...,...,...,...,...
9995,39995.jpg,a group of men riding surfboards riding a mass...,"[101, 1037, 2177, 1997, 2273, 5559, 14175, 152...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",1
9996,39996.jpg,a motorcycle parked next to a car in a parking...,"[101, 1037, 9055, 9083, 2279, 2000, 1037, 2482...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",3 4
9997,39997.jpg,a little boy that is playing with a wii,"[101, 1037, 2210, 2879, 2008, 2003, 2652, 2007...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",1
9998,39998.jpg,group of kids play frisbee golf in the middle ...,"[101, 2177, 1997, 4268, 2377, 10424, 2483, 113...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
