In [1]:
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from dataset import FlickrDataset
from model import *
import tqdm
img_root = r'D:\git\Image_Captioning\dataset\Images'
caption_root = r'D:\git\Image_Captioning\dataset\captions.txt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = 25
hidden_size = 512
embedding_dim = 256
transform = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]),
])
def collate_fn(batch):
    images = []
    captions = []
    max_len = max(len(caption) for _, caption in batch)  # Find max caption length in batch
    for img, caption in batch:
        images.append(img)
        # Pad caption to max_len
        pad_tensor = torch.zeros(max_len - len(caption)).long()  # Use <PAD> token index
        padded_caption = torch.cat((caption, pad_tensor), dim=0)
        captions.append(padded_caption)

    # Stack images and captions into tensors
    images = torch.stack(images, dim=0)  # Shape: (batch_size, C, H, W)
    captions = torch.stack(captions, dim=0)  # Shape: (batch_size, max_len)
    return images, captions
dataset = FlickrDataset(img_root, caption_root, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)




[nltk_data] Downloading package punkt to C:\Users\admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
model = CNNtoRNN(vocab_size, hidden_size, embedding_dim, num_layers=1).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi["<PAD>"])  # Rename for clarity
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for img, captions in tqdm.tqdm(dataloader):
        img = img.to(device)  # Shape: (batch_size, C, H, W)
        captions = captions.to(device)  # Shape: (batch_size, max_len)
        # Compute lengths (exclude <EOS> or <PAD>)
        # Forward pass
        optimizer.zero_grad()
        outputs = model(img, captions)  # Ensure model accepts lengths if needed

        # Prepare targets
        targets = captions[:, 1:]  # Exclude <SOS>
        packed_targets = pack_padded_sequence(targets, lengths, batch_first=True, enforce_sorted=False)[0]
        packed_outputs = pack_padded_sequence(outputs, lengths, batch_first=True, enforce_sorted=False)[0]

        # Compute loss
        loss = loss_fn(packed_outputs, packed_targets)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch: {epoch+1}, Average Loss: {avg_loss:.4f}")


  0%|          | 0/1265 [00:09<?, ?it/s]


RuntimeError: start (24) + length (1) exceeds dimension size (24).

In [2]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import tqdm
import torch.optim as optim
from dataset import FlickrDataset  # Giả sử bạn đã định nghĩa lớp này
from vocab import Vocabulary  # Giả sử bạn đã định nghĩa lớp này

# Định nghĩa các lớp mô hình
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet101(weights="IMAGENET1K_V1")
        modules = list(resnet.children())[:-1]  # Loại bỏ lớp FC
        self.resnet = nn.Sequential(*modules)
        for param in self.resnet.parameters():
            param.requires_grad = False  # Đóng băng ResNet
        self.linear = nn.Linear(2048, embed_size)  # ResNet-101 feature size
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.1)

    def forward(self, images):
        features = self.resnet(images)  # (batch_size, 2048, 1, 1)
        features = features.view(features.size(0), -1)  # (batch_size, 2048)
        features = self.bn(self.linear(features))  # (batch_size, embed_size)
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, features, captions):
        # features: (batch_size, embed_size)
        # captions: (batch_size, max_len) chứa các token từ <SOS> đến <EOS> hoặc được pad
        embeddings = self.dropout(self.embed(captions[:, :-1]))  # Loại bỏ <EOS>, (batch_size, max_len-1, embed_size)
        features = features.unsqueeze(1)  # (batch_size, 1, embed_size)
        inputs = torch.cat((features, embeddings), dim=1)  # (batch_size, max_len, embed_size)
        outputs, _ = self.lstm(inputs)  # (batch_size, max_len, hidden_size)
        outputs = self.linear(outputs)  # (batch_size, max_len, vocab_size)
        return outputs

    def sample(self, features, max_len=25):
        sample_ids = []
        inputs = features.unsqueeze(1)  # (batch_size, 1, embed_size)
        states = None
        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)  # (batch_size, 1, hidden_size)
            outputs = self.linear(hiddens.squeeze(1))  # (batch_size, vocab_size)
            _, predicted = outputs.max(1)  # (batch_size,)
            sample_ids.append(predicted)
            inputs = self.embed(predicted).unsqueeze(1)  # (batch_size, 1, embed_size)
        return torch.stack(sample_ids, dim=1)  # (batch_size, max_len)

class CNNtoRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(CNNtoRNN, self).__init__()
        self.encoderCNN = EncoderCNN(embed_size)
        self.decoderRNN = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)

    def forward(self, images, captions):
        features = self.encoderCNN(images)  # (batch_size, embed_size)
        outputs = self.decoderRNN(features, captions)  # (batch_size, max_len, vocab_size)
        return outputs

    def caption_image(self, image, vocabulary, max_len=25):
        result_caption = []
        with torch.no_grad():
            x = self.encoderCNN(image).unsqueeze(0)  # (1, embed_size)
            states = None
            for _ in range(max_len):
                hiddens, states = self.decoderRNN.lstm(x, states)  # (1, 1, hidden_size)
                output = self.decoderRNN.linear(hiddens.squeeze(1))  # (1, vocab_size)
                predicted = output.argmax(1)  # (1,)
                result_caption.append(predicted.item())
                x = self.decoderRNN.embed(predicted).unsqueeze(1)  # (1, 1, embed_size)
                if vocabulary.itos[predicted.item()] == "<END>":
                    break
        return [vocabulary.itos[idx] for idx in result_caption]

# Hàm collate_fn để xử lý batch
def collate_fn(batch, max_len=25):
    images = []
    captions = []
    for img, caption in batch:
        images.append(img)
        # Cắt hoặc pad caption để đạt độ dài max_len
        if len(caption) > max_len:
            caption = caption[:max_len]
        else:
            pad_tensor = torch.ones(max_len - len(caption)).long() * 0  # <PAD> token
            caption = torch.cat((caption, pad_tensor), dim=0)
        captions.append(caption)
    images = torch.stack(images, dim=0)  # (batch_size, C, H, W)
    captions = torch.stack(captions, dim=0)  # (batch_size, max_len)
    return images, captions

# Cấu hình
img_root = r'D:\git\Image_Captioning\dataset\Images'
caption_root = r'D:\git\Image_Captioning\dataset\captions.txt'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
hidden_size = 512
embedding_dim = 256
num_layers = 1
batch_size = 32
epochs = 10

# Transform cho dữ liệu
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Dataset và DataLoader
dataset = FlickrDataset(img_root, caption_root, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
vocab_size = len(dataset.vocab)
print('Vocabulary size:', vocab_size)

# Khởi tạo mô hình, loss và optimizer
model = CNNtoRNN(embedding_dim, hidden_size, vocab_size, num_layers).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Bỏ qua <PAD>
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Huấn luyện
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch, (images, captions) in tqdm.tqdm(enumerate(dataloader), total=len(dataloader)):
        images = images.to(device)  # (batch_size, C, H, W)
        captions = captions.to(device)  # (batch_size, max_len)
        optimizer.zero_grad()
        outputs = model(images, captions)  # (batch_size, max_len, vocab_size)
        targets = captions[:, 1:]  # Loại bỏ <SOS>, (batch_size, max_len-1)
        outputs = outputs[:, :-1, :]  # Loại bỏ bước cuối, (batch_size, max_len-1, vocab_size)

        # Reshape cho CrossEntropyLoss
        outputs = outputs.reshape(-1, vocab_size)  # (batch_size * (max_len-1), vocab_size)
        targets = targets.reshape(-1)  # (batch_size * (max_len-1))

        loss = loss_fn(outputs, targets)
        total_loss += loss.item()

        # Backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch: {epoch + 1}, Average Loss: {avg_loss:.4f}")

    # Lưu mô hình sau mỗi epoch
    torch.save(model.state_dict(), f"model_epoch_{epoch+1}.pth")

[nltk_data] Downloading package punkt to C:\Users\admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 4107


100%|██████████| 1265/1265 [11:06<00:00,  1.90it/s]


Epoch: 1, Average Loss: 4.0701


100%|██████████| 1265/1265 [10:56<00:00,  1.93it/s]


Epoch: 2, Average Loss: 3.4985


100%|██████████| 1265/1265 [11:50<00:00,  1.78it/s]


Epoch: 3, Average Loss: 3.2759


 29%|██▉       | 369/1265 [03:26<08:21,  1.79it/s]


KeyboardInterrupt: 

In [1]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU detected'}")

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version: 12.4
GPU device: NVIDIA GeForce RTX 3050 Laptop GPU
