In [1]:
%pip install sacremoses



In [20]:
%pip install torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch

# Pick the best available accelerator: Apple-silicon MPS first, then CUDA,
# and finally the CPU so the notebook still runs on machines without a GPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

mps


In [2]:
# %%time
from transformers import MarianMTModel, MarianTokenizer


# Build the translator once at module level so translate() can reuse the same
# tokenizer and model on every call instead of reloading them.
model_name = "Helsinki-NLP/opus-mt-en-id"
tokenizer = MarianTokenizer.from_pretrained(model_name)
# .to() returns the module itself, so loading and device placement can be chained.
model = MarianMTModel.from_pretrained(model_name).to(device)

def translate(text):
    """Translate ``text`` with the module-level MarianMT model.

    The text is encoded with the global ``tokenizer``, moved to ``device``,
    passed to ``model.generate`` with ``max_length=100``, and the first
    generated sequence is decoded with special tokens stripped.

    Args:
        text: The string to translate.

    Returns:
        The decoded translation as a string.
    """
    encoded = tokenizer.encode(text, return_tensors="pt")
    generated = model.generate(encoded.to(device), max_length=100)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Quick smoke test of translate() on a single sentence
english_text = "Hello, how are you?"
indonesian_translation = translate(english_text)

print(
    f"English: {english_text}",
    f"Indonesian Translation: {indonesian_translation}",
    sep="\n",
)

English: Hello, how are you?
Indonesian Translation: Halo, apa kabar?


In [3]:
import pandas

# Load the captions table (one row per image/caption pair) and display it.
captions_path = 'captions.txt'
df = pandas.read_csv(captions_path)
df

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


In [5]:
# Empty frame that will hold each image name alongside its English caption
# and the translated caption.
df_translate = pandas.DataFrame(
    columns=[
        'image',
        'caption_en',
        'caption_id',
    ]
)

In [6]:
%%time

# Translating the ~40k captions one at a time took roughly a day of wall time
# (see the timing output of this cell), so reuse the saved CSV when it already
# exists instead of recomputing it on every run. Delete the file to force a
# fresh translation.
import os

translated_csv = 'captions_id.csv'
if os.path.exists(translated_csv):
    df_translate = pandas.read_csv(translated_csv)
else:
    # Build the frame in one step rather than tuple-assigning into an empty one.
    df_translate = pandas.DataFrame({
        'image': df['image'],
        'caption_en': df['caption'],
        'caption_id': df['caption'].apply(translate),
    })
    df_translate.to_csv(translated_csv, index=False)

CPU times: user 4h 29min 46s, sys: 1h 45min 6s, total: 6h 14min 52s
Wall time: 1d 1h 52min 2s


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pack_padded_sequence

# Assuming you have downloaded the Flickr8k dataset and have the images and captions

# Sample data loading and preprocessing
class Flickr8kDataset(Dataset):
    """Image/caption dataset backed by an image folder and a captions file.

    Each item is ``(image, captions)`` where ``captions`` is the list of every
    caption string recorded for that image.

    Args:
        image_folder: Directory containing the image files named in the
            captions file.
        captions_file: Text file with one ``<image_name>#<n><TAB><caption>``
            record per line.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, image_folder, captions_file, transform=None):
        self.image_folder = image_folder
        self.captions = self.load_captions(captions_file)
        # Dicts preserve insertion order, so images are indexed in the order
        # they first appear in the captions file.
        self.image_paths = list(self.captions.keys())
        self.transform = transform

    def load_captions(self, captions_file):
        """Parse the captions file into ``{image_name: [caption, ...]}``.

        Blank lines are skipped. Everything from the first ``#`` onwards in
        the image field is dropped so all captions of one image share a key.

        Args:
            captions_file: Path to the tab-separated captions file.

        Returns:
            Dict mapping each image name to its list of captions.

        Raises:
            ValueError: If a non-blank line has no tab-separated caption field.
        """
        captions = {}
        # Iterate the file lazily and decode as UTF-8 regardless of platform default.
        with open(captions_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines, e.g. at the end of the file
                parts = line.split('\t')
                if len(parts) < 2:
                    raise ValueError(
                        f"{captions_file}:{line_no}: expected "
                        f"'<image>#<n><TAB><caption>', got {line!r}"
                    )
                img_name = parts[0].split("#")[0]  # Remove the #n from image names
                captions.setdefault(img_name, []).append(parts[1])
        return captions

    def __len__(self):
        """Return the number of distinct images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, captions)`` for the image at position ``idx``.

        The image is loaded as RGB and passed through ``self.transform`` when
        one was supplied; ``captions`` is the list of caption strings for it.
        """
        img_name = self.image_paths[idx]
        img_path = f'{self.image_folder}/{img_name}'
        # convert() returns a fully loaded copy, so the file handle can be
        # closed right away instead of leaking until garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image, self.captions[img_name]

# Define the image captioning model
class ImageCaptioningModel(nn.Module):
    """CNN encoder + LSTM decoder for image captioning.

    A ResNet-50 whose final layer is replaced by a linear projection maps each
    image to an ``embed_size`` vector. That vector is fed to the LSTM as the
    first time step, followed by the embedded caption tokens.

    Args:
        embed_size: Size of the image feature and word embedding vectors.
        hidden_size: Hidden state size of the LSTM.
        vocab_size: Number of tokens in the vocabulary (embedding rows and
            output logits).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        # CNN for image feature extraction.
        # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
        # weight set that flag used to load, so behaviour is unchanged.
        self.cnn = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        # Swap the ImageNet classifier head for a projection to embed_size.
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, embed_size)

        # RNN for caption generation
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, images, captions, lengths):
        """Compute vocabulary logits for every packed time step.

        Args:
            images: Batch of image tensors accepted by the ResNet encoder.
            captions: Integer token indices, one row per image
                (assumed shape ``(batch, seq_len)`` -- TODO confirm with caller).
            lengths: Per-sequence lengths handed to ``pack_padded_sequence``.
                With the default ``enforce_sorted=True`` they must be in
                descending order.

        Returns:
            Tensor of logits with one row per packed time step and
            ``vocab_size`` columns.
        """
        # Image feature extraction -> (batch, embed_size)
        image_features = self.cnn(images)

        # Embedding captions -> (batch, seq_len, embed_size)
        captions = self.embedding(captions)

        # Prepend the image feature as the first step of each sequence
        inputs = torch.cat((image_features.unsqueeze(1), captions), 1)

        # Pack sequences so the LSTM only processes the first `lengths[i]` steps
        packed_inputs = pack_padded_sequence(inputs, lengths, batch_first=True)

        # LSTM forward pass
        packed_outputs, _ = self.rnn(packed_inputs)

        # Project the flat packed hidden states to vocabulary logits
        outputs = self.fc(packed_outputs.data)

        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
vocab_size = 10000  # Adjust based on your dataset vocabulary size
num_layers = 1
learning_rate = 0.001
batch_size = 64
num_epochs = 10

# Data preprocessing
# The ImageNet-pretrained ResNet encoder expects inputs normalised with the
# ImageNet channel statistics, so apply them after converting to a tensor.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Initialize dataset and dataloader
dataset = Flickr8kDataset(image_folder='path/to/your/images', captions_file='path/to/your/captions.txt', transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, loss, and optimizer
# NOTE: this rebinds the global `model` that translate() relies on, so
# translate() no longer works after this cell has run.
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    for images, captions_list in dataloader:
        # Concatenate all captions into one list
        captions = [caption for captions_per_image in captions_list for caption in captions_per_image]

        # Tokenize captions
        captions = [word_tokenize(caption.lower()) for caption in captions]

        # Convert captions to indices using a vocabulary
        # (Assuming you have a vocabulary with word-to-index mapping)
        captions = [[vocab[word] for word in caption] for caption in captions]

        # Find the maximum length of captions
        max_len = max(len(caption) for caption in captions)

        # Pad captions to the maximum length
        captions_padded = [caption + [0] * (max_len - len(caption)) for caption in captions]

        # Convert to PyTorch tensor
        captions_tensor = torch.tensor(captions_padded)

        # Forward pass
        outputs = model(images, captions_tensor, [max_len] * batch_size)

        # Calculate the loss
        targets = captions_tensor[:, 1:].contiguous().view(-1)  # Exclude the <start> token
       
