<a href="https://colab.research.google.com/github/indrad123/imagecaptioning/blob/main/fin_translation_marianmt_flickr30k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Flickr30k Dataset: Translating Captions to Indonesian with MarianMT

In [1]:
import multiprocessing

# Ask the OS how many CPU cores are available and display the count.
num_cores = multiprocessing.cpu_count()
print("Number of CPU cores:", num_cores)

Number of CPU cores: 8


In [2]:
pip install datasets transformers torch


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/542.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [None]:
# Step 1: Install required libraries
!pip install datasets transformers torch

import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count

# Step 2: Fetch the source captions dataset from the Hugging Face Hub
dataset = load_dataset("Mozilla/flickr30k-transformed-captions")

# Step 3: Load the tokenizer that matches the pre-trained translation checkpoint
model_name = 'Helsinki-NLP/opus-mt-en-id'
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Step 4: Choose the GPU when one is available, then load the model onto that device
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = MarianMTModel.from_pretrained(model_name).to(device)

# Function to translate text using the pre-trained model
def translate_text(texts, tokenizer, model, device, batch_size=32):
    """Translate strings with a pre-trained seq2seq model, a mini-batch at a time.

    Args:
        texts: A sequence of source strings, or a single string (treated as
            a one-element list).
        tokenizer: Callable that encodes a list of strings into model inputs
            (the result must support ``.to(device)`` and ``**`` unpacking) and
            that provides ``decode(ids, skip_special_tokens=True)``.
        model: Object exposing ``generate(**inputs)``.
        device: Device the encoded inputs are moved to before generation.
        batch_size: Maximum number of strings encoded and generated per pass.
            Bounds peak memory so arbitrarily long inputs can be translated.

    Returns:
        A list with one translated string per input, in input order. An empty
        input yields an empty list without calling the tokenizer or the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string would otherwise be split into characters by list().
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    translated_texts = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        # Padding is applied per mini-batch, so short batches are not padded
        # up to the longest string in the whole input.
        batch = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        translated = model.generate(**batch)
        translated_texts.extend(
            tokenizer.decode(t, skip_special_tokens=True) for t in translated
        )
    return translated_texts

# Helper function to process a chunk of the dataset
def process_chunk(chunk):
    """Attach translated captions to every record of ``chunk``.

    Each record's ``'alt_text'`` and ``'original_text'`` values are translated
    with the module-level ``tokenizer``, ``model`` and ``device``, and the
    results are stored on the same record under ``'alt_text_id'`` and
    ``'original_text_id'``. Records are modified in place.

    Args:
        chunk: Re-iterable collection of mapping-like records.

    Returns:
        The same ``chunk`` object that was passed in.
    """
    # Gather both source fields before any translation runs.
    alt_sources = []
    original_sources = []
    for entry in chunk:
        alt_sources.append(entry['alt_text'])
        original_sources.append(entry['original_text'])

    alt_translations = translate_text(alt_sources, tokenizer, model, device)
    original_translations = translate_text(original_sources, tokenizer, model, device)

    # Write each translation back onto the record at the same position.
    position = 0
    for entry in chunk:
        entry['alt_text_id'] = alt_translations[position]
        entry['original_text_id'] = original_translations[position]
        position += 1

    return chunk

# Function to split the dataset into chunks for parallel processing
def split_dataset(dataset, num_chunks):
    """Divide ``dataset`` into ``num_chunks`` consecutive pieces.

    Delegates to ``numpy.array_split``, which accepts a length that is not an
    exact multiple of ``num_chunks``.

    Args:
        dataset: Array-like collection to split.
        num_chunks: Number of pieces to produce.

    Returns:
        A list of ``num_chunks`` sub-arrays, in original order.
    """
    pieces = np.array_split(dataset, indices_or_sections=num_chunks)
    return pieces

# Main processing loop
def main(batch_size=32):
    """Translate the train split's captions and push the result to the Hub.

    Adds two columns to ``dataset['train']``: ``'alt_text_id'`` (translation of
    ``'alt_text'``) and ``'original_text_id'`` (translation of
    ``'original_text'``), then uploads the result as the ``'train'`` split of
    ``indrad123/flickr30k-transformed-captions-indonesia``.

    The work runs in this process through a batched ``Dataset.map`` rather than
    a ``multiprocessing.Pool``:
      * the model has already been moved to ``device`` at module level, and a
        CUDA-initialized process cannot hand that model to forked workers;
      * ``np.array_split`` on a ``Dataset`` has to materialize every row as a
        Python object before any translation starts;
      * mapping keeps the result as a ``Dataset`` instead of rebuilding it from
        a pandas DataFrame of record dicts.

    Args:
        batch_size: Number of rows handed to the translation function per
            ``map`` batch.
    """
    def _translate_batch(batch):
        # With batched=True, `batch` maps each column name to a list of values;
        # the returned dict of equal-length lists becomes the new columns.
        return {
            'alt_text_id': translate_text(batch['alt_text'], tokenizer, model, device),
            'original_text_id': translate_text(batch['original_text'], tokenizer, model, device),
        }

    translated_dataset = dataset['train'].map(
        _translate_batch,
        batched=True,
        batch_size=batch_size,
    )

    # Create a DatasetDict holding the translated split
    translated_dataset_dict = DatasetDict({
        'train': translated_dataset
    })

    # Push the dataset to the Hugging Face Hub
    translated_dataset_dict.push_to_hub("indrad123/flickr30k-transformed-captions-indonesia")

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
