<a href="https://colab.research.google.com/github/indrad123/imagecaptioning/blob/main/fin_translation_marianmt_flickr30k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Flickr30k Datasets

In [1]:
# calculate number of cores
import multiprocessing

num_cores = multiprocessing.cpu_count()
print("Number of CPU cores:", num_cores)

Number of CPU cores: 8


In [2]:
# Step 1: Install required libraries
!pip install datasets transformers torch sacremoses


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m286.7/542.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━

In [3]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
import os
from PIL import Image
import io

# Directory to save intermediate results
checkpoint_dir = "/content/drive/MyDrive/Datasets/flickr30k/checkpoints/marianmt"
os.makedirs(checkpoint_dir, exist_ok=True)

# Step 2: Load the original dataset
dataset = load_dataset("Mozilla/flickr30k-transformed-captions")

# Step 3: Check dataset splits and load the 'test' split if it exists
if 'test' in dataset:
    dataset_split = dataset['test']
else:
    raise KeyError("The dataset does not contain a 'test' split.")

# Convert dataset_split to a list of dictionaries
dataset_list = dataset_split.to_pandas().to_dict(orient='records')

# Step 4: Load the pre-trained translation model and tokenizer
model_name = 'Helsinki-NLP/opus-mt-en-id'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Step 5: Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Function to translate text using the pre-trained model
def translate_text(texts, tokenizer, model, device):
    if not texts:
        return []
    # Ensure all texts are strings
    texts = [str(text) for text in texts]
    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    translated = model.generate(**batch)
    translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    return translated_texts

# Helper function to process a chunk of the dataset
def process_chunk(chunk):
    alt_texts = []
    original_texts = []
    images = []

    for record in chunk:
        if 'alt_text' in record and 'original_alt_text' in record and 'image' in record:  # Ensure correct key usage
            alt_texts.append(record['alt_text'])
            original_texts.append(record['original_alt_text'])
            images.append(record['image'])

    if not alt_texts or not original_texts:
        return chunk  # Return the chunk as-is if there are no texts to process

    alt_texts_translated = translate_text(alt_texts, tokenizer, model, device)
    original_texts_translated = translate_text(original_texts, tokenizer, model, device)

    for i, record in enumerate(chunk):
        if 'alt_text' in record and 'original_alt_text' in record and 'image' in record:  # Ensure correct key usage
            record['alt_text_id'] = alt_texts_translated[i]
            record['original_alt_text_id'] = original_texts_translated[i]
            record['image'] = images[i]  # Ensure the image is preserved

    return chunk

# Function to split the dataset into smaller chunks for sequential processing
def split_dataset(dataset, chunk_size):
    return [dataset[i:i + chunk_size] for i in range(0, len(dataset), chunk_size)]

# Determine the chunk size
chunk_size = 50  # Adjust this size based on available memory and disk space

# Split the dataset into smaller chunks
dataset_chunks = split_dataset(dataset_list, chunk_size)

# Process each chunk sequentially and save intermediate results
for i, chunk in enumerate(dataset_chunks):
    checkpoint_path = os.path.join(checkpoint_dir, f"chunk_{i}.h5")
    if os.path.exists(checkpoint_path):
        print(f"Skipping chunk {i} as it already exists.")
        continue  # Skip processing if the chunk already exists

    translated_chunk = process_chunk(chunk)
    df_chunk = pd.DataFrame(translated_chunk)
    df_chunk.to_hdf(checkpoint_path, key='df', mode='w')
    print(f"Processed and saved chunk {i}")

# Combine all intermediate results
translated_data = []
for i in range(len(dataset_chunks)):
    checkpoint_path = os.path.join(checkpoint_dir, f"chunk_{i}.h5")
    df_chunk = pd.read_hdf(checkpoint_path, key='df')
    translated_data.extend(df_chunk.to_dict(orient='records'))

# Convert the translated data to a Dataset
translated_dataset = Dataset.from_pandas(pd.DataFrame(translated_data))

# Create a DatasetDict
translated_dataset_dict = DatasetDict({
    'test': translated_dataset
})

# Push the dataset to the Hugging Face Hub -->>> Used to Push to Hugging Face
# translated_dataset_dict.push_to_hub("indrad123/flickr30k-transformed-captions-indonesia")


Downloading readme:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/459M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/463M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/461M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/479M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/489M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/518M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/497M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/466M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/31014 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['image', 'alt_text', 'sentids', 'split', 'img_id', 'filename',
       'original_alt_text', 'alt_text_id', 'original_alt_text_id'],
      dtype='object')]

  df_chunk.to_hdf(checkpoint_path, key='df', mode='w')


Processed and saved chunk 0
Processed and saved chunk 1
Processed and saved chunk 2
Processed and saved chunk 3
Processed and saved chunk 4
Processed and saved chunk 5
Processed and saved chunk 6
Processed and saved chunk 7
Processed and saved chunk 8
Processed and saved chunk 9
Processed and saved chunk 10
Processed and saved chunk 11
Processed and saved chunk 12
Processed and saved chunk 13
Processed and saved chunk 14
Processed and saved chunk 15
Processed and saved chunk 16
Processed and saved chunk 17
Processed and saved chunk 18
Processed and saved chunk 19
Processed and saved chunk 20
Processed and saved chunk 21
Processed and saved chunk 22
Processed and saved chunk 23
Processed and saved chunk 24
Processed and saved chunk 25
Processed and saved chunk 26
Processed and saved chunk 27
Processed and saved chunk 28
Processed and saved chunk 29
Processed and saved chunk 30
Processed and saved chunk 31
Processed and saved chunk 32
Processed and saved chunk 33
Processed and saved chun

# Save to Google Drive

In [4]:
# Save the translated dataset to a local directory
output_dir = "/content/drive/MyDrive/Datasets/flickr30k_marianmt_translation"
translated_dataset_dict.save_to_disk(output_dir)

print(f"Translated dataset saved to {output_dir}")

Saving the dataset (0/9 shards):   0%|          | 0/31014 [00:00<?, ? examples/s]

Translated dataset saved to /content/drive/MyDrive/Datasets/flickr30k_marianmt_translation


# Test to Reload from Google Drive

In [5]:
from datasets import load_from_disk

disk_translated_dataset_dict = load_from_disk(output_dir)


In [7]:
disk_translated_dataset_dict['test'][30000]

{'image': {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00\xf0\x00\xf0\x00\x00\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x02\x03\x02\x02\x02\x02\x02\x04\x03\x03\x02\x03\x05\x04\x05\x05\x05\x04\x04\x04\x05\x06\x07\x06\x05\x05\x07\x06\x04\x04\x06\t\x06\x07\x08\x08\x08\x08\x08\x05\x06\t\n\t\x08\n\x07\x08\x08\x08\xff\xdb\x00C\x01\x01\x01\x01\x02\x02\x02\x04\x02\x02\x04\x08\x05\x04\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\xff\xc0\x00\x11\x08\x01M\x01\xf4\x03\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x03\x04\x03\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x01\x02\x04\x00\x05\x08\t\x06\x07\n\x0b\xff\xc4\x00F\x10\x00\x02\x01\x02\x04\x04\x03\x06\x04\x05\x03\x03\x04\x01\x02\x07\x01\x02\x03\x04\x11\x00\x05\x06!\x07\x121A\x08\x13Q\x14"aq\x81\xf0\t2\x91\xa1\x15\xb1\xc1\x

# Test to Load from Hugging Face

In [None]:
test_indrad123 = load_dataset("indrad123/flickr30k-transformed-captions-indonesia")

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/462M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/466M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/463M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/483M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/525M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/504M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/469M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
test_indrad123['test'][0]

{'image': {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x02\x03\x02\x02\x02\x02\x02\x04\x03\x03\x02\x03\x05\x04\x05\x05\x05\x04\x04\x04\x05\x06\x07\x06\x05\x05\x07\x06\x04\x04\x06\t\x06\x07\x08\x08\x08\x08\x08\x05\x06\t\n\t\x08\n\x07\x08\x08\x08\xff\xdb\x00C\x01\x01\x01\x01\x02\x02\x02\x04\x02\x02\x04\x08\x05\x04\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\xff\xc0\x00\x11\x08\x01\xf4\x01M\x03\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x02\x03\x00\x03\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x07\x08\x05\x06\t\x03\x04\n\x02\x01\x00\x0b\xff\xc4\x00D\x10\x00\x02\x02\x02\x02\x01\x03\x04\x00\x04\x04\x04\x04\x04\x02\x0b\x02\x03\x01\x04\x05\x06\x07\x11\x12\x08\x13!\x00\x14"1\t\x15#A\x162Qa$Bq\x81\x173R\x91\n%Cb\xa14\