<a href="https://colab.research.google.com/github/indrad123/imagecaptioning/blob/main/fin_translation_google_translate_flickr30k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Required Libraries

In [None]:
!pip install datasets
!pip install nusacrowd
!pip install evaluate
!pip install sentencepiece
!pip install sacremoses
!pip install transformers
!pip install rouge_score
!pip install dl-translate
!pip install googletrans==3.1.0a0
!pip install dl-translate

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)


# Initialization

In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
import os
from PIL import Image
import io
from googletrans import Translator

# Per-chunk translation results are checkpointed in this folder
checkpoint_dir = "/content/drive/MyDrive/Datasets/flickr30k/checkpoints/google_translate"
os.makedirs(checkpoint_dir, exist_ok=True)

# Load the source captions dataset
dataset = load_dataset("Mozilla/flickr30k-transformed-captions")

# Only the 'test' split is used; stop right away when it is missing
if 'test' not in dataset:
    raise KeyError("The dataset does not contain a 'test' split.")
dataset_split = dataset['test']

# Turn the split into a plain list with one dictionary per record
dataset_list = dataset_split.to_pandas().to_dict(orient='records')



Downloading readme:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/459M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/463M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/461M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/479M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/489M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/518M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/497M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/466M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/31014 [00:00<?, ? examples/s]

# Translation Process

In [None]:
# Translate a batch of English strings to Indonesian with Google Translate
def translate_google_text(texts):
    """Translate each string in ``texts`` from English to Indonesian.

    One ``Translator`` instance is created per call and reused for every
    element. If translating an element raises, the error is printed and an
    empty string is stored for that element, so the result always contains
    exactly one entry per input, in input order.

    Args:
        texts: Iterable of strings to translate (``src='en'``, ``dest='id'``).

    Returns:
        list: The translated strings, with '' wherever translation failed.
    """
    client = Translator()
    results = []
    for source_text in texts:
        try:
            outcome = client.translate(source_text, src='en', dest='id').text
        except Exception as e:
            # Best effort: report the failure and keep the positions aligned
            print(f"Error: {e}")
            outcome = ''
        results.append(outcome)
    return results

# Helper function to process a chunk of the dataset
def process_chunk(chunk):
    """Add Indonesian translations to every complete record of ``chunk``.

    A record is processed only when it contains all of the keys 'alt_text',
    'original_alt_text' and 'image'. For those records the keys 'alt_text_id'
    and 'original_alt_text_id' are added in place; every other record, and
    every other field (including 'image'), is left untouched.

    Args:
        chunk: List of record dictionaries. The dictionaries are mutated.

    Returns:
        The same ``chunk`` list that was passed in.
    """
    required_keys = ('alt_text', 'original_alt_text', 'image')
    eligible = [
        record for record in chunk
        if all(key in record for key in required_keys)
    ]

    if not eligible:
        return chunk  # Nothing to translate in this chunk

    alt_texts_translated = translate_google_text(
        [record['alt_text'] for record in eligible]
    )
    original_texts_translated = translate_google_text(
        [record['original_alt_text'] for record in eligible]
    )

    # Pair each translation with the record it was produced from. Indexing the
    # translations by position in `chunk` would misalign (or raise IndexError)
    # as soon as one record in the chunk lacks a required key.
    for record, alt_id, original_id in zip(
        eligible, alt_texts_translated, original_texts_translated
    ):
        record['alt_text_id'] = alt_id
        record['original_alt_text_id'] = original_id

    return chunk

# Function to split the dataset into smaller chunks for sequential processing
def split_dataset(dataset, chunk_size):
    """Cut ``dataset`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        dataset: A sliceable sequence that supports ``len()``.
        chunk_size: Maximum number of items per slice.

    Returns:
        list: The slices in order; the last one may be shorter.
    """
    pieces = []
    for start in range(0, len(dataset), chunk_size):
        pieces.append(dataset[start:start + chunk_size])
    return pieces

# Number of records handled (and checkpointed) per batch
chunk_size = 50  # Adjust this size based on available memory and disk space

# Partition the full record list into fixed-size batches
dataset_chunks = split_dataset(dataset_list, chunk_size)

# Translate batch by batch; a batch whose checkpoint file already exists is not redone
for i, chunk in enumerate(dataset_chunks):
    checkpoint_path = os.path.join(checkpoint_dir, f"chunk_{i}.h5")
    if not os.path.exists(checkpoint_path):
        translated_chunk = process_chunk(chunk)
        df_chunk = pd.DataFrame(translated_chunk)
        df_chunk.to_hdf(checkpoint_path, key='df', mode='w')
        print(f"Processed and saved chunk {i}")
    else:
        print(f"Skipping chunk {i} as it already exists.")

# Read every checkpoint back, in chunk order, and gather all records
translated_data = []
for i in range(len(dataset_chunks)):
    checkpoint_path = os.path.join(checkpoint_dir, f"chunk_{i}.h5")
    df_chunk = pd.read_hdf(checkpoint_path, key='df')
    translated_data += df_chunk.to_dict(orient='records')

# Build a Dataset from the gathered records and expose it as the 'test' split
translated_dataset = Dataset.from_pandas(pd.DataFrame(translated_data))
translated_dataset_dict = DatasetDict({'test': translated_dataset})

# Push the dataset to the Hugging Face Hub -->>> Used to Push to Hugging Face
# translated_dataset_dict.push_to_hub("indrad123/flickr30k-transformed-captions-indonesia")


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['image', 'alt_text', 'sentids', 'split', 'img_id', 'filename',
       'original_alt_text', 'alt_text_id', 'original_alt_text_id'],
      dtype='object')]

  df_chunk.to_hdf(checkpoint_path, key='df', mode='w')


Processed and saved chunk 0
Processed and saved chunk 1
Processed and saved chunk 2
Processed and saved chunk 3
Processed and saved chunk 4
Processed and saved chunk 5
Processed and saved chunk 6
Processed and saved chunk 7
Processed and saved chunk 8
Processed and saved chunk 9
Processed and saved chunk 10
Processed and saved chunk 11
Processed and saved chunk 12
Processed and saved chunk 13
Processed and saved chunk 14
Processed and saved chunk 15
Processed and saved chunk 16
Processed and saved chunk 17
Processed and saved chunk 18
Processed and saved chunk 19
Processed and saved chunk 20
Processed and saved chunk 21
Processed and saved chunk 22
Processed and saved chunk 23
Processed and saved chunk 24
Processed and saved chunk 25
Processed and saved chunk 26
Processed and saved chunk 27
Processed and saved chunk 28
Processed and saved chunk 29
Processed and saved chunk 30
Processed and saved chunk 31
Processed and saved chunk 32
Processed and saved chunk 33
Processed and saved chun

# Save the Dataset to Disk with Images as Bytes (for Upload to Hugging Face)

In [None]:
# Save the translated dataset to a local directory.
# `translated_dataset_dict` is the DatasetDict built in the translation cell above.
output_dir = "/content/drive/MyDrive/Datasets/flickr30k_google_translation"
translated_dataset_dict.save_to_disk(output_dir)

# Confirm where the dataset was written
print(f"Translated dataset saved to {output_dir}")

Saving the dataset (0/9 shards):   0%|          | 0/31014 [00:00<?, ? examples/s]

Translated dataset saved to /content/drive/MyDrive/Datasets/flickr30k_google_translation


In [None]:
from datasets import load_from_disk

# Reload the saved DatasetDict from `output_dir`
disk_translated_dataset_dict = load_from_disk(output_dir)

In [None]:
# Show the first record of the reloaded 'test' split.
# NOTE(review): this echoes the record's raw image bytes into the cell output.
disk_translated_dataset_dict["test"][0]

{'image': {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x02\x03\x02\x02\x02\x02\x02\x04\x03\x03\x02\x03\x05\x04\x05\x05\x05\x04\x04\x04\x05\x06\x07\x06\x05\x05\x07\x06\x04\x04\x06\t\x06\x07\x08\x08\x08\x08\x08\x05\x06\t\n\t\x08\n\x07\x08\x08\x08\xff\xdb\x00C\x01\x01\x01\x01\x02\x02\x02\x04\x02\x02\x04\x08\x05\x04\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\xff\xc0\x00\x11\x08\x01\xf4\x01M\x03\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x02\x03\x00\x03\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x07\x08\x05\x06\t\x03\x04\n\x02\x01\x00\x0b\xff\xc4\x00D\x10\x00\x02\x02\x02\x02\x01\x03\x04\x00\x04\x04\x04\x04\x04\x02\x0b\x02\x03\x01\x04\x05\x06\x07\x11\x12\x08\x13!\x00\x14"1\t\x15#A\x162Qa$Bq\x81\x173R\x91\n%Cb\xa14\

# Save the Dataset with Images as Files

In [None]:
import os
import pandas as pd
from PIL import Image
import io
from datasets import Dataset, DatasetDict, load_from_disk
import h5py

# Checkpoints are read from `checkpoint_dir`; extracted images go to `image_dir`;
# the resulting dataset is written to `output_dir`
checkpoint_dir = "/content/drive/MyDrive/Datasets/flickr30k/checkpoints/google_translate"
image_dir = os.path.join(checkpoint_dir, "images_converted")
output_dir = "/content/drive/MyDrive/Datasets/flickr30k_google_translation_img"

# Create the image folder on first use and report which case applied
if os.path.exists(image_dir):
    print(f"Directory already exists: {image_dir}")
else:
    os.makedirs(image_dir)
    print(f"Created directory: {image_dir}")

# Combine all intermediate results, in chunk order.
# os.listdir() returns names in arbitrary order, so the checkpoint files are
# sorted by the number embedded in "chunk_<n>.h5". Without this, the combined
# records (and the image_<idx>.png files numbered from their position in the
# list) would not follow the order in which the chunks were produced.
def _chunk_sort_key(file_name):
    """Sort key: names ending in '_<n>.h5' by n, then any other name alphabetically."""
    index_part = file_name[:-len('.h5')].rpartition('_')[2]
    if index_part.isdecimal():
        return (0, int(index_part), file_name)
    return (1, 0, file_name)

translated_data = []
chunk_files = sorted(
    (f for f in os.listdir(checkpoint_dir) if f.endswith('.h5')),
    key=_chunk_sort_key,
)
for chunk_file in chunk_files:
    checkpoint_path = os.path.join(checkpoint_dir, chunk_file)
    df_chunk = pd.read_hdf(checkpoint_path, key='df')
    translated_data.extend(df_chunk.to_dict(orient='records'))

# Convert bytes to image files and update records
def convert_bytes_to_images(record, image_dir, idx, verbose=True):
    """Write a record's embedded image to ``image_dir`` as ``image_<idx>.png``.

    The image payload is read from ``record['image']``, which may be either a
    ``bytes`` object or a dict holding the payload under the key 'bytes'.
    On success the record gains 'image_filename' (the path of the written
    file) and its 'image' entry is removed. On any failure the problem is
    printed and the record is returned unchanged.

    Args:
        record: Record dictionary; mutated in place.
        image_dir: Directory in which the PNG file is written.
        idx: Number used in the output file name.
        verbose: When True (default), also print per-record progress lines
            (the payload preview and the "Saved image" message). Problems
            are printed regardless of this flag.

    Returns:
        The same ``record`` dictionary.
    """
    if 'image' not in record:
        print("No image data found in record")
        return record

    image_data = record['image']
    if verbose:
        # Print type and partial content
        print(f"Processing image data of type: {type(image_data)} with value: {str(image_data)[:100]}")

    # Both supported layouts reduce to one payload; `source` only labels the
    # error message so it stays the same as before for each layout.
    if isinstance(image_data, bytes):
        image_bytes, source = image_data, "bytes"
    elif isinstance(image_data, dict) and 'bytes' in image_data:
        image_bytes, source = image_data['bytes'], "dict"
    else:
        print(f"Unexpected image data type: {type(image_data)}")
        return record

    try:
        image_filename = os.path.join(image_dir, f"image_{idx}.png")
        # The context manager closes the decoded image once it has been saved
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(image_filename)
        record['image_filename'] = image_filename
        del record['image']  # Remove the byte data to save space
        if verbose:
            print(f"Saved image to {image_filename}")
    except Exception as e:
        print(f"Error opening image from {source}: {e}")
    return record

# Write each record's image to disk; the record's position is its file number
for idx, record in enumerate(translated_data):
    convert_bytes_to_images(record, image_dir, idx)

# Give every record an `image_filename` entry (empty string when missing or falsy)
for record in translated_data:
    if not record.get('image_filename'):
        record['image_filename'] = ""

# Build a Dataset from the records and expose it as the 'test' split
translated_dataset = Dataset.from_pandas(pd.DataFrame(translated_data))
translated_dataset_dict = DatasetDict({'test': translated_dataset})

# Write the dataset to `output_dir`
translated_dataset_dict.save_to_disk(output_dir)
print(f"Translated dataset saved to {output_dir}")

# Read the dataset back from the directory it was just saved to
disk_translated_dataset_dict = load_from_disk(output_dir)

# Debug aid: list the fields of the first reloaded record
print("Keys in the first record:", disk_translated_dataset_dict['test'][0].keys())

# Example: open the image file a record points to after loading the dataset
def display_image_from_record(record):
    """Open and show the image file named by ``record['image_filename']``.

    Prints a message and does nothing else when the record has no
    'image_filename' key or when the path is not an existing file. Errors
    raised while opening or showing the image are printed, not propagated.

    Args:
        record: Mapping that may contain an 'image_filename' path.

    Returns:
        None.
    """
    if 'image_filename' not in record:
        print("No image_filename found in the record")
        return

    image_filename = record['image_filename']
    print(f"Trying to open image file: {image_filename}")

    if not os.path.isfile(image_filename):
        print(f"File {image_filename} does not exist")
        return

    try:
        Image.open(image_filename).show()
    except Exception as e:
        print(f"Error displaying image: {e}")

# Test loading an image from the loaded dataset:
# take the first record of the reloaded 'test' split and try to open the file it references
sample_record = disk_translated_dataset_dict['test'][0]
display_image_from_record(sample_record)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing image data of type: <class 'dict'> with value: {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xdb\x00C\x00\x01\x01\
Saved image to /content/drive/MyDrive/Datasets/flickr30k/checkpoints/google_translate/images_converted/image_28514.png
Processing image data of type: <class 'dict'> with value: {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x01\x01\
Saved image to /content/drive/MyDrive/Datasets/flickr30k/checkpoints/google_translate/images_converted/image_28515.png
Processing image data of type: <class 'dict'> with value: {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x01\x01\
Saved image to /content/drive/MyDrive/Datasets/flickr30k/checkpoints/google_translate/images_converted/image_28516.png
Processing image data of type: <class 'dict'> with value: {'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x0

Saving the dataset (0/1 shards):   0%|          | 0/31014 [00:00<?, ? examples/s]

Translated dataset saved to /content/drive/MyDrive/Datasets/flickr30k_google_translation_img
Keys in the first record: dict_keys(['alt_text', 'sentids', 'split', 'img_id', 'filename', 'original_alt_text', 'alt_text_id', 'original_alt_text_id', 'image_filename'])
Trying to open image file: /content/drive/MyDrive/Datasets/flickr30k/checkpoints/google_translate/images_converted/image_0.png
