# Data Preparation for Image Captioning models

## Installing and Importing libraries

In [8]:
import os
import json
from tqdm import tqdm
import shutil
import string
import random
from datasets import load_dataset

import warnings
warnings.filterwarnings("ignore")

In [2]:
# data_folder: root of the product data; it is walked below as
#   <category folder>/<product folder>/<product folder>.json plus the product's .jpg files.
# destination_image_dir: root under which the train/val/test image folders and their
#   metadata.jsonl files are created.
# NOTE: both literals are raw strings, so every "\\" stays as TWO backslash characters in
# the resulting path (a raw string cannot end in a single backslash, which is presumably why
# they were doubled). This presumably relies on Windows tolerating repeated separators — verify.
data_folder = r"C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\04 Product Data\\"
destination_image_dir = r'C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\06b Image_Captioning_Dataset\\imagefolder\\'

## Creating the train - val - test split of dataset

In [3]:
def clean_descriptions(desc):
    """Normalise a free-text description into a lowercase caption string.

    The text is split on any run of whitespace, lower-cased and stripped of
    ASCII punctuation (``string.punctuation``). Tokens that are at most one
    character long after stripping are dropped.

    Args:
        desc: Raw description text.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    table = str.maketrans('', '', string.punctuation)
    # str.split() without an argument splits on every kind of whitespace, so
    # tabs and newlines inside the text can no longer end up inside a token
    # (split(' ') only recognised the space character).
    tokens = (word.lower().translate(table) for word in desc.split())
    return ' '.join(token for token in tokens if len(token) > 1)


# Make sure every requested sub-directory exists underneath a root directory
def create_directories(root_dir, subdirs):
    """Create ``root_dir/<subdir>`` for each entry of ``subdirs``.

    Uses ``os.makedirs`` with ``exist_ok=True``, so directories that already
    exist are left alone and missing parent directories are created as well.
    """
    for target in (os.path.join(root_dir, name) for name in subdirs):
        os.makedirs(target, exist_ok=True)

# Place one image file inside the directory of the chosen split
def copy_to_split(image_path, destination_dir, split):
    """Copy ``image_path`` to ``destination_dir/<split>`` using ``shutil.copy``."""
    split_dir = os.path.join(destination_dir, split)
    shutil.copy(image_path, split_dir)


In [4]:
# Helper: turn one JSON field into plain text
def _as_text(value):
    """Return ``value`` as one string: strings unchanged, other iterables space-joined."""
    if not value:
        return ''
    if isinstance(value, str):
        return value
    return " ".join(str(item) for item in value)


# Helper: build the raw (uncleaned) caption text of a product
def _compose_raw_description(data):
    """Assemble the uncleaned caption text from a product's parsed JSON record.

    The 'Title', 'categories', 'description' and 'product_overview' fields are
    combined; overview entries whose key is 'color'/'colour' are left out.
    Parts are separated by spaces rather than commas or nothing at all,
    because the later punctuation stripping would otherwise fuse the last
    word of one part with the first word of the next.
    """
    overview = data.get('product_overview') or {}
    overview_text = " ".join(
        f"{key} is {value}"
        for key, value in overview.items()
        if key.lower() not in ('color', 'colour')
    )
    parts = (
        data.get('Title') or '',
        _as_text(data.get('categories')),
        _as_text(data.get('description')),
        overview_text,
    )
    return " ".join(part for part in parts if part)


# Function to process a single product
def process_product(product_folder, category_path, destination_image_dir, train_ratio, val_ratio):
    """Caption every image of one product and assign each image to a split.

    Reads ``<category_path>/<product_folder>/<product_folder>.json``, builds a
    cleaned caption from it and, for every file in the product folder whose
    name contains '.jpg', copies the image into the train/val/test directory
    and appends a ``{"file_name", "text"}`` line to that split's metadata file.

    The metadata lines are written to the module-level handles
    ``train_output_file``, ``val_output_file`` and ``test_output_file``, which
    must be open when this function is called.

    Args:
        product_folder: Name of the product directory (also the JSON file stem).
        category_path: Directory that contains ``product_folder``.
        destination_image_dir: Root directory holding the split sub-directories.
        train_ratio: Probability of an image going to 'train'.
        val_ratio: Probability of an image going to 'val'; the rest goes to 'test'.

    Returns:
        None. A failure while handling the product is reported on stderr and
        the product (or the rest of it) is skipped instead of aborting the run.
    """
    import sys  # local import: only needed for the failure report below

    product_path = os.path.join(category_path, product_folder)

    try:
        # Binary mode lets json.load detect the UTF-8/16/32 encoding itself instead of
        # depending on the platform default; the with-block closes the handle.
        with open(os.path.join(product_path, product_folder + ".json"), "rb") as json_file:
            data = json.load(json_file)

        cleaned_final_description = clean_descriptions(_compose_raw_description(data))
        if not cleaned_final_description:
            return  # nothing usable to caption the images with

        image_names = [name for name in os.listdir(product_path) if '.jpg' in name]

        # NOTE(review): the split is drawn per image, so images of the same product
        # (which all share one caption) can end up in different splits. Draw once per
        # product instead if leakage between train/val/test matters.
        for img in image_names:
            product_dict = {"file_name": img, "text": cleaned_final_description}

            rand = random.random()
            if rand < train_ratio:
                split, output_file = 'train', train_output_file
            elif rand < train_ratio + val_ratio:
                split, output_file = 'val', val_output_file
            else:
                split, output_file = 'test', test_output_file

            # Copy first: the metadata must never list an image that failed to copy.
            copy_to_split(os.path.join(product_path, img), destination_image_dir, split)
            output_file.write(json.dumps(product_dict) + '\n')

    except Exception as e:
        # Best effort: one broken product must not stop the whole run, but say so.
        print(f"Skipping product {product_folder!r} ({product_path}): "
              f"{type(e).__name__}: {e}", file=sys.stderr)


# Product folder names in first-seen order (kept as a list, as before)
all_product_names = []
# Companion set for O(1) duplicate checks instead of scanning the list for every folder
seen_product_names = set()

# Define train-val-test split ratios; whatever remains (1 - train_ratio - val_ratio) goes to test
train_ratio = 0.6
val_ratio = 0.2

# NOTE(review): no random.seed() call is visible in this notebook, so the split
# differs from run to run.

# Create train, validation, and test directories
split_dirs = ['train', 'val', 'test']
create_directories(destination_image_dir, split_dirs)

# process_product() writes to these three module-level handles, so the names must stay
# as they are. The with-statement closes all of them even if the loop raises.
with open(os.path.join(destination_image_dir, 'train', 'metadata.jsonl'), 'w') as train_output_file, \
        open(os.path.join(destination_image_dir, 'val', 'metadata.jsonl'), 'w') as val_output_file, \
        open(os.path.join(destination_image_dir, 'test', 'metadata.jsonl'), 'w') as test_output_file:

    for category_folder in tqdm(sorted(os.listdir(data_folder))):
        category_path = os.path.join(data_folder, category_folder)
        if not os.path.isdir(category_path):
            # A stray file next to the category folders would make os.listdir() fail.
            continue

        for product_folder in sorted(os.listdir(category_path)):
            # A product folder name that shows up under several categories is processed once.
            if product_folder in seen_product_names:
                continue
            seen_product_names.add(product_folder)
            all_product_names.append(product_folder)

            # Products are processed sequentially, one at a time
            process_product(product_folder, category_path, destination_image_dir, train_ratio, val_ratio)

print("Output files and split directories created successfully.")

100%|██████████| 265/265 [1:03:42<00:00, 14.42s/it]

Output files and split directories created successfully.





In [7]:
# Print each entry of the dataset root (names containing '.jsonl' are skipped) followed
# by the number of entries inside it. The count covers everything in the folder, not
# only image files.
for folder in os.listdir(destination_image_dir):
    if '.jsonl' in folder:
        continue
    print(folder)
    entry_count = len(os.listdir(os.path.join(destination_image_dir, folder)))
    print(entry_count)

test
183165
train
551662
val
183989


## Checking if images in the folder are present in the JSON file

In [30]:
# For every split, count the .jpg files on disk that have no entry in metadata.jsonl
folders = ['train', 'val', 'test']

for folder in folders:

    print("Checking the folder - ", folder)

    # metadata.jsonl is written as JSON Lines (one object per line), so it is parsed
    # line by line; json.load() on the whole file fails with "Extra data" as soon as
    # the file holds more than one record.
    metadata_path = os.path.join(destination_image_dir, folder, 'metadata.jsonl')
    with open(metadata_path, 'r') as metadata_file:
        json_data = [json.loads(line) for line in metadata_file if line.strip()]
    json_data_filenames = [i['file_name'] for i in json_data]
    # Set copy for O(1) membership tests; scanning the list for every image is quadratic.
    known_filenames = set(json_data_filenames)

    print(json_data_filenames[:10])

    print("Loaded the JSON file")

    images = [i for i in os.listdir(os.path.join(destination_image_dir, folder)) if '.jpg' in i]

    count = 0
    for image in tqdm(images):
        if image not in known_filenames:
            count+=1

    print("Missing images - ", count)
        

Checking the folder -  train
['Product_0545498562_1.jpg', 'Product_0545498562_2.jpg', 'Product_0545498562_5.jpg', 'Product_0545906520_0.jpg', 'Product_0692164308_0.jpg', 'Product_0692164308_1.jpg', 'Product_0692164308_3.jpg', 'Product_0692164308_4.jpg', 'Product_0692164308_5.jpg', 'Product_0767806239_0.jpg']
Loaded the JSON file


100%|██████████| 551662/551662 [55:05<00:00, 166.87it/s] 


Missing images -  0
Checking the folder -  val
['Product_043945669X_0.jpg', 'Product_0545449367_0.jpg', 'Product_0545498562_3.jpg', 'Product_0767836316_1.jpg', 'Product_0783226063_0.jpg', 'Product_0783226853_0.jpg', 'Product_0783226926_0.jpg', 'Product_0783227876_0.jpg', 'Product_0790740044_0.jpg', 'Product_0792833198_0.jpg']
Loaded the JSON file


100%|██████████| 183989/183989 [07:02<00:00, 435.60it/s]


Missing images -  0
Checking the folder -  test
['Product_0439740207_0.jpg', 'Product_0545498562_0.jpg', 'Product_0545498562_4.jpg', 'Product_0692164308_2.jpg', 'Product_0767836316_0.jpg', 'Product_0767836316_3.jpg', 'Product_078322592X_1.jpg', 'Product_078322592X_2.jpg', 'Product_0790731460_0.jpg', 'Product_0792838467_0.jpg']
Loaded the JSON file


100%|██████████| 183165/183165 [07:05<00:00, 430.92it/s]


Missing images -  0


## Saving the Dataset as a Hugging Face `datasets` Dataset

In [None]:
# Root folder of the image-captioning dataset. It is written with forward slashes and a
# trailing '/', so the `root + 'train/'`-style concatenations in the cells below yield
# complete paths without any extra separator handling.
root = r"C:/Users/likhi/Documents/02 Pycharm Datasets/01 Master Thesis/06b Image_Captioning_Dataset/imagefolder/"

In [None]:
# Set the environment variable ARROW_LARGE_MEMORY_TEST to '200GB' for this process
# NOTE(review): presumably intended to lift an Arrow memory limit; nothing in this notebook
# shows that pyarrow/datasets reads this variable at runtime — confirm it has any effect.
os.environ['ARROW_LARGE_MEMORY_TEST'] = '200GB'

# Read the value back and print it; block_size is just the string stored above
block_size = os.environ.get('ARROW_LARGE_MEMORY_TEST')
print(block_size)

### Saving the train dataset

In [None]:
# Directory of the train split
train_data_path = f"{root}train/"

# Load the train images through the "imagefolder" builder.
# NOTE(review): split='train' is presumably the only split the builder exposes when
# data_dir points at a single folder — confirm against the datasets documentation.
train_dataset = load_dataset("imagefolder", split='train', data_dir=train_data_path)

# Print a summary of the loaded dataset
print("Train Dataset:", train_dataset, sep="\n")

In [None]:
# Make sure the output directory exists, then write the train dataset to disk.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: there is no gap
# between the check and the creation, and missing parent directories are created too.
os.makedirs(root + 'train_dataset/', exist_ok=True)

train_dataset.save_to_disk(root + 'train_dataset/')

### Saving the validation dataset

In [None]:
# Directory of the validation split
val_data_path = f"{root}val/"

# Load the validation images through the "imagefolder" builder.
# NOTE(review): split='train' here names the builder's split, not the train data —
# presumably the only split exposed when data_dir points at a single folder; confirm.
val_dataset = load_dataset("imagefolder", split='train', data_dir=val_data_path)

# Print a summary of the loaded dataset
print("Validation Dataset:", val_dataset, sep="\n")

In [None]:
# Make sure the output directory exists, then write the validation dataset to disk.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: there is no gap
# between the check and the creation, and missing parent directories are created too.
os.makedirs(root + 'val_dataset/', exist_ok=True)

val_dataset.save_to_disk(root + 'val_dataset/')

### Saving the test dataset

In [None]:
# Directory of the test split
test_data_path = f"{root}test/"

# Load the test images through the "imagefolder" builder.
# NOTE(review): split='train' here names the builder's split, not the train data —
# presumably the only split exposed when data_dir points at a single folder; confirm.
test_dataset = load_dataset("imagefolder", split='train', data_dir=test_data_path)

# Print a summary of the loaded dataset
print("Test Dataset:", test_dataset, sep="\n")

In [None]:
# Make sure the output directory exists, then write the test dataset to disk.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: there is no gap
# between the check and the creation, and missing parent directories are created too.
os.makedirs(root + 'test_dataset/', exist_ok=True)

test_dataset.save_to_disk(root + 'test_dataset/')