# Data Preparation for Image Captioning models

## Importing libraries

In [1]:
import os
import json
from tqdm import tqdm
import shutil
import string
import random
import multiprocessing

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root of the raw product data. The main loop below lists its entries as
# category folders and, inside each of those, product folders.
# NOTE: these are raw strings, so every `\\` stays as two backslash characters
# in the resulting path value.
# NOTE(review): presumably Windows tolerates the doubled separators — confirm
# before reusing these values outside of os.path / open calls.
data_folder = r"C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\04 Product Data\\"
# Output root: the train/val/test image directories and the per-split
# *_metadata.jsonl files are created directly under this folder.
destination_image_dir = r'C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\06b Image_Captioning_Dataset\\imagefolder\\'

## Creating the train/val/test split of the dataset

In [3]:
def clean_descriptions(desc):
    """Normalise a free-text description into a lowercase caption string.

    The text is split on single space characters. Each token is lowercased,
    stripped of every character in ``string.punctuation``, and discarded when
    fewer than two characters remain. The surviving tokens are joined back
    together with single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = (
        token.lower().translate(strip_punctuation) for token in desc.split(' ')
    )
    return ' '.join(token for token in normalised if len(token) > 1)


# Function to create train, validation, and test directories
def create_directories(root_dir, subdirs):
    """Make sure every name in ``subdirs`` exists as a directory under ``root_dir``.

    Missing intermediate directories are created as well, and directories that
    already exist are left untouched.
    """
    for name in subdirs:
        target = os.path.join(root_dir, name)
        os.makedirs(target, exist_ok=True)

# Function to copy files to train, validation, and test directories
def copy_to_split(image_path, destination_dir, split):
    """Copy the file at ``image_path`` to the ``split`` entry of ``destination_dir``.

    When ``destination_dir/split`` is an existing directory, ``shutil.copy``
    places the file inside it under its original base name.
    """
    split_target = os.path.join(destination_dir, split)
    shutil.copy(image_path, split_target)


In [4]:
# Function to process a single product
def process_product(product_folder, category_path, destination_image_dir, train_ratio, val_ratio):
    """Caption every image of one product and assign each image to one split.

    Reads ``<category_path>/<product_folder>/<product_folder>.json`` and builds
    a single cleaned caption from its ``Title``, ``categories``,
    ``description`` and ``product_overview`` fields (``color``/``colour``
    entries of the overview are left out). Then, for every file in the product
    folder whose name contains ``.jpg``, the image is copied into the
    ``train``, ``val`` or ``test`` directory and a
    ``{"file_name": ..., "text": ...}`` JSON line is appended to the metadata
    file of that same split.

    Relies on module-level names: ``clean_descriptions``, ``copy_to_split`` and
    the open file handles ``train_output_file``, ``val_output_file`` and
    ``test_output_file``.

    Args:
        product_folder: Name of the product directory; also the stem of its
            JSON file.
        category_path: Directory that contains ``product_folder``.
        destination_image_dir: Root directory holding the split directories.
        train_ratio: Probability of assigning an image to ``train``.
        val_ratio: Probability of assigning an image to ``val``; the remaining
            probability goes to ``test``.

    Returns:
        None. Any error while handling the product is reported on stdout and
        the product is skipped, so one bad folder does not stop the whole run.
    """
    product_path = os.path.join(category_path, product_folder)

    try:
        metadata_files = {
            'train': train_output_file,
            'val': val_output_file,
            'test': test_output_file,
        }

        # Context manager so the JSON file handle is closed right away.
        with open(os.path.join(product_path, product_folder + ".json"), "r") as json_file:
            data = json.load(json_file)

        # A missing or empty overview contributes nothing instead of failing
        # the whole product on `''.items()`.
        overview = data.get('product_overview') or {}
        product_overview = ", ".join(
            f"{key} is {value}"
            for key, value in overview.items()
            if key.lower() not in ('color', 'colour')
        )

        # clean_descriptions strips punctuation and splits on spaces only, so
        # every separator must contain a space; otherwise the last word of one
        # part fuses with the first word of the next.
        final_description = " ".join([
            _as_text(data.get('Title')),
            _as_text(data.get('categories')),
            _as_text(data.get('description')),
            product_overview,
        ])

        cleaned_final_description = clean_descriptions(final_description)
        if not cleaned_final_description:
            return

        image_names = [name for name in os.listdir(product_path) if '.jpg' in name]

        # NOTE(review): images are copied under their original base name, so
        # two products with an identically named image would overwrite each
        # other inside a split directory — confirm names are unique.
        for img in image_names:
            # One draw per image: the copied file and its metadata line must
            # end up in the same split.
            split = _choose_split(train_ratio, val_ratio)

            # Copy first so the metadata only lists images that really exist
            # in the split directory.
            copy_to_split(os.path.join(product_path, img), destination_image_dir, split)
            product_dict = {"file_name": img, "text": cleaned_final_description}
            metadata_files[split].write(json.dumps(product_dict) + '\n')

    except Exception as e:
        # Best effort: report and move on rather than aborting a long run.
        print(f"Skipping product {product_folder!r} ({product_path}): {e!r}")


def _as_text(value):
    """Flatten a JSON field (string, list of items, or missing) into one string."""
    if not value:
        return ''
    if isinstance(value, str):
        return value
    return ", ".join(str(item) for item in value)


def _choose_split(train_ratio, val_ratio):
    """Draw 'train', 'val' or 'test' with probabilities train_ratio, val_ratio and the rest."""
    draw = random.random()
    if draw < train_ratio:
        return 'train'
    if draw < train_ratio + val_ratio:
        return 'val'
    return 'test'


# Product names already handled, in first-seen order. The set mirrors the list
# so the duplicate check stays O(1) however many products have been seen.
all_product_names = []
seen_product_names = set()

# Define train-val-test split ratios (test receives the remaining share)
train_ratio = 0.6
val_ratio = 0.2

# Create train, validation, and test directories
split_dirs = ['train', 'val', 'test']
create_directories(destination_image_dir, split_dirs)

# Open separate output files for train, validation, and test data.
# process_product writes to these module-level handles; the `with` block
# closes all three even if the loop below raises.
with open(os.path.join(destination_image_dir, 'train_metadata.jsonl'), 'w') as train_output_file, \
        open(os.path.join(destination_image_dir, 'val_metadata.jsonl'), 'w') as val_output_file, \
        open(os.path.join(destination_image_dir, 'test_metadata.jsonl'), 'w') as test_output_file:

    for category_folder in tqdm(sorted(os.listdir(data_folder))):
        category_path = os.path.join(data_folder, category_folder)

        # A stray file directly under data_folder is not a category.
        if not os.path.isdir(category_path):
            continue

        for product_folder in sorted(os.listdir(category_path)):
            # A product name that appears under several categories is only
            # processed the first time it is seen.
            if product_folder in seen_product_names:
                continue
            seen_product_names.add(product_folder)
            all_product_names.append(product_folder)

            # Products are processed one after another in this process.
            process_product(product_folder, category_path, destination_image_dir, train_ratio, val_ratio)

print("Output files and split directories created successfully.")

100%|██████████| 265/265 [1:03:42<00:00, 14.42s/it]

Output files and split directories created successfully.





In [6]:
# Sanity check: print every entry of the dataset folder and, for each entry
# that is not a .jsonl metadata file, the number of items it contains.
for folder in os.listdir(destination_image_dir):
    print(folder)
    if '.jsonl' in folder:
        continue
    entry_path = os.path.join(destination_image_dir, folder)
    print(len(os.listdir(entry_path)))

test
183165
test_metadata.jsonl
train
551662
train_metadata.jsonl
val
183989
val_metadata.jsonl
