# Data Preparation for Image Captioning models

## Installing and Importing libraries

In [1]:
import os
import json
from tqdm import tqdm
import shutil
import string
import pandas as pd
import multiprocessing

import warnings
warnings.filterwarnings("ignore")

In [5]:
# Root folder of the product data. The cells below walk it as
# <data_folder>/<category folder>/<product folder>/.
# NOTE(review): this is a raw string, so every `\\` stays as two literal
# backslashes in the path - confirm the doubled separators are intended.
data_folder = r"C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\04 Product Data"

## Loading the dataset

### Loading the text data

In [4]:
def clean_descriptions(desc):
    """Normalise a raw description string into a lower-case caption.

    The text is split on single space characters only. Each token is
    lower-cased and stripped of every character in ``string.punctuation``;
    tokens left with one character or none are dropped. The surviving
    tokens are joined back together with single spaces.

    Args:
        desc: Raw description text.

    Returns:
        The cleaned, space-separated caption (possibly an empty string).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    kept_tokens = []
    for token in desc.split(' '):
        normalised = token.lower().translate(strip_punctuation)
        if len(normalised) > 1:
            kept_tokens.append(normalised)

    return ' '.join(kept_tokens)

In [5]:
# Build one {"file_name", "text"} record per product image.
# Every image of a product shares the same cleaned caption, which is assembled
# from the product's JSON file (title, categories, description, overview).
dataset_list = []

all_product_names = []
# Set mirror of all_product_names: O(1) duplicate checks instead of scanning
# the ever-growing list for every product folder.
seen_product_names = set()

for category_folder in tqdm(os.listdir(data_folder)):
    category_path = os.path.join(data_folder, category_folder)
    for product_folder in os.listdir(category_path):

        # A product folder name that was already seen (for example under
        # another category) is processed only the first time.
        if product_folder in seen_product_names:
            continue
        seen_product_names.add(product_folder)
        all_product_names.append(product_folder)

        product_path = os.path.join(category_path, product_folder)
        product_files = os.listdir(product_path)

        # Folders holding a single file (or none) are skipped.
        if len(product_files) <= 1:
            continue

        try:
            # NOTE(review): the file is read with the platform default
            # encoding, as before - confirm whether the JSON files are UTF-8.
            with open(os.path.join(product_path, product_folder + ".json"), "r") as json_file:
                data = json.load(json_file)

            product_overview = data['product_overview']
            # Colour attributes are deliberately left out of the caption.
            product_overview = ", ".join(
                key + " is " + value
                for key, value in product_overview.items()
                if key.lower() not in ('color', 'colour')
            )
            product_description = ", ".join(data['description'])
            categories = ", ".join(data['categories'])
            product_title = data['Title']

            # Separate the parts with spaces: clean_descriptions() splits on
            # spaces and strips punctuation, so joining without whitespace
            # fused the last word of one part with the first word of the next.
            final_description = " ".join(
                [product_title, categories, product_description, product_overview]
            )
        except (OSError, ValueError, KeyError, TypeError, AttributeError):
            # Best effort: a product whose JSON file is missing, unparsable or
            # lacks the expected fields is skipped.
            continue

        cleaned_final_description = clean_descriptions(final_description)
        if not cleaned_final_description:
            continue

        image_names = [i for i in product_files if '.jpg' in i]
        dataset_list.extend(
            {"file_name": img, "text": cleaned_final_description}
            for img in image_names
        )


100%|██████████| 265/265 [12:17<00:00,  2.78s/it]


### Checking if the metadata is unique

In [6]:
# Number of {"file_name", "text"} records collected so far.
len(dataset_list)

918816

In [7]:
from more_itertools import unique_everseen

# Dictionaries are not hashable, so each record is compared through the tuple
# of its (key, value) pairs; records with identical pairs are collapsed.
unique_dicts = [
    *unique_everseen(dataset_list, key=lambda record: tuple(record.items()))
]

# Same length as dataset_list means no record was duplicated.
print(len(unique_dicts))

918816


In [8]:
# Pull out just the image file names and count how often each one occurs;
# a highest count of 1 means every file name is unique.
unique_filenames = [record['file_name'] for record in unique_dicts]
print(len(unique_filenames))
print(pd.DataFrame(unique_filenames).value_counts())

918816
Product_043945669X_0.jpg      1
Product_B09Z2W76SM_5.jpg      1
Product_B09Z641MYY_2.jpg      1
Product_B09Z641MYY_3.jpg      1
Product_B09Z641MYY_4.jpg      1
                             ..
Product_B07WRSCDZN_109.jpg    1
Product_B07WRSCDZN_11.jpg     1
Product_B07WRSCDZN_110.jpg    1
Product_B07WRSCDZN_111.jpg    1
Product_B0CV7L1DHH_5.jpg      1
Name: count, Length: 918816, dtype: int64


### Saving the metadata file (`metadata.jsonl`)

In [4]:
# Output folder that receives metadata.jsonl and the copied product images.
# The path keeps a trailing separator.
destination_image_dir = r"C:\\Users\\likhi\\Documents\\02 Pycharm Datasets\\01 Master Thesis\\06 Image_Captioning_Dataset\\imagefolder\\train\\"

In [13]:
# Write the records as JSON Lines: one JSON object per line.
# Dumping the whole list in a single json.dumps() call produced one JSON array,
# which is not the line-per-record layout that the ".jsonl" name promises.
# The folder is created first because it may not exist yet at this point.
os.makedirs(destination_image_dir, exist_ok=True)
with open(os.path.join(destination_image_dir, 'metadata.jsonl'), 'w', encoding='utf-8') as file:
    file.writelines(json.dumps(record) + "\n" for record in dataset_list)

### Creating the image directory

#### Method 1: sequential copy

In [None]:
# Copy every product image into destination_image_dir, one file at a time.
# Images that already exist in the destination are left alone, so the cell
# can be re-run to resume an interrupted copy.
# NOTE(review): this loop copies the images of every product folder, while the
# metadata cell skips some products - confirm whether images without a
# metadata record are acceptable in the destination folder.
all_product_names_for_images = []
# Set mirror of all_product_names_for_images: O(1) duplicate checks instead of
# scanning the growing list for every product folder.
seen_product_names_for_images = set()

# makedirs also creates missing parent folders and tolerates an existing one.
os.makedirs(destination_image_dir, exist_ok=True)

for category_folder in tqdm(os.listdir(data_folder)):
    category_path = os.path.join(data_folder, category_folder)
    for product_folder in os.listdir(category_path):
        # A product folder name that was already seen is handled only once.
        if product_folder in seen_product_names_for_images:
            continue
        seen_product_names_for_images.add(product_folder)
        all_product_names_for_images.append(product_folder)

        product_path = os.path.join(category_path, product_folder)
        images = [i for i in os.listdir(product_path) if '.jpg' in i]
        for img in images:
            image_path = os.path.join(product_path, img)
            target_path = os.path.join(destination_image_dir, img)
            if os.path.exists(target_path):
                continue
            try:
                shutil.copy(image_path, target_path)
            except OSError as error:
                # Best effort: report which file failed and keep going.
                print("Error with file", image_path, error)

 24%|██▍       | 63/265 [14:00<1:50:24, 32.79s/it]

#### Method 2: parallel copy with a process pool

In [None]:
# Copy every product image into destination_image_dir using a process pool.
# The loop only queues copy tasks, so the progress bar tracks queuing; the
# pool is closed and joined afterwards so that all queued copies finish.
num_processes = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=num_processes)

all_product_names_for_images = []
# Set mirror of all_product_names_for_images: O(1) duplicate checks instead of
# scanning the growing list for every product folder.
seen_product_names_for_images = set()


def _report_copy_error(error):
    """Print the exception raised by a copy task in a worker process."""
    print("Error with file", error)


try:
    for category_folder in tqdm(os.listdir(data_folder)):
        category_path = os.path.join(data_folder, category_folder)
        for product_folder in os.listdir(category_path):
            # A product folder name that was already seen is handled only once.
            if product_folder in seen_product_names_for_images:
                continue
            seen_product_names_for_images.add(product_folder)
            all_product_names_for_images.append(product_folder)

            product_path = os.path.join(category_path, product_folder)
            images = [i for i in os.listdir(product_path) if '.jpg' in i]
            for img in images:
                image_path = os.path.join(product_path, img)
                target_path = os.path.join(destination_image_dir, img)
                if not os.path.exists(target_path):
                    # `copy_file` is not defined in the visible cells, so the
                    # task is shutil.copy, the same call the sequential
                    # method uses. Failures happen inside the worker, so they
                    # are reported through error_callback; a try/except
                    # around apply_async would never see them.
                    pool.apply_async(
                        shutil.copy,
                        args=(image_path, target_path),
                        error_callback=_report_copy_error,
                    )
except BaseException:
    # Interrupted or failed while queuing: drop the outstanding tasks.
    pool.terminate()
    raise
else:
    # No more tasks will be submitted; let the workers drain the queue.
    pool.close()
finally:
    # Wait for the worker processes to exit.
    pool.join()

 38%|███▊      | 100/265 [03:56<02:38,  1.04it/s]

In [6]:
# Number of entries in the destination folder, minus one.
# NOTE(review): the "- 1" presumably excludes metadata.jsonl, which is written
# to the same folder - confirm no other non-image file lives there.
len(os.listdir(destination_image_dir)) - 1

631170