In [1]:
import os
import random
import string
from pathlib import Path
import pandas as pd
import cv2
import numpy as np
from albumentations import (
    Compose, 
    RandomBrightnessContrast, 
    RandomScale, 
    ShiftScaleRotate,
    HorizontalFlip, 
    RandomResizedCrop, 
    Blur, 
    ColorJitter
)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def generate_random_filename():
    """Return a random base name made of ten lowercase ASCII letters.

    The result carries no extension; characters are drawn one at a time
    from the module-level ``random`` generator.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(10):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [3]:
def get_all_files_in_folder(folder_path):
    """Return the paths of all files found under ``folder_path``, recursively.

    Each path is the walked directory joined with the file name, in the
    order ``os.walk`` yields them. Directories themselves are not listed.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]

In [4]:
def randomise_files(folder_path):
    """Rename every file under ``folder_path`` (recursively) to a random base name.

    Each file stays in its own directory and keeps its extension; only the
    base name is replaced by a value from ``generate_random_filename()``.

    A candidate name is rejected when it was already handed out during this
    call or when a file with the resulting path already exists on disk.
    The on-disk check matters because ``os.rename`` may silently replace an
    existing destination, which would destroy a file that has not been
    processed yet (e.g. when the random generator is seeded and replays a
    previous sequence of names).

    Args:
        folder_path: Root directory whose files are renamed in place.
    """
    files = get_all_files_in_folder(folder_path)
    renamed_files = set()  # Base names handed out during this call
    for file_path in files:
        file_dir, file_name = os.path.split(file_path)
        file_extension = os.path.splitext(file_name)[1]

        # Keep drawing until the name is unused both in this run and on disk.
        while True:
            new_filename = generate_random_filename()
            new_file_path = os.path.join(file_dir, new_filename + file_extension)
            if new_filename not in renamed_files and not os.path.lexists(new_file_path):
                break
        renamed_files.add(new_filename)

        os.rename(file_path, new_file_path)

In [5]:
def rename_files_with_variable_and_count(folder_path, category):
    """Rename all files under ``folder_path`` to ``<label>_<NNNN><ext>``.

    ``<label>`` is ``category`` with its last character removed (so a plural
    such as ``'tops'`` becomes ``'top'``), ``<NNNN>`` is a counter starting at
    1 and zero-padded to four digits, and ``<ext>`` is the file's original
    extension. Files are numbered in the order returned by
    ``get_all_files_in_folder`` and stay in their own directory.

    The rename is done in two passes. Renaming directly to the final name
    could overwrite a file that is still waiting to be processed but already
    carries one of the target names (``os.rename`` may replace an existing
    destination without raising). Moving everything to unique staging names
    first guarantees no final name is occupied when it is assigned.

    Args:
        folder_path: Root directory whose files are renamed in place.
        category: Category name; its last character is dropped to form the label.
    """
    import uuid  # local import: only used to build collision-free staging names

    label = category[:-1]
    files = get_all_files_in_folder(folder_path)

    # Pass 1: park every file under a unique temporary name in its own directory.
    token = uuid.uuid4().hex
    staged = []
    for index, file_path in enumerate(files, start=1):
        file_dir, file_name = os.path.split(file_path)
        file_extension = os.path.splitext(file_name)[1]
        staging_path = os.path.join(
            file_dir, f"__renaming_{token}_{index}{file_extension}"
        )
        os.rename(file_path, staging_path)
        staged.append((staging_path, file_dir, file_extension))

    # Pass 2: assign the final sequential names; none of them can be taken now.
    for count, (staging_path, file_dir, file_extension) in enumerate(staged, start=1):
        new_filename = f"{label}_{str(count).zfill(4)}"
        new_file_path = os.path.join(file_dir, new_filename + file_extension)
        os.rename(staging_path, new_file_path)

In [6]:
def update_csv(train_or_test):
    """Write ``<train_or_test>_labels.csv`` describing the images of one split.

    Every immediate subfolder of ``dataset/<train_or_test>`` is scanned for
    files ending in ``.jpeg`` or ``.jpg`` (case-sensitive). One row is written
    per image with the columns:

    * ``File_Path`` - ``"<train_or_test>/<file name>"``
    * ``Label``     - the part of the file name before the first underscore
    * ``Augmented`` - 1 if the file name contains ``'augmented'``, else 0

    Subfolders and file names are processed in sorted order so the CSV is
    identical from run to run instead of following the filesystem's
    enumeration order.

    NOTE(review): ``File_Path`` does not include the category subfolder the
    file was found in - confirm this is what the consumers of the CSV expect.

    Args:
        train_or_test: Name of the split folder inside ``dataset`` (the
            notebook calls this with ``'train'`` and ``'test'``).
    """
    folder_path = f'dataset/{train_or_test}'
    records = []

    subfolders = sorted(
        item for item in Path(folder_path).iterdir() if item.is_dir()
    )

    for folder in subfolders:
        for file_name in sorted(os.listdir(folder)):
            if not file_name.endswith((".jpeg", ".jpg")):
                continue
            records.append({
                'File_Path': f"{train_or_test}/{file_name}",
                'Label': file_name.split('_')[0],
                'Augmented': 1 if 'augmented' in file_name else 0,
            })

    # Passing the columns explicitly keeps the header even when no image was found.
    df = pd.DataFrame(records, columns=['File_Path', 'Label', 'Augmented'])

    csv_file_path = f"{train_or_test}_labels.csv"
    df.to_csv(csv_file_path, index=False)

In [7]:
def create_augmented_images(category, folder_path):
    """Write randomly augmented JPEG variants next to each image in ``folder_path``.

    For every file directly inside ``folder_path`` whose name ends in
    ``.jpeg`` or ``.jpg``, the image is decoded once and passed through the
    augmentation pipeline ``duplicate_count`` times (9 for the ``'bags'`` and
    ``'others'`` categories, 4 otherwise). Each result is saved as
    ``<label>_<id>_augmented_<i>.jpeg`` in the same folder and its name is
    printed.

    ``<label>`` and ``<id>`` are the first two underscore-separated parts of
    the file name with the extension removed, so ids of any length are kept
    intact.

    Files that are skipped:
      * names containing ``'augmented'`` - these are outputs of an earlier
        call; processing them would overwrite existing variants;
      * files OpenCV cannot decode (``cv2.imread`` returned ``None``).

    Args:
        category: Category name; only used to choose the number of variants.
        folder_path: Directory holding the source images; outputs are written here.

    Raises:
        IndexError: If a candidate file name contains no underscore.
    """
    augmentation_transform = Compose([
        RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.4),
        RandomScale(scale_limit=0.2, p=0.4),
        ShiftScaleRotate(shift_limit=0.2, scale_limit=0.2, rotate_limit=30, p=0.4),
        HorizontalFlip(p=0.4),
        RandomResizedCrop(height=256, width=256, scale=(0.8, 1.0), p=0.4),
        Blur(blur_limit=3, p=0.4),
        ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2, p=0.4),
    ])

    duplicate_count = 9 if category in ['bags', 'others'] else 4

    # os.listdir is evaluated once, so files written below are not revisited.
    for filename in os.listdir(folder_path):
        if not filename.endswith(('.jpeg', '.jpg')):
            continue
        if 'augmented' in filename:
            # Output of a previous call; never use it as a source image.
            continue

        # Parse the name without its extension instead of slicing a fixed width.
        name_parts = os.path.splitext(filename)[0].split('_')
        label = name_parts[0]
        image_id = name_parts[1]

        # Decode once per source file rather than once per variant.
        image = cv2.imread(os.path.join(folder_path, filename))
        if image is None:
            print(f"Skipping unreadable image: {filename}")
            continue

        for i in range(duplicate_count):
            # Hand the pipeline a copy so every variant starts from the
            # untouched decoded pixels.
            augmented = augmentation_transform(image=image.copy())
            augmented_image = augmented['image']

            output_name = f"{label}_{image_id}_augmented_{i}.jpeg"
            cv2.imwrite(os.path.join(folder_path, output_name), augmented_image)
            print(output_name)


In [8]:
# Category subfolders expected inside each split directory.
categories = ['tops', 'bottoms', 'shoes', 'bags', 'others']

# NOTE: this cell renames files on disk. Running it again on a split that
# already contains augmented images gives those images plain
# "<label>_<count>" names, after which they are treated as source images.
for train_or_test in ['train', 'test']:
    for category in categories:
        folder_path = f'./dataset/{train_or_test}/{category}'

        # Random names first, then sequential "<label>_<count>" names.
        randomise_files(folder_path)
        rename_files_with_variable_and_count(folder_path, category)

        # Augmented variants are generated for the training split only.
        if train_or_test != "train":
            continue
        create_augmented_images(category, folder_path)

    # One label CSV per split, written after all its categories are processed.
    update_csv(train_or_test)

top_0218_augmented_0.jpeg
top_0218_augmented_1.jpeg
top_0218_augmented_2.jpeg
top_0218_augmented_3.jpeg
top_0131_augmented_0.jpeg
top_0131_augmented_1.jpeg
top_0131_augmented_2.jpeg
top_0131_augmented_3.jpeg
top_0424_augmented_0.jpeg
top_0424_augmented_1.jpeg
top_0424_augmented_2.jpeg
top_0424_augmented_3.jpeg
top_0074_augmented_0.jpeg
top_0074_augmented_1.jpeg
top_0074_augmented_2.jpeg
top_0074_augmented_3.jpeg
top_0166_augmented_0.jpeg
top_0166_augmented_1.jpeg
top_0166_augmented_2.jpeg
top_0166_augmented_3.jpeg
top_0536_augmented_0.jpeg
top_0536_augmented_1.jpeg
top_0536_augmented_2.jpeg
top_0536_augmented_3.jpeg
top_0023_augmented_0.jpeg
top_0023_augmented_1.jpeg
top_0023_augmented_2.jpeg
top_0023_augmented_3.jpeg
top_0473_augmented_0.jpeg
top_0473_augmented_1.jpeg
top_0473_augmented_2.jpeg
top_0473_augmented_3.jpeg
top_0189_augmented_0.jpeg
top_0189_augmented_1.jpeg
top_0189_augmented_2.jpeg
top_0189_augmented_3.jpeg
top_0259_augmented_0.jpeg
top_0259_augmented_1.jpeg
top_0259_aug