In [1]:
import os
from PIL import Image
import pandas as pd
import random
import numpy as np
import shutil

In [4]:
input_dir = r"C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED"
output_dir = r"C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED224"
base_directory = r"C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\PRODUCT_IMAGES\AMAZON_IMAGES\PROCESSED_IMAGES\FINAL_DATASET"


In [72]:
def rename_and_resize_images(input_dir, output_base_dir, target_size=(224, 224)):
    for root, dirs, files in os.walk(input_dir):
        for dir_name in dirs:
            # Create corresponding subdirectories in the output directory
            subdir_path = os.path.join(root, dir_name)
            relative_path = os.path.relpath(subdir_path, input_dir)
            output_subdir = os.path.join(output_base_dir, relative_path + '_processed')
            
            if not os.path.exists(output_subdir):
                os.makedirs(output_subdir)

        for file_index, file in enumerate(files):
            file_path = os.path.join(root, file)
            try:
                # Load image using PIL
                image = Image.open(file_path)
                
                # Resize image with padding
                image = resize_and_pad_image(image, target_size)
                
                # Determine the output path and save the image with a new name
                category = os.path.basename(root)[:4]  # Get first 3 letters of the category
                new_name = f'{category}_{file_index}_abo.jpeg'
                relative_path = os.path.relpath(root, input_dir)
                output_subdir = os.path.join(output_base_dir, relative_path + '_processed')
                output_path = os.path.join(output_subdir, new_name)
                image.save(output_path, 'JPEG')
            except Exception as e:
                print(f'Error processing {file_path}: {e}')

def resize_and_pad_image(image, target_size, padding_color=(255, 255, 255)):
    # Resize image while maintaining aspect ratio
    image.thumbnail(target_size, Image.Resampling.LANCZOS)
    
    # Create a new image with white background
    new_image = Image.new('RGB', target_size, padding_color)
    
    # Calculate position to paste the resized image on the new image
    left = (target_size[0] - image.size[0]) // 2
    top = (target_size[1] - image.size[1]) // 2
    new_image.paste(image, (left, top))
    
    return new_image


In [79]:
# Class name mapping
class_mapping = {
    'bab_processed': 'BABY_PRODUCTS',
    'bea_processed': 'BEAUTY_HEALTH',
    'clo_processed': 'CLOTHING_ACCESSORIES_JEWELLERY',
    'ele_processed': 'ELECTRONICS',
    'gro_processed': 'GROCERY_FOOD',
    'hob_processed': 'HOBBY_ARTS_STATIONERY',
    'hom_processed': 'HOME_KITCHEN_TOOLS',
    'pet_processed': 'PET_SUPPLIES',
    'spo_processed': 'SPORTS_OUTDOOR'
}

# Ensure output directories exist
sub_dirs = ['train', 'val', 'check']
for sub_dir in sub_dirs:
    for class_name in class_mapping.values():
        os.makedirs(os.path.join(output_directory, sub_dir, class_name), exist_ok=True)

In [2]:
target_directory = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\PRODUCT_IMAGES\AMAZON_IMAGES\PROCESSED_IMAGES\FINAL_DATASET_3_SPLITTED'
base_directory = r"C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\PRODUCT_IMAGES\AMAZON_IMAGES\PROCESSED_IMAGES\FINAL_DATASET"

os.listdir(base_directory)

['bab_processed',
 'bea_processed',
 'clo_processed',
 'ele_processed',
 'gro_processed',
 'hob_processed',
 'hom_processed',
 'pet_processed',
 'spo_processed']

In [3]:
# Class name mappings
class_mapping = {
    'bab_processed': 'BABY_PRODUCTS',
    'bea_processed': 'BEAUTY_HEALTH',
    'clo_processed': 'CLOTHING_ACCESSORIES_JEWELLERY',
    'ele_processed': 'ELECTRONICS',
    'gro_processed': 'GROCERY',
    'hob_processed': 'HOBBY_ARTS_STATIONERY',
    'hom_processed': 'HOME_KITCHEN_TOOLS',
    'pet_processed': 'PET_SUPPLIES',
    'spo_processed': 'SPORTS_OUTDOOR'
}

In [112]:
def downsample_class(file_list, target_count):
    return random.sample(file_list, target_count)


In [4]:
def split_dataset(file_list, train_ratio=0.77, val_ratio=0.20, check_ratio=0.03):
    random.shuffle(file_list)
    total_count = len(file_list)
    train_end = int(train_ratio * total_count)
    val_end = train_end + int(val_ratio * total_count)
    
    train_files = file_list[:train_end]
    val_files = file_list[train_end:val_end]
    check_files = file_list[val_end:]
    
    return train_files, val_files, check_files


In [5]:
def copy_files(file_list, src_directory, dest_directory):
    os.makedirs(dest_directory, exist_ok=True)
    for file in file_list:
        src_path = os.path.join(src_directory, file)
        dest_path = os.path.join(dest_directory, file)
        shutil.copy(src_path, dest_path)


In [6]:
# Iterate over each class in the mapping
for base_class, target_class in class_mapping.items():
    src_class_dir = os.path.join(base_directory, base_class)
    train_target_dir = os.path.join(target_directory, 'train', target_class)
    val_target_dir = os.path.join(target_directory, 'val', target_class)
    check_target_dir = os.path.join(target_directory, 'check', target_class)
    
    # Get the list of files in the source directory
    files = [f for f in os.listdir(src_class_dir) if os.path.isfile(os.path.join(src_class_dir, f))]
    
    # Downsample the Grocery class to 2500 images
    #if base_class == 'gro_processed':
    #    files = downsample_class(files, 2500)
    
    # Split the dataset
    train_files, val_files, check_files = split_dataset(files)
    
    # Copy the files to the target directories
    copy_files(train_files, src_class_dir, train_target_dir)
    copy_files(val_files, src_class_dir, val_target_dir)
    copy_files(check_files, src_class_dir, check_target_dir)

print("Splitting and copying completed.")


Splitting and copying completed.


In [7]:
# Check all files: is there anything apart from jpeg file
# is there anything not 224*224*3
# show class distributions

import os
import pandas as pd
from PIL import Image

def count_images_in_directory(directory):
    counts = {}
    incorrect_extension_count = 0
    incorrect_size_count = 0
    incorrect_channels_count = 0

    for class_name in os.listdir(directory):
        class_dir = os.path.join(directory, class_name)
        if os.path.isdir(class_dir):
            valid_files = []
            for file in os.listdir(class_dir):
                file_path = os.path.join(class_dir, file)
                if os.path.isfile(file_path):
                    if not file.lower().endswith('.jpeg'):
                        incorrect_extension_count += 1
                    else:
                        try:
                            with Image.open(file_path) as img:
                                if img.size != (224, 224):
                                    incorrect_size_count += 1
                                elif img.mode != 'RGB':
                                    incorrect_channels_count += 1
                                else:
                                    valid_files.append(file)
                        except Exception as e:
                            print(f"Error processing file {file_path}: {e}")
            
            counts[class_name] = len(valid_files)
    
    print(f"Number of files with incorrect extension: {incorrect_extension_count}")
    print(f"Number of images with incorrect size: {incorrect_size_count}")
    print(f"Number of images with incorrect channels: {incorrect_channels_count}")

    return counts

def create_counts_dataframe(train_counts, val_counts, check_counts):
    data = {
        'Class': [],
        'Train': [],
        'Validation': [],
        'Check': []
    }
    
    all_classes = set(train_counts.keys()).union(set(val_counts.keys())).union(set(check_counts.keys()))
    
    for class_name in all_classes:
        data['Class'].append(class_name)
        data['Train'].append(train_counts.get(class_name, 0))
        data['Validation'].append(val_counts.get(class_name, 0))
        data['Check'].append(check_counts.get(class_name, 0))
    
    df = pd.DataFrame(data)
    return df

# Directories for train, val, and check
train_dir = os.path.join(target_directory, 'train')
val_dir = os.path.join(target_directory, 'val')
check_dir = os.path.join(target_directory, 'check')

# Count images in each directory
train_counts = count_images_in_directory(train_dir)
val_counts = count_images_in_directory(val_dir)
check_counts = count_images_in_directory(check_dir)

# Create a DataFrame with the counts
df_counts = create_counts_dataframe(train_counts, val_counts, check_counts)
df_counts['Total'] = df_counts['Train'] + df_counts['Validation'] + df_counts['Check']

print(df_counts)


Number of files with incorrect extension: 0
Number of images with incorrect size: 0
Number of images with incorrect channels: 0
Number of files with incorrect extension: 0
Number of images with incorrect size: 0
Number of images with incorrect channels: 0
Number of files with incorrect extension: 0
Number of images with incorrect size: 0
Number of images with incorrect channels: 0
                            Class  Train  Validation  Check  Total
0                     ELECTRONICS   1353         351     54   1758
1                    PET_SUPPLIES   1260         327     50   1637
2           HOBBY_ARTS_STATIONERY   1091         283     43   1417
3              HOME_KITCHEN_TOOLS   1715         445     68   2228
4                         GROCERY   3978        1033    156   5167
5                  SPORTS_OUTDOOR   1235         321     49   1605
6                   BEAUTY_HEALTH   1202         312     48   1562
7  CLOTHING_ACCESSORIES_JEWELLERY   1071         278     42   1391
8            

In [9]:
df_counts['Total'].sum()

18177