In [None]:

import os
import shutil
from glob import glob
import random

# Paths to training directories for each class
TRAIN_DIR = 'chest_xray_Raw/train'
NORMAL_DIR = os.path.join(TRAIN_DIR, 'NORMAL')
PNEUMONIA_DIR = os.path.join(TRAIN_DIR, 'PNEUMONIA')

# Tunables for the oversampling step
IMAGE_PATTERN = '*.jpeg'     # adjust file extension if necessary
DUPLICATE_PREFIX = 'aug_'    # prefix given to every duplicated image
RANDOM_SEED = 42             # fixed seed so Restart & Run All picks the same images


def list_images(class_dir, pattern=IMAGE_PATTERN):
    """Return the image paths in ``class_dir`` matching ``pattern``, sorted.

    ``glob`` returns results in arbitrary order; sorting makes the seeded
    sampling below reproducible across runs and machines. A missing directory
    simply yields an empty list.
    """
    return sorted(glob(os.path.join(class_dir, pattern)))


def duplicate_images(source_files, target_dir, n_copies, seed=RANDOM_SEED):
    """Copy ``n_copies`` randomly chosen images into ``target_dir``.

    Args:
        source_files: Candidate image paths to duplicate.
        target_dir: Directory that receives the copies.
        n_copies: Number of duplicates to create; values <= 0 create nothing.
        seed: Seed for the private random generator used for selection.

    Returns:
        List of the newly created file paths, in creation order.

    Raises:
        ValueError: If copies are requested but ``source_files`` is empty.
    """
    if n_copies <= 0:
        return []

    # Prefer files that are not themselves earlier duplicates, so a re-run
    # does not produce "aug_3_aug_1_x.jpeg" chains. Fall back to the full
    # list in case the dataset's own file names happen to use the prefix.
    originals = [
        path for path in source_files
        if not os.path.basename(path).startswith(DUPLICATE_PREFIX)
    ]
    pool = originals or list(source_files)
    if not pool:
        raise ValueError(
            f"Cannot oversample: no source images available for {target_dir}."
        )

    rng = random.Random(seed)  # private generator; global random state untouched
    created = []
    next_index = 0
    for _ in range(n_copies):
        src_path = rng.choice(pool)
        src_name = os.path.basename(src_path)
        # Advance the index until the name is free so that duplicates left by
        # a previous (possibly interrupted) run are never overwritten.
        while True:
            dest_path = os.path.join(
                target_dir, f"{DUPLICATE_PREFIX}{next_index}_{src_name}"
            )
            next_index += 1
            if not os.path.exists(dest_path):
                break
        shutil.copy(src_path, dest_path)
        created.append(dest_path)
    return created


# Get the list of image files for each class
normal_files = list_images(NORMAL_DIR)
pneumonia_files = list_images(PNEUMONIA_DIR)

# Count the number of images in each class
normal_count = len(normal_files)
pneumonia_count = len(pneumonia_files)

print(f"Initial Image Count: NORMAL = {normal_count}, PNEUMONIA = {pneumonia_count}")

# Determine the minority and majority classes (a tie selects PNEUMONIA as the
# "minority", which results in zero copies)
if normal_count < pneumonia_count:
    minority_files = normal_files
    minority_dir = NORMAL_DIR
    majority_count = pneumonia_count
else:
    minority_files = pneumonia_files
    minority_dir = PNEUMONIA_DIR
    majority_count = normal_count

# Calculate the difference to achieve balance
oversample_count = majority_count - len(minority_files)
print(f"Oversampling {minority_dir} by duplicating {oversample_count} images.")

# Oversample by duplicating images from the minority class.
# NOTE(review): this writes into the training directory in place; the copies
# are exact duplicates (no augmentation is applied despite the "aug_" prefix).
duplicate_images(minority_files, minority_dir, oversample_count)

# Verify and display the new distribution
new_normal_count = len(list_images(NORMAL_DIR))
new_pneumonia_count = len(list_images(PNEUMONIA_DIR))

print(f"Updated Image Count: NORMAL = {new_normal_count}, PNEUMONIA = {new_pneumonia_count}")


Initial Image Count: NORMAL = 1341, PNEUMONIA = 3875
Oversampling chest_xray_Raw/train\NORMAL by duplicating 2534 images.
Updated Image Count: NORMAL = 3875, PNEUMONIA = 3875
