# Preprocessing of the data

## Looking at the data 

I will start by exploring the data to get familiar with it and see what I am working with.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from PIL import Image, ImageEnhance
import random
import hashlib
from tqdm import tqdm
import os

ImportError: C extension: None not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.

In [None]:
# Load the data
data_path = "/Users/michelangelozampieri/Downloads/bttai-ajl-2025/train/train"
# Every sub-directory of data_path is treated as one category. Filtering on
# isdir() skips stray files such as macOS's .DS_Store and, unlike
# list.remove('.DS_Store'), does not raise when no such file is present.
categories = [
    entry for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
]
for c in categories:
    print(c)

In [None]:
# Get the count of each category and store results in a dataframe.
# Only files with an image extension are counted, so hidden files and the
# 'augmented' sub-directory (present once augmentation has run) do not
# inflate the numbers when this cell is re-executed.
image_counts = {
    category: sum(
        name.endswith(('.png', '.jpg', '.jpeg'))
        for name in os.listdir(os.path.join(data_path, category))
    )
    for category in categories
}
df = pd.DataFrame(list(image_counts.items()), columns=['Category', 'Image Count'])
print(df)

Looking at a sample image

In [None]:
# Load an image. The path is built from data_path so that changing the
# dataset location in one place is enough.
sample_image_path = os.path.join(data_path, 'acne', '0cff6f3c9bb267f68c77740fc9c58587.jpg')
img = Image.open(sample_image_path)
plt.imshow(img)
plt.show()  # render the figure without echoing the AxesImage repr

Display one random image from each of up to ten randomly chosen categories

In [None]:
# Display one random image from each of up to 10 randomly chosen categories
n_rows, n_cols = 2, 5
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 6))  # Adjust as needed
grid = axes.ravel()

# Hide every slot first so that slots left unused (fewer categories than
# slots, or a category without images) do not show empty frames.
for ax in grid:
    ax.axis("off")

chosen_categories = random.sample(categories, min(len(grid), len(categories)))
for ax, category in zip(grid, chosen_categories):
    category_dir = os.path.join(data_path, category)
    # Restrict the draw to image files; a plain listdir() may also contain
    # '.DS_Store' or the 'augmented' directory, which Image.open cannot read.
    candidates = [f for f in os.listdir(category_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]
    ax.set_title(category)
    if not candidates:
        continue
    image_path = os.path.join(category_dir, random.choice(candidates))
    img = Image.open(image_path)
    ax.imshow(img)

plt.tight_layout()
plt.show()

# Image Augmentation

I will perform some image augmentation, which takes the existing images and flips, rotates, and slightly alters them (brightness, contrast, saturation, hue) to create more images and a better training set.

In [None]:
def flip_UD(image_path):
    """Open the image at ``image_path`` and return a copy mirrored top-to-bottom.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The PIL image produced by ``transpose(Image.FLIP_TOP_BOTTOM)``.
    """
    return Image.open(image_path).transpose(Image.FLIP_TOP_BOTTOM)

def flip_LR(image_path):
    """Open the image at ``image_path`` and return a copy mirrored left-to-right.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The PIL image produced by ``transpose(Image.FLIP_LEFT_RIGHT)``.
    """
    return Image.open(image_path).transpose(Image.FLIP_LEFT_RIGHT)

def rotate(image_path, degrees):
    """Open the image at ``image_path`` and return it rotated by ``degrees``.

    Args:
        image_path: Path of the image file to read.
        degrees: Rotation angle handed straight to ``Image.rotate``.

    Returns:
        The rotated PIL image.
    """
    # NOTE(review): rotate() is called without ``expand``, so Pillow should keep
    # the original canvas size; for non-square inputs a 90/270 degree turn would
    # then crop the picture and fill the corners — confirm this is intended.
    source = Image.open(image_path)
    rotated = source.rotate(degrees)
    return rotated

def adjust_brightness(image_path, factor=0.5):
    """Open the image at ``image_path`` and return it with its brightness scaled.

    Args:
        image_path: Path of the image file to read.
        factor: Enhancement factor passed to ``ImageEnhance.Brightness.enhance``
            (defaults to 0.5).

    Returns:
        The brightness-adjusted PIL image.
    """
    return ImageEnhance.Brightness(Image.open(image_path)).enhance(factor)

def adjust_contrast(image_path, factor=1.5):
    """Open the image at ``image_path`` and return it with its contrast scaled.

    Args:
        image_path: Path of the image file to read.
        factor: Enhancement factor passed to ``ImageEnhance.Contrast.enhance``
            (defaults to 1.5).

    Returns:
        The contrast-adjusted PIL image.
    """
    return ImageEnhance.Contrast(Image.open(image_path)).enhance(factor)

def adjust_saturation(image_path, factor=1.5):
    """Open the image at ``image_path`` and return it with its saturation scaled.

    Args:
        image_path: Path of the image file to read.
        factor: Enhancement factor passed to ``ImageEnhance.Color.enhance``
            (defaults to 1.5).

    Returns:
        The saturation-adjusted PIL image.
    """
    return ImageEnhance.Color(Image.open(image_path)).enhance(factor)

def adjust_hue(image_path, factor=0.1):
    """Shift the hue of an image and return the result as an RGB image.

    The previous implementation called ``cv2``, which this notebook never
    imports, so it failed with a NameError. This version uses Pillow only.

    Args:
        image_path: Path of the image file to read.
        factor: Fraction of the full hue circle to rotate by (0.1 = 10 %).
            Negative values rotate in the opposite direction.

    Returns:
        A new RGB PIL image with the hue channel rotated.
    """
    with Image.open(image_path) as source:
        # Go through RGB first: not every source mode (palette, RGBA, ...)
        # is expected to convert to HSV directly.
        hsv = source.convert("RGB").convert("HSV")

    hue, saturation, value = hsv.split()
    # Pillow stores hue as 0-255 (OpenCV uses 0-179), so one full turn is 256 steps.
    shift = int(factor * 256)
    hue = hue.point(lambda level: (level + shift) % 256)  # wrap around the hue circle
    return Image.merge("HSV", (hue, saturation, value)).convert("RGB")

def save_image(img, original_path, output_dir, suffix):
    """Save ``img`` into ``output_dir`` under the original file name plus a suffix.

    For ``original_path`` ``.../photo.jpg`` and ``suffix`` ``"rotate_90"`` the
    image is written to ``<output_dir>/photo_rotate_90.jpg``.

    Args:
        img: Object exposing ``save(path)`` (a PIL image in this notebook).
        original_path: Path of the source file; only its base name and
            extension are used.
        output_dir: Destination directory, created if it does not exist.
        suffix: Text inserted between the base name and the extension.
    """
    # exist_ok=True replaces the separate exists() check and is safe to call
    # repeatedly for the same directory.
    os.makedirs(output_dir, exist_ok=True)
    name, ext = os.path.splitext(os.path.basename(original_path))
    img.save(os.path.join(output_dir, f"{name}_{suffix}{ext}"))


In [None]:
def augment(category, data_path):
    """Write nine augmented variants of every image in a category.

    For each ``.png``/``.jpg``/``.jpeg`` file directly inside
    ``<data_path>/<category>`` the function saves two flips, three rotations
    and brightness, contrast, saturation and hue adjustments into
    ``<data_path>/<category>/augmented``.

    Args:
        category: Name of the category sub-directory.
        data_path: Directory that contains the category sub-directories.
    """
    category_dir = os.path.join(data_path, category)

    # Augmented images live in their own sub-directory of the category
    augmented_dir = os.path.join(category_dir, 'augmented')
    os.makedirs(augmented_dir, exist_ok=True)

    # (file-name suffix, transform) pairs. Every transform receives the path of
    # the file currently being processed; the colour adjustments previously
    # read an unrelated global ``image_path`` instead of that path.
    transforms = [
        ("flip_Up_Down", flip_UD),
        ("flip_Left_Right", flip_LR),
        ("rotate_90", lambda path: rotate(path, 90)),
        ("rotate_180", lambda path: rotate(path, 180)),
        ("rotate_270", lambda path: rotate(path, 270)),
        ("adjusted_brightness", lambda path: adjust_brightness(path, factor=0.5)),
        ("adjusted_contrast", lambda path: adjust_contrast(path, factor=1.5)),
        ("adjusted_saturation", lambda path: adjust_saturation(path, factor=1.5)),
        ("adjusted_hue", lambda path: adjust_hue(path, factor=0.1)),
    ]

    # os.listdir is not recursive, so files already in 'augmented' are not picked up again
    image_files = [f for f in os.listdir(category_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]
    for file in tqdm(image_files, desc="Augmenting images"):
        file_path = os.path.join(category_dir, file)
        for suffix, transform in transforms:
            save_image(transform(file_path), file_path, augmented_dir, suffix)


In [None]:
# Run the augmentation on a single category first
augment(category='acne', data_path=data_path)

In [None]:
# Augment all remaining categories; 'acne' is skipped here
for category in categories:
    if category == 'acne':
        continue
    augment(category, data_path)

In [None]:
# Helper that counts the image files directly inside one directory.
def count_images(directory):
    """Return how many entries of ``directory`` end in .png, .jpg or .jpeg.

    The match is case-sensitive and does not descend into sub-directories.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return sum(1 for entry in os.listdir(directory) if entry.endswith(image_suffixes))

In [None]:
def create_image_counts_dataframe(base_dir):
    """Summarise original and augmented image counts per category.

    Args:
        base_dir: Directory whose sub-directories are the categories.

    Returns:
        A DataFrame with one row per sub-directory and the columns
        "Category", "Original Images", "Augmented Images" and "Total Images".
    """
    rows = []
    for entry in os.listdir(base_dir):
        category_dir = os.path.join(base_dir, entry)
        if not os.path.isdir(category_dir):
            continue  # plain files at the top level are not categories

        n_original = count_images(category_dir)

        # A category without an 'augmented' folder contributes zero augmented images
        augmented_dir = os.path.join(category_dir, "augmented")
        n_augmented = count_images(augmented_dir) if os.path.exists(augmented_dir) else 0

        rows.append([entry, n_original, n_augmented, n_original + n_augmented])

    return pd.DataFrame(
        rows,
        columns=["Category", "Original Images", "Augmented Images", "Total Images"],
    )

In [None]:
# Build the per-category summary of original and augmented image counts
df = create_image_counts_dataframe(base_dir=data_path)
print(df)

In [None]:
# Make sure that for each row the number of original images * 9 (the comparison below uses 9) is equal to the number of augmented images, and store True or False in a new column
df["Augmented Correct"] = df["Original Images"] * 9 == df["Augmented Images"]
print(df)

In [None]:
# Create the 'Augmented Correct' flag if it has not been computed yet
if "Augmented Correct" not in df.columns:
    expected_augmented = df["Original Images"] * 9
    df["Augmented Correct"] = expected_augmented == df["Augmented Images"]

# Summing the boolean column counts the rows that are True
correct_augmented = df["Augmented Correct"].sum()
print(f"Correctly augmented {correct_augmented} categories.")

In [None]:
# Sum the three count columns over all categories in one pass
column_totals = df[["Original Images", "Augmented Images", "Total Images"]].sum()
total_original_images = column_totals["Original Images"]
total_augmented_images = column_totals["Augmented Images"]
total_images = column_totals["Total Images"]

print(f"Total original images: {total_original_images}")
print(f"Total augmented images: {total_augmented_images}")
print(f"Total images: {total_images}")

Convert all images to be 244 x 244 (the size used by the code below); this will make it easier to train the CNN.

In [None]:
# Resize every image in every directory and sub-directory to a square of size x size pixels (244 in this notebook)
def resize_images(directory, size):
    """Resize, in place, all images directly inside ``directory`` to ``size`` x ``size``.

    Files that already have the target size are left untouched, so running
    the cell again does not re-encode them.

    Args:
        directory: Directory whose .png/.jpg/.jpeg files are resized.
        size: Edge length in pixels of the square output.
    """
    image_files = [f for f in os.listdir(directory) if f.endswith(('.png', '.jpg', '.jpeg'))]
    for file in tqdm(image_files, desc=f"Resizing images in {os.path.basename(directory)}"):
        file_path = os.path.join(directory, file)
        with Image.open(file_path) as img:
            if img.size == (size, size):
                continue  # already correct; avoid another lossy save
            resized = img.resize((size, size))
        # Overwrite the original only after the source file has been closed
        resized.save(file_path)

def resize_all_images(base_dir, size):
    """Resize the images of every category under ``base_dir``.

    For each sub-directory of ``base_dir`` the images directly inside it are
    resized, followed by those in its ``augmented`` folder when one exists.

    Args:
        base_dir: Directory containing the category sub-directories.
        size: Edge length in pixels passed on to ``resize_images``.
    """
    for entry in os.listdir(base_dir):
        category_dir = os.path.join(base_dir, entry)
        if not os.path.isdir(category_dir):
            continue

        targets = [category_dir]
        augmented_dir = os.path.join(category_dir, "augmented")
        if os.path.exists(augmented_dir):
            targets.append(augmented_dir)

        for target in targets:
            resize_images(target, size)

In [None]:
# Resize originals and augmented images of every category to 244 x 244
resize_all_images(base_dir=data_path, size=244)

Check that the resizing worked and that all images are 244 x 244

In [None]:
# Check how many pictures in each directory have the expected square size (244 x 244 by default)
def check_image_sizes(directory, size=244):
    """Count the images directly inside ``directory`` that are ``size`` x ``size``.

    Args:
        directory: Directory whose .png/.jpg/.jpeg files are inspected.
        size: Expected edge length in pixels; defaults to 244.

    Returns:
        The number of inspected images whose size equals ``(size, size)``.
    """
    expected = (size, size)
    correct_size_count = 0
    image_files = [f for f in os.listdir(directory) if f.endswith(('.png', '.jpg', '.jpeg'))]
    for file in tqdm(image_files, desc=f"Checking image sizes in {os.path.basename(directory)}"):
        file_path = os.path.join(directory, file)
        # Only the size is read, so close each file explicitly rather than
        # leaving the handle open until garbage collection.
        with Image.open(file_path) as img:
            if img.size == expected:
                correct_size_count += 1
    return correct_size_count

def check_all_image_sizes(base_dir):
    """Record, per category, how many images have the expected size.

    For every sub-directory of ``base_dir`` the correctly sized images in the
    directory itself and in its ``augmented`` folder (if present) are counted.
    The result is written to the "Correct Size Images" column of the global
    ``df``, matched on the "Category" column rather than on row position, so
    it does not depend on ``os.listdir`` returning the same order and the same
    set of directories as when ``df`` was built.

    Args:
        base_dir: Directory containing the category sub-directories.
    """
    correct_by_category = {}
    for subdir in os.listdir(base_dir):
        subdir_path = os.path.join(base_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        correct_size_count = check_image_sizes(subdir_path)
        augmented_dir = os.path.join(subdir_path, "augmented")
        if os.path.exists(augmented_dir):
            correct_size_count += check_image_sizes(augmented_dir)
        correct_by_category[subdir] = correct_size_count

    # Categories listed in df but missing on disk get a count of 0
    df["Correct Size Images"] = (
        df["Category"].map(correct_by_category).fillna(0).astype(int)
    )

In [None]:
# Count the correctly sized images of every category and store them in df
check_all_image_sizes(base_dir=data_path)

In [None]:
# Show the summary table, which now includes the "Correct Size Images" column
print(df)

In [None]:
# Flag the categories in which every image (original + augmented) has the right size
df["All Correct Size"] = df["Total Images"].eq(df["Correct Size Images"])
print(df)

In [None]:
# Summing the boolean column gives the number of categories that passed, not a
# yes/no answer, so the message reports it as a count out of all categories.
all_correct_size = df["All Correct Size"].sum()
print(f"Categories where all images are the correct size: {all_correct_size} of {len(df)}")

In [None]:
# Final totals over all categories, computed column by column
total_img_count, total_aug_count, total_original_count = (
    df[column].sum()
    for column in ("Total Images", "Augmented Images", "Original Images")
)

print(f"Total images: {total_img_count}")
print(f"Total augmented images: {total_aug_count}")
print(f"Total original images: {total_original_count}")

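Now we have a data set of 17160 images. From the original 2860, we increased the number of total images by 14300 images. We can start training a CNN using these as the data. (Note: these figures correspond to five augmented copies per original image and appear to come from an earlier run; with the nine augmentations defined above, 2860 originals would give 25740 augmented images and 28600 in total. Use the totals printed by the cell above.)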

In [None]:
# Print the sizes of up to `num_images` randomly chosen images from each category

def check_random_image_sizes(base_dir, num_images):
    """Print the size of a random sample of images from every category.

    For each sub-directory of ``base_dir``, up to ``num_images`` image files
    located directly inside it are sampled and their sizes printed. The
    ``augmented`` folders are not visited.

    Args:
        base_dir: Directory containing the category sub-directories.
        num_images: Maximum number of images to sample per category.
    """
    subdirs = [subdir for subdir in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, subdir))]
    for subdir in subdirs:
        subdir_path = os.path.join(base_dir, subdir)
        image_files = [f for f in os.listdir(subdir_path) if f.endswith(('.png', '.jpg', '.jpeg'))]
        random_files = random.sample(image_files, min(num_images, len(image_files)))
        for file in random_files:
            file_path = os.path.join(subdir_path, file)
            # Only the size is read; close the file instead of leaving it
            # open until garbage collection.
            with Image.open(file_path) as img:
                print(f"Image size for {file}: {img.size}")

In [None]:
# Spot-check up to 100 random images per category
check_random_image_sizes(base_dir=data_path, num_images=100)