# ***Google Captcha image recognition***
**A Deep Learning Project using TensorFlow**

*by Emma Begard, Augustin Bouveau, Gabin Jobert--Rollin, Hugues Boisdon*

## I Data **Fetching**

Our source Dataset can be found at : *https://www.kaggle.com/datasets/mikhailma/test-dataset*

***credits : Mike Mazurov***

### I.1 **Downloading** Dataset Files from ***KaggleHub*** Source

In [None]:
import kagglehub

# Fetch the dataset through kagglehub and keep the returned local path,
# which is where the dataset files can be found after the download.
DATA_FOLDER_PATH_IF_CACHED = kagglehub.dataset_download("mikhailma/test-dataset")
print("Path to dataset files in cache:", DATA_FOLDER_PATH_IF_CACHED)

The data files are downloaded to the user's **cache** by default.
If the data source folder has been moved, ***please update*** the following *path variable*.

In [None]:
# Manual override of the dataset location. Leave it as the empty string to
# keep using the download path stored in DATA_FOLDER_PATH_IF_CACHED.
DATA_FOLDER_PATH_IF_MOVED = "" # Data folder path if data was moved since download

In [None]:
def getDataSourceFolderPath() -> str:
    """Return the dataset root folder.

    The manual override ``DATA_FOLDER_PATH_IF_MOVED`` wins whenever it is not
    the empty string; otherwise the download path
    ``DATA_FOLDER_PATH_IF_CACHED`` is returned.
    """
    if DATA_FOLDER_PATH_IF_MOVED != "":
        return DATA_FOLDER_PATH_IF_MOVED
    return DATA_FOLDER_PATH_IF_CACHED

def getImagesDataFolderPath() -> str:
    """Return the folder that holds the images, below the dataset root."""
    dataset_root = getDataSourceFolderPath()
    return dataset_root + "/Google_Recaptcha_V2_Images_Dataset/images"

In [None]:
from PIL import Image
# Resolve the dataset root through the shared helper instead of repeating its
# cached-vs-moved conditional inline, so both always agree.
path = getDataSourceFolderPath()
# Open one sample image; as the last expression of the cell it is displayed.
Image.open(getImagesDataFolderPath() +"/Bicycle/Bicycle (1).png")

### I.2 First Try at **Loading** the Training and Validation Datasets

In [None]:
from tensorflow.keras.utils import image_dataset_from_directory

# Default loading parameters; they are the default arguments of getDatasets().
BATCH_SIZE = 32  # forwarded as batch_size
IMG_DIMENSIONS = (120, 120) # forwarded as image_size: every image is resized to 120 x 120 pixels
SEED_RANDOM = 123  # seed shared by the training and validation loaders

VALIDATION_RATIO = 0.2  # forwarded as validation_split

def getDatasets(batch_size=BATCH_SIZE,
                img_dims=IMG_DIMENSIONS,
                validation_ratio=VALIDATION_RATIO,
                seed=SEED_RANDOM) -> tuple:
  """Load the training and validation datasets from the images folder.

  Both datasets are read from ``getImagesDataFolderPath()`` with the same
  ``validation_split``, ``seed``, ``image_size`` and ``batch_size``; only the
  requested ``subset`` differs.

  Args:
    batch_size: Forwarded as ``batch_size``.
    img_dims: Forwarded as ``image_size``.
    validation_ratio: Forwarded as ``validation_split``.
    seed: Forwarded as ``seed`` to both loader calls.

  Returns:
    The tuple ``(train, validation)``.
  """
  # Everything the two loader calls have in common.
  shared_options = dict(
    validation_split=validation_ratio,
    seed=seed,
    image_size=img_dims,
    batch_size=batch_size)

  loaded = [
    image_dataset_from_directory(
      getImagesDataFolderPath(), subset=subset_name, **shared_options)
    for subset_name in ("training", "validation")
  ]
  return loaded[0], loaded[1]


The **keras** sublibrary of ***TensorFlow*** allows us to load our datasets directly (and ensures that every image is resized to 120 × 120 px).

In [None]:
# Load both datasets with the default batch size, image size, split ratio and seed.
train_dataset, validation_dataset = getDatasets()

# Keep the class names reported by the training dataset; later cells use
# them for plot titles and to size the models' output layers.
CLASS_NAMES = train_dataset.class_names
print(CLASS_NAMES)

Our model is going to be guessing between these classes for each image.

#### Data Sample for Visualization  

In [None]:
import matplotlib.pyplot as plt

# Show the first nine images of one training batch in a 3 x 3 grid,
# each titled with its class name.
GRID_ROWS, GRID_COLS = 3, 3
plt.figure(figsize=(10, 10))
for images, labels in train_dataset.take(1):
  i = 0
  while i < GRID_ROWS * GRID_COLS:
    ax = plt.subplot(GRID_ROWS, GRID_COLS, i + 1)
    pixels = images[i].numpy().astype("uint8")
    plt.imshow(pixels)
    plt.title(CLASS_NAMES[labels[i]])
    plt.axis("off")
    i += 1

## II Data **Preprocessing**

### 1 Preprocessing layers & Optimizations

#### 1.1 **Normalization Layer** for pixel values

In [None]:
from tensorflow.keras.layers import Rescaling

def getNormalizationLayer():
    """Return a ``Rescaling`` layer that multiplies its inputs by 1/255."""
    scale_factor = 1./255
    return Rescaling(scale_factor)

Pixel values will now be bound from 0 to 1 instead of 0 to 255.

#### 1.2 Data Augmentation Layers

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import RandomFlip, RandomRotation, RandomZoom

def getAugmentationLayers(maxRotation:float= 0.1, maxZoom:float= 0.1) -> Sequential:
    """Build the data-augmentation stack: flip, then rotation, then zoom.

    Args:
        maxRotation: Factor forwarded to ``RandomRotation``.
        maxZoom: Factor forwarded to ``RandomZoom``.

    Returns:
        A ``Sequential`` whose first layer declares an input shape of
        ``(IMG_DIMENSIONS[0], IMG_DIMENSIONS[1], 3)``.
    """
    expected_input_shape = (IMG_DIMENSIONS[0], IMG_DIMENSIONS[1], 3)

    flip_layer = RandomFlip("horizontal", input_shape=expected_input_shape)
    rotation_layer = RandomRotation(maxRotation)
    zoom_layer = RandomZoom(maxZoom)

    return Sequential([flip_layer, rotation_layer, zoom_layer])

In [None]:
# Preview nine augmented variants of the first image of one training batch.
plt.figure(figsize=(10, 10))
# Build the augmentation stack once: it does not depend on the loop index,
# so there is no need to create a new Sequential for each of the nine panels.
augmentation_preview = getAugmentationLayers()
for images, _ in train_dataset.take(1):
  for i in range(9):
    # Each call runs the batch through the random layers again.
    augmented_images = augmentation_preview(images)
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(augmented_images[0].numpy().astype("uint8"))
    plt.axis("off")

#### 1.3 **Optimizations** of Data memory caching, availability and randomization

In [None]:
from tensorflow.data import AUTOTUNE

# Cache the datasets and prefetch upcoming batches (buffer size chosen by
# AUTOTUNE); only the training pipeline is additionally shuffled, with a
# buffer of 1000 elements.
# NOTE(review): the fit() calls in the cells shown here pass train_dataset /
# validation_dataset, not these *_opti variables - confirm that is intended.
train_dataset_opti        = train_dataset.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
validation_dataset_opti   = validation_dataset.cache().prefetch(buffer_size=AUTOTUNE)

### 2 Organizing & Rebalancing the datasets

#### 2.1 Creating sub folders for test, train and validation for the original dataset

In [None]:
import os
import random
import shutil

def split_dataset_by_class(source_dir, train_dir, val_dir, test_dir, train_size=0.7, val_size=0.2, test_size=0.1, min_images=10, seed=None):
    """Copy a class-per-folder image dataset into train/validation/test folders.

    Every sub-directory of ``source_dir`` is treated as one class. Its image
    files (``.png``, ``.jpg``, ``.jpeg``, case-insensitive) are shuffled and
    copied into same-named sub-directories of ``train_dir``, ``val_dir`` and
    ``test_dir``. The source files are left untouched.

    Args:
        source_dir: Folder containing one sub-folder per class.
        train_dir: Destination root for the training split (created if missing).
        val_dir: Destination root for the validation split (created if missing).
        test_dir: Destination root for the test split (created if missing).
        train_size: Fraction of each class copied to ``train_dir``.
        val_size: Fraction of each class copied to ``val_dir``.
        test_size: Kept for interface compatibility. The test split always
            receives every image left over after the train and validation
            shares, whatever value is passed here.
        min_images: Classes with fewer images than this are skipped entirely.
        seed: Optional seed. When given, the files are shuffled with a private
            ``random.Random(seed)`` so the split is reproducible; when ``None``
            the module-level ``random`` state is used.

    Raises:
        ValueError: If a fraction is negative or ``train_size + val_size``
            exceeds 1.

    Note:
        Destination folders are not emptied first. Running the function again
        with a different shuffle can therefore leave the same image in several
        splits; pass a fixed ``seed`` (or start from empty folders) to avoid it.
    """
    if train_size < 0 or val_size < 0 or test_size < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative.")
    if train_size + val_size > 1 + 1e-9:
        raise ValueError("train_size + val_size must not exceed 1.")

    # The leading dot matters: a bare 'png' suffix would also match names such
    # as 'notes_png'.
    image_extensions = ('.png', '.jpg', '.jpeg')
    rng = random if seed is None else random.Random(seed)
    split_dirs = (train_dir, val_dir, test_dir)

    def copy_images(image_list, target_dir):
        """Copy every file of image_list into target_dir."""
        for img_path in image_list:
            shutil.copy(img_path, target_dir)

    # Ensure the directories for train, validation, and test exist
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    # Sorted traversal: os.listdir order is arbitrary, which would make the
    # shuffle irreproducible even with a fixed seed.
    for class_name in sorted(os.listdir(source_dir)):
        class_folder = os.path.join(source_dir, class_name)

        # Skip non-directories (just in case there are files in the source dir)
        if not os.path.isdir(class_folder):
            continue

        # Regular files with an image extension only
        image_files = [
            os.path.join(class_folder, file)
            for file in sorted(os.listdir(class_folder))
            if file.lower().endswith(image_extensions)
            and os.path.isfile(os.path.join(class_folder, file))
        ]

        # Skip the class if there are fewer than min_images
        if len(image_files) < min_images:
            print(f"Skipping class {class_name} because it has fewer than {min_images} images.")
            continue

        # Create the same class subdirectory in each destination root
        class_dirs = [os.path.join(split_dir, class_name) for split_dir in split_dirs]
        for class_dir in class_dirs:
            os.makedirs(class_dir, exist_ok=True)

        rng.shuffle(image_files)

        # Train and validation get their (truncated) share; test gets the rest
        total_images = len(image_files)
        train_count = int(total_images * train_size)
        val_count = int(total_images * val_size)
        test_count = total_images - train_count - val_count

        train_images = image_files[:train_count]
        val_images = image_files[train_count:train_count + val_count]
        test_images = image_files[train_count + val_count:]

        for image_list, class_dir in zip((train_images, val_images, test_images), class_dirs):
            copy_images(image_list, class_dir)

        print(f"Class {class_name}: {train_count} for training, {val_count} for validation, {test_count} for testing.")

#### 2.2 Balancing classes in the training folder and validation folder

In [None]:
import os
import random
from PIL import Image

def duplicate_images_randomly_per_folder(root_dir, target_count_per_folder, min_img=5, max_img=100):
    """Balance every folder found below ``root_dir`` to a common image count.

    For each directory visited by ``os.walk`` (``root_dir`` itself excluded):

    * fewer than ``min_img`` images: the images are deleted and the folder is
      removed with ``os.rmdir``;
    * more than ``max_img`` images: randomly chosen images are deleted until
      ``max_img`` remain;
    * fewer than ``target_count_per_folder`` images: randomly chosen images
      are copied under new names until the target is reached.

    Args:
        root_dir: Folder whose sub-folders are balanced in place.
        target_count_per_folder: Desired image count per folder. Rounded to an
            integer, because callers pass computed values such as ``1500*0.2``.
        min_img: Folders with fewer images than this are deleted.
        max_img: Upper bound on images kept per folder (rounded to an integer).

    Raises:
        OSError: From ``os.rmdir`` when a folder to delete still contains
            entries other than the removed image files.
    """
    import shutil  # local import: this cell does not import shutil itself

    # The leading dot matters: a bare 'png' suffix would also match (and
    # possibly delete) files such as 'notes_png'.
    image_extensions = ('.png', '.jpg', '.jpeg')

    # Float counts would produce names like 'x_dup12.0.png' and cannot be
    # used as slice bounds.
    target_count_per_folder = int(round(target_count_per_folder))
    max_img = int(round(max_img))

    # Traverse all subdirectories in the root directory
    for root, subdirs, files in os.walk(root_dir):
        # Skip the root directory itself
        if root == root_dir:
            continue

        print(f"Processing folder: {root}")

        image_files = [os.path.join(root, file) for file in files if file.lower().endswith(image_extensions)]
        current_count = len(image_files)

        # Too few images: delete them, then the folder itself
        if current_count < min_img:
            print(f"Directory {root} has fewer than {min_img} images, deleting folder.")
            for file in image_files:
                os.remove(file)
            os.rmdir(root)
            continue

        # Too many images: randomly delete the excess
        if current_count > max_img:
            print(f"Directory {root} has more than {max_img} images, deleting excess images.")
            images_to_delete = current_count - max_img
            random.shuffle(image_files)
            for img_path in image_files[:images_to_delete]:
                os.remove(img_path)
            # Forget the deleted paths so the duplication step below never
            # tries to read a file that no longer exists.
            image_files = image_files[images_to_delete:]
            current_count = max_img

        if current_count >= target_count_per_folder:
            print(f"Directory {root} already has {current_count} images, no duplication needed.")
            continue

        # Without any source image the loop below could never make progress.
        if not image_files:
            print(f"Directory {root} has no images to duplicate, skipping.")
            continue

        images_needed = target_count_per_folder - current_count
        print(f"Duplicating {images_needed} images randomly in {root}.")

        # Cycle over the reshuffled originals until the target count is reached
        while images_needed > 0:
            random.shuffle(image_files)
            for img_path in image_files:
                if images_needed <= 0:
                    break

                stem, extension = os.path.splitext(os.path.basename(img_path))
                duplicated_img_path = os.path.join(root, f"{stem}_dup{images_needed}{extension}")
                # Never overwrite an existing file (e.g. a duplicate left by an
                # earlier run), otherwise the final count would fall short.
                collision_index = 0
                while os.path.exists(duplicated_img_path):
                    collision_index += 1
                    duplicated_img_path = os.path.join(
                        root, f"{stem}_dup{images_needed}_{collision_index}{extension}")

                # Byte-for-byte copy: no decode/re-encode of the image and no
                # image handle left open.
                shutil.copyfile(img_path, duplicated_img_path)

                images_needed -= 1

        print(f"Duplicated images in {root} to reach {target_count_per_folder} images.")

### 3 Image preprocessing for ResNet

In [None]:
from tensorflow.keras.applications.resnet import preprocess_input

def preprocess_data_for_resnet(image, label):
    """Run ``preprocess_input`` on the image; the label is returned unchanged."""
    return preprocess_input(image), label

## III Our different approaches

### 1 Naive CNN models

#### III.1.A Defining and compiling the model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Baseline CNN: three Conv2D + MaxPooling2D stages (16, 32 and 64 filters)
# followed by a dense head.
# NOTE(review): no Rescaling layer and no input shape are declared in this
# list, so the model receives pixel values exactly as the dataset yields them.
first_model = Sequential([
  Conv2D(16, 3, padding='same', activation='relu'),
  MaxPooling2D(),
  Conv2D(32, 3, padding='same', activation='relu'),
  MaxPooling2D(),
  Conv2D(64, 3, padding='same', activation='relu'),
  MaxPooling2D(),
  Flatten(),
  Dense(128, activation='relu'),
  # One unit per class and no activation: the outputs are raw scores (logits).
  Dense(len(CLASS_NAMES), name="outputs")
], name = "First_Model")

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# from_logits=True matches the activation-free "outputs" layer of first_model;
# the labels are integer class indices, hence the sparse loss.
first_model.compile(optimizer='adam',
              loss= SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Print the layer-by-layer description of the model.
first_model.summary()

#### III.1.B Training the model

In [None]:
# Number of training epochs; the plotting cell reuses it as the x-axis length.
EPOCHS_FIRST= 10

# Train on the training dataset and evaluate on the validation dataset;
# the returned history is used by the plotting cell.
first_history = first_model.fit(
  train_dataset,
  validation_data=validation_dataset,
  epochs=EPOCHS_FIRST
)

#### III.1.C Visualizing results

In [None]:
# Per-epoch curves recorded while training the first model.
first_metrics = first_history.history
acc, val_acc = first_metrics['accuracy'], first_metrics['val_accuracy']
loss, val_loss = first_metrics['loss'], first_metrics['val_loss']

epochs_range = range(EPOCHS_FIRST)

# One entry per panel: subplot index, training curve, validation curve,
# curve labels, legend position and title.
panels = [
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
]

plt.figure(figsize=(8, 8))
for panel_index, train_curve, val_curve, train_label, val_label, legend_loc, panel_title in panels:
    plt.subplot(1, 2, panel_index)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()

In [None]:
from numpy import argmax
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the first model on the validation set.
# Labels and predictions are gathered batch by batch in ONE pass over the
# dataset, so each prediction stays paired with its own label even if the
# dataset yields its batches in a different order on every iteration.
cm_true_labels = []
cm_predicted_labels = []
for cm_images, cm_labels in validation_dataset:
    cm_logits = first_model.predict_on_batch(cm_images)
    cm_true_labels.extend(cm_labels.numpy())
    # The model outputs one score per class; the highest score is the prediction.
    cm_predicted_labels.extend(argmax(cm_logits, axis=1))

# Passing every class index keeps the matrix square and aligned with
# CLASS_NAMES even when a class is absent from the validation set.
first_confusion_matrix = confusion_matrix(
    cm_true_labels,
    cm_predicted_labels,
    labels=list(range(len(CLASS_NAMES)))
)

ConfusionMatrixDisplay(
    confusion_matrix=first_confusion_matrix,
    display_labels=CLASS_NAMES
).plot(xticks_rotation="vertical")
plt.show()

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Step 1: Collect true labels and model outputs in a single pass.
# Iterating the dataset once for the labels and a second time inside
# predict() can yield the batches in a different order (the loader shuffles
# by default), which would pair predictions with the wrong labels.
y_true = []
prediction_batches = []
for images, labels in validation_dataset:
    y_true.extend(labels.numpy())  # Convert the labels from tensor to numpy
    prediction_batches.append(first_model.predict_on_batch(images))

# Step 2: Stack the per-batch outputs into one (samples, classes) array
predictions = np.concatenate(prediction_batches, axis=0)

# Convert the per-class scores to class indices (argmax works on logits too)
predicted_labels = np.argmax(predictions, axis=1)

# Step 3: Generate the classification report
infos_2 = classification_report(y_true, predicted_labels)
print(infos_2)

#### III.2.A Defining and compiling the model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Second CNN: the same three Conv2D + MaxPooling2D stages as the first model,
# preceded by the augmentation and normalization layers and with a Dropout
# layer before the dense head.
second_model = Sequential([
  getAugmentationLayers(),
  getNormalizationLayer(),
  Conv2D(16, 3, padding='same', activation='relu'),
  MaxPooling2D(),
  Conv2D(32, 3, padding='same', activation='relu'),
  MaxPooling2D(),
  Conv2D(64, 3, padding='same', activation='relu'),
  MaxPooling2D(),
  Dropout(0.2),
  Flatten(),
  Dense(128, activation='relu'),
  # One unit per class and no activation: the outputs are raw scores (logits).
  Dense(len(CLASS_NAMES), name="outputs")
], name = "Second_Model")

In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# from_logits=True matches the activation-free "outputs" layer of second_model;
# the labels are integer class indices, hence the sparse loss.
second_model.compile(optimizer='adam',
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Print the layer-by-layer description of the model.
second_model.summary()

#### III.2.B Training the model

In [None]:
# Number of training epochs; the plotting cell reuses it as the x-axis length.
EPOCHS_SECOND = 15
# Train on the training dataset and evaluate on the validation dataset;
# the returned history is used by the plotting cell.
second_history = second_model.fit(
  train_dataset,
  validation_data= validation_dataset,
  epochs=EPOCHS_SECOND
)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Step 1: Collect true labels and model outputs in a single pass.
# Iterating the dataset once for the labels and a second time inside
# predict() can yield the batches in a different order (the loader shuffles
# by default), which would pair predictions with the wrong labels.
y_true = []
prediction_batches = []
for images, labels in validation_dataset:
    y_true.extend(labels.numpy())  # Convert the labels from tensor to numpy
    prediction_batches.append(second_model.predict_on_batch(images))

# Step 2: Stack the per-batch outputs into one (samples, classes) array
predictions = np.concatenate(prediction_batches, axis=0)

# Convert the per-class scores to class indices (argmax works on logits too)
predicted_labels = np.argmax(predictions, axis=1)

# Step 3: Generate the classification report
infos_2 = classification_report(y_true, predicted_labels)
print(infos_2)

#### III.2.C Visualizing results

In [None]:
# Per-epoch curves recorded while training the second model.
second_metrics = second_history.history
acc, val_acc = second_metrics['accuracy'], second_metrics['val_accuracy']
loss, val_loss = second_metrics['loss'], second_metrics['val_loss']

epochs_range = range(EPOCHS_SECOND)

# One entry per panel: subplot index, training curve, validation curve,
# curve labels, legend position and title.
panels = [
    (1, acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (2, loss, val_loss, 'Training Loss', 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
]

plt.figure(figsize=(8, 8))
for panel_index, train_curve, val_curve, train_label, val_label, legend_loc, panel_title in panels:
    plt.subplot(1, 2, panel_index)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()

In [None]:
from tensorflow.image import decode_jpeg, resize
from tensorflow.keras.applications.resnet50 import preprocess_input

def preprocess_data_for_resnet(image, label):
    """Run ``preprocess_input`` on the image; the label is returned unchanged."""
    return preprocess_input(image), label
# Reload both datasets with the default parameters.
# NOTE(review): this rebinds train_dataset / validation_dataset, so every
# later cell sees the ResNet-preprocessed pipelines instead of the raw ones.
train_dataset, validation_dataset = getDatasets()

# Apply the ResNet preprocessing to every (image, label) element, then cache
# and prefetch; only the training pipeline is shuffled (buffer of 1000).
train_dataset= train_dataset.map(preprocess_data_for_resnet).cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
validation_dataset= validation_dataset.map(preprocess_data_for_resnet).cache().prefetch(buffer_size=AUTOTUNE)

### 2 Testing CNN models with new preprocessings

#### 2.1 Loading the new datasets

In [None]:
source_directory = getImagesDataFolderPath()
train_directory = './datas/train'
val_directory = './datas/validation'
test_directory = './datas/test'

# Classes with fewer images than this are left out of the split.
min_images_per_class = 500
# Shares of each class sent to the training and validation folders
# (the test folder receives the remainder).
train_fraction = 0.7
val_fraction = 0.2

# Split the dataset, skipping classes with fewer than 500 images.
# NOTE(review): the destination folders are not emptied first, so running
# this cell again adds a new random split on top of the previous one.
split_dataset_by_class(source_directory, train_directory, val_directory, test_directory,
                       train_size=train_fraction, val_size=val_fraction,
                       min_images=min_images_per_class)

# Define your target number of images for each subfolder
target_number_per_folder = 1500  # Set your target number of images per folder

# Set your root image directory (which contains subfolders like 'cars', 'bicycles', etc.)
root_image_directory = getImagesDataFolderPath()

# for training
# Balance the folder the split actually wrote to (train_directory); the
# previous hard-coded "./datas/training" does not exist, so nothing happened.
# min_img is scaled by the training share: a class kept by the split has at
# least 500 * 0.7 training images, and a larger threshold would delete
# classes from training that validation and test still contain.
duplicate_images_randomly_per_folder(
    train_directory,
    target_number_per_folder,
    min_img=round(min_images_per_class * train_fraction),
    max_img=target_number_per_folder)
# for validation
# Counts are rounded to integers before being passed on.
duplicate_images_randomly_per_folder(
    val_directory,
    round(target_number_per_folder * val_fraction),
    min_img=round(min_images_per_class * val_fraction),
    max_img=target_number_per_folder)

# NOTE(review): CLASS_NAMES was taken from the unfiltered dataset; the
# datasets below only contain the classes kept by the split - confirm which
# list the following cells should use.
train_dataset = image_dataset_from_directory(
  train_directory,
  label_mode='int',
  image_size= IMG_DIMENSIONS,
  batch_size= BATCH_SIZE)

validation_dataset = image_dataset_from_directory(
  val_directory,
  label_mode='int',
  image_size= IMG_DIMENSIONS,
  batch_size= BATCH_SIZE)

test_dataset = image_dataset_from_directory(
  test_directory,
  label_mode='int',
  seed=       SEED_RANDOM,
  image_size= IMG_DIMENSIONS,
  batch_size= BATCH_SIZE)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, GlobalAveragePooling2D

# Four convolution stages (16 to 128 filters) reduced by global average
# pooling instead of Flatten, with dropout around the dense layer.
# NOTE(review): unlike first_model / second_model, the output layer applies
# softmax, so the loss used at compile time must not set from_logits=True -
# the compile step is not visible here; confirm.
# NOTE(review): the Keras name is "model_1" while the variable is model_2_1.
model_2_1 = Sequential([
    getAugmentationLayers(),
    getNormalizationLayer(),
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(32, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), padding='same', activation='relu'),
    GlobalAveragePooling2D(),  # Summarizes feature maps
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(CLASS_NAMES), activation='softmax', name="outputs")
], name="model_1")


# Deeper variant of model_2_1: a fifth convolution stage (256 filters) and a
# pooling layer after every convolution, then the same pooled dense head.
# NOTE(review): the output layer applies softmax, so the loss used at compile
# time must not set from_logits=True - the compile step is not visible here.
# NOTE(review): the Keras name is "model_2" while the variable is model_2_2.
model_2_2 = Sequential([
    getAugmentationLayers(),
    getNormalizationLayer(),
    Conv2D(16, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(32, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), padding='same', activation='relu'),
    MaxPooling2D((2, 2)),  # Add additional MaxPooling layer
    Conv2D(256, (3, 3), padding='same', activation='relu'),  # Additional convolutional layer
    MaxPooling2D((2, 2)),
    GlobalAveragePooling2D(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(CLASS_NAMES), activation='softmax', name="outputs")
], name="model_2")

### 3 Testing with Resnet layers