In [35]:
import cv2
import numpy as np
import torch
import pandas as pd


def draw_labels_on_image(image_path, labels_file):
    """
    Loads an image from image_path, retrieves ground truth (x1, y1, x2, y2) from labels.txt,
    and displays the image with red circles at those coordinates.

    Args:
        image_path (str): Path to the image.
        labels_file (str): Path to the labels file (CSV or TXT with x1, y1, x2, y2).
    """
    # Load the image using OpenCV
    image = cv2.imread(image_path)
    if image is None:
        print(f"Unable to load image: {image_path}")
        return

    # Resize the image to (width=340, height=410) to match the model's expected size
    image_resized = image#cv2.resize(image, (340, 410))

    # Read labels from the file
    labels_df = pd.read_csv(labels_file)  # Ensure labels.txt is formatted correctly
    image_name = image_path.split('/')[-1]  # Extract filename from path

    # Find the row corresponding to the image name (assuming there is an 'id' or filename column)
    if 'img_name' in labels_df.columns:
        row = labels_df[labels_df['img_name'] == image_name]
    else:
        row = labels_df.iloc[0]  # If there's no ID column, just use the first row (for testing)

    if row.empty:
        print(f"No labels found for {image_name}")
        return

    # Extract ground truth coordinates
    x1, y1, x2, y2 = row[['x1', 'y1', 'x2', 'y2']].to_numpy().flatten()
    x1 = int(x1)
    x2 = int(x2)
    y1 = int(y1)
    y2 = int(y2)
    print(f"Ground truth coordinates: x1={x1}, y1={y1}, x2={x2}, y2={y2}")

    # Draw red circles at the ground truth coordinates
    image_drawn = image_resized.copy()
    cv2.circle(image_drawn, (x1, y1), radius=5, color=(0, 255, 0), thickness=-1)  # Green circle
    cv2.circle(image_drawn, (x2, y2), radius=5, color=(0, 0, 255), thickness=-1)  # Red circle

    # Display the image with the drawn points
    cv2.imshow("img",image_drawn)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

extract_dir = 'extracted_captchas'
processed_images_dir = "processed_captchas"
image_path = f"{extract_dir}/captchas_saved/captcha_8.png"
labels_file = "labels.txt"
draw_labels_on_image(image_path,labels_file)
image_path = f"truncated_captchas/captcha_8.png"
labels_file = "truncated_labels.csv"
draw_labels_on_image(image_path,labels_file)


Ground truth coordinates: x1=105, y1=175, x2=180, y2=207
Ground truth coordinates: x1=40, y1=75, x2=115, y2=107


In [22]:
import cv2
import numpy as np

def show_pixels(image_path,nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite):
    """
    Displays the top 20 rows of an image.

    Args:
        image_path (str): Path to the image file.
    """
    # Load the image
    image = cv2.imread(image_path)

    # Check if the image was loaded correctly
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return

    truncated_image = image[nbPixelsBas:nbPixelsHaut, nbPixelsGauche:nbPixelsDroite]

    # Display the cropped section
    cv2.imshow("img",truncated_image)  # Use cv2_imshow in Google Colab, replace with cv2.imshow() for local use
    cv2.waitKey(0)
    cv2.destroyAllWindows()


extract_dir = 'extracted_captchas'
processed_images_dir = "processed_captchas"
image_path = f"{extract_dir}/captchas_saved/captcha_315.png"
labels_file = "labels.txt"

nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite = 310,100,65,275 # Images tronquées
# nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite = 55,5,185,235 # Premier perso
# nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite = 50,10, 265, 305 #Second perso
show_pixels(image_path,nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite)

In [34]:
import pandas as pd

def adjust_x_coordinates(label, nbPixelsGauche):
    return label - nbPixelsGauche

def adjust_y_coordinates(label, nbPixelsBas):
    return label - nbPixelsBas

# Load the labels file
df = pd.read_csv("labels.txt")
print(df.columns)
# Ensure the coordinate columns are numeric
df[['x1', 'y1', 'x2', 'y2']] = df[['x1', 'y1', 'x2', 'y2']].apply(pd.to_numeric)

nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite = 310,100,65,275 # Images tronquées
# Adjust coordinates
df['x1'] = df['x1'].apply(lambda x: adjust_x_coordinates(x, nbPixelsGauche))
df['x2'] = df['x2'].apply(lambda x: adjust_x_coordinates(x, nbPixelsGauche))
df['y1'] = df['y1'].apply(lambda y: adjust_y_coordinates(y, nbPixelsBas))
df['y2'] = df['y2'].apply(lambda y: adjust_y_coordinates(y, nbPixelsBas))

# Save the adjusted labels
df.to_csv('truncated_labels.csv', index=False)

# Display the updated DataFrame
print(df)


Index(['img_name', 'x1', 'y1', 'x2', 'y2'], dtype='object')
            img_name          x1          y1          x2          y2
0      captcha_1.png   46.816017  146.209957   54.581169   18.639610
1     captcha_10.png   54.581169  143.991342  155.528139   67.449134
2    captcha_100.png  103.390693  192.800866   22.411255  191.691558
3    captcha_101.png  142.216450  125.133117   60.127706  165.068182
4    captcha_102.png   95.625541  130.679654  155.528139  170.614719
..               ...         ...         ...         ...         ...
311   captcha_95.png   83.423160   35.279221  122.248918  182.817100
312   captcha_96.png   49.034632   40.825758   44.597403  172.833333
313   captcha_97.png   45.706710  170.614719  138.888528   92.963203
314   captcha_98.png   45.706710  105.165584  176.604978   49.700216
315   captcha_99.png   42.378788  187.254329   23.520563  117.367965

[316 rows x 5 columns]


In [32]:
import os
import cv2

def crop_and_save_images(input_folder, output_folder="truncated_captchas",nbPixelsHaut= 330, nbPixelsBas= 55, nbPixelsGauche= 20, nbPixelsDroite = 320):
    """
    Crops images in the input_folder according to predefined pixel boundaries
    and saves them to the output_folder with the same filenames.

    Args:
        input_folder (str): Path to the folder containing the original images.
        output_folder (str): Path where the cropped images will be saved.
    """
    # Define cropping boundaries
    

    # Create output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Process each image in the input folder
    for filename in os.listdir(input_folder):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        # Read the image
        image = cv2.imread(input_path)
        if image is None:
            print(f"Skipping {filename} (could not load image)")
            continue

        # Apply cropping
        cropped_image = image[nbPixelsBas:nbPixelsHaut, nbPixelsGauche:nbPixelsDroite]

        # Save the cropped image to the new folder
        cv2.imwrite(output_path, cropped_image)
        print(f"Saved cropped image: {output_path}")

# nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite = 50,10,190,230 # Premier perso
# nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite = 50,10, 265, 305 #Second perso

# Example Usage
nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite = 310,100,65,275 # Images tronquées
crop_and_save_images("extracted_captchas/captchas_saved","truncated_captchas",nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite) #truncated captchas
# nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite = 55,5,185,235
# crop_and_save_images("extracted_captchas/captchas_saved","premier_perso",nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite) #premier perso
# nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite = 55,5, 260, 310
# crop_and_save_images("extracted_captchas/captchas_saved","second_perso",nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite) #second perso


Saved cropped image: truncated_captchas\captcha_1.png
Saved cropped image: truncated_captchas\captcha_10.png
Saved cropped image: truncated_captchas\captcha_100.png
Saved cropped image: truncated_captchas\captcha_101.png
Saved cropped image: truncated_captchas\captcha_102.png
Saved cropped image: truncated_captchas\captcha_103.png
Saved cropped image: truncated_captchas\captcha_104.png
Saved cropped image: truncated_captchas\captcha_105.png
Saved cropped image: truncated_captchas\captcha_106.png
Saved cropped image: truncated_captchas\captcha_107.png
Saved cropped image: truncated_captchas\captcha_108.png
Saved cropped image: truncated_captchas\captcha_109.png
Saved cropped image: truncated_captchas\captcha_11.png
Saved cropped image: truncated_captchas\captcha_110.png
Saved cropped image: truncated_captchas\captcha_111.png
Saved cropped image: truncated_captchas\captcha_112.png
Saved cropped image: truncated_captchas\captcha_113.png
Saved cropped image: truncated_captchas\captcha_114.

In [38]:
import cv2
import numpy as np
import os

# Path to the folder containing the images
image_folder = "truncated_captchas"  # Change this to your actual folder path

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith('.png')]  # Change extension if needed

# Initialize an accumulator with zeros (assuming images are the same size)
num_images = len(image_files)
if num_images == 0:
    raise ValueError("No images found in the specified folder.")

# Load the first image to get dimensions
first_image = cv2.imread(os.path.join(image_folder, image_files[0]), cv2.IMREAD_COLOR)
h, w, c = first_image.shape
mean_image = np.zeros((h, w, c), dtype=np.float32)

# Compute the sum of all images
for file in image_files:
    img = cv2.imread(os.path.join(image_folder, file), cv2.IMREAD_COLOR)
    if img is None:
        print(f"Warning: Could not read {file}")
        continue
    mean_image += img.astype(np.float32)  # Convert to float to prevent overflow

# Compute the mean by dividing by the number of images
mean_image /= num_images

# Convert back to uint8 format for visualization and saving
mean_image = np.clip(mean_image, 0, 255).astype(np.uint8)

# Save the mean image
cv2.imwrite("mean_image.png", mean_image)

# Display the mean image (optional)
cv2.imshow("Mean Image", mean_image)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [39]:
import cv2
import numpy as np
import os

# Paths
image_folder = "truncated_captchas"  # Change this to your actual folder
output_folder = "filtered_truncated_captchas"  # Folder to save cleaned images
mean_image_path = "mean_image.png"  # Path to saved mean image

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the mean image
mean_image = cv2.imread(mean_image_path, cv2.IMREAD_COLOR).astype(np.float32)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith('.png')]  # Change extension if needed

# Process each image
for file in image_files:
    img_path = os.path.join(image_folder, file)
    output_path = os.path.join(output_folder, file)
    
    # Load the image
    img = cv2.imread(img_path, cv2.IMREAD_COLOR).astype(np.float32)

    # Subtract the mean image
    cleaned_img = img - mean_image

    # Normalize back to 0-255
    cleaned_img = np.clip(cleaned_img, 0, 255).astype(np.uint8)

    # Save the cleaned image
    cv2.imwrite(output_path, cleaned_img)

print(f"Cleaned images saved in {output_folder}")


Cleaned images saved in filtered_truncated_captchas


In [40]:
import cv2
import numpy as np
import torch
import pandas as pd


def draw_labels_on_image(image_path, labels_file):
    """
    Loads an image from image_path, retrieves ground truth (x1, y1, x2, y2) from labels.txt,
    and displays the image with red circles at those coordinates.

    Args:
        image_path (str): Path to the image.
        labels_file (str): Path to the labels file (CSV or TXT with x1, y1, x2, y2).
    """
    # Load the image using OpenCV
    image = cv2.imread(image_path)
    if image is None:
        print(f"Unable to load image: {image_path}")
        return

    # Resize the image to (width=340, height=410) to match the model's expected size
    image_resized = image#cv2.resize(image, (340, 410))

    # Read labels from the file
    labels_df = pd.read_csv(labels_file)  # Ensure labels.txt is formatted correctly
    image_name = image_path.split('/')[-1]  # Extract filename from path

    # Find the row corresponding to the image name (assuming there is an 'id' or filename column)
    if 'img_name' in labels_df.columns:
        row = labels_df[labels_df['img_name'] == image_name]
    else:
        row = labels_df.iloc[0]  # If there's no ID column, just use the first row (for testing)

    if row.empty:
        print(f"No labels found for {image_name}")
        return

    # Extract ground truth coordinates
    x1, y1, x2, y2 = row[['x1', 'y1', 'x2', 'y2']].to_numpy().flatten()
    x1 = int(x1)
    x2 = int(x2)
    y1 = int(y1)
    y2 = int(y2)
    print(f"Ground truth coordinates: x1={x1}, y1={y1}, x2={x2}, y2={y2}")

    # Draw red circles at the ground truth coordinates
    image_drawn = image_resized.copy()
    cv2.circle(image_drawn, (x1, y1), radius=5, color=(0, 255, 0), thickness=-1)  # Green circle
    cv2.circle(image_drawn, (x2, y2), radius=5, color=(0, 0, 255), thickness=-1)  # Red circle

    # Display the image with the drawn points
    cv2.imshow("img",image_drawn)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

image_path = f"filtered_truncated_captchas/captcha_8.png"
labels_file = "truncated_labels.csv"
draw_labels_on_image(image_path,labels_file)

Ground truth coordinates: x1=40, y1=75, x2=115, y2=107


In [54]:
import os
import cv2
import numpy as np
from tqdm import tqdm  # Progress bar

def compute_mean_std(input_folder):
    """Compute the mean and std for each RGB channel across all images."""
    image_list = []
    
    for filename in tqdm(os.listdir(input_folder), desc="Computing mean & std"):
        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)  # Read image in BGR format
        if img is not None:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB
            image_list.append(img)

    # Convert list to a big numpy array (N, H, W, C)
    image_array = np.stack(image_list, axis=0).astype(np.float32) / 255.0

    # Compute mean and std along (N, H, W) axis → (C,)
    mean = np.mean(image_array, axis=(0, 1, 2))
    std = np.std(image_array, axis=(0, 1, 2))
    print(f"moyenne = {mean} \n std={std}")
    return mean, std

def normalize_images(input_folder, output_folder):
    """Normalize RGB channels of all images in input_folder and save them to output_folder."""
    os.makedirs(output_folder, exist_ok=True)  # Create output folder if not exists

    # Compute mean and std
    mean, std = compute_mean_std(input_folder)

    for filename in tqdm(os.listdir(input_folder), desc="Normalizing images"):
        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)  # Read image in BGR format
        if img is None:
            print(f"Skipping {filename} (unable to read)")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB
        img = img.astype(np.float32) / 255.0  # Scale to [0,1]

        # Normalize: (pixel - mean) / std
        img = (img - mean) / std

        # Convert back to 0-255 range for saving
        img = ((img - img.min()) / (img.max() - img.min()) * 255).astype(np.uint8)

        # Save processed image
        output_path = os.path.join(output_folder, filename)
        cv2.imwrite(output_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))  # Convert back to BGR

    print(f"✅ Normalized images saved in: {output_folder}")


# Example Usage
input_folder = "truncated_captchas"
output_folder = "normalized_images"
normalize_images(input_folder, output_folder)


Computing mean & std: 100%|██████████| 316/316 [00:00<00:00, 2028.29it/s]


moyenne = [0.83977205 0.8524061  0.55467314] 
 std=[0.2027646  0.18541439 0.18301369]


Normalizing images: 100%|██████████| 316/316 [00:00<00:00, 346.66it/s]

✅ Normalized images saved in: normalized_images





In [50]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import pandas as pd
def apply_existing_normalization(input_folder, output_folder, mean, std):
    """
    Applies the same mean and std normalization (computed earlier) to new images.

    Args:
        input_folder (str): Path to the folder containing new images.
        output_folder (str): Path to save the normalized images.
        mean (tuple): Mean values of the original dataset (R, G, B).
        std (tuple): Standard deviation values of the original dataset (R, G, B).
    """
    os.makedirs(output_folder, exist_ok=True)  # Create the output folder if it doesn't exist
    df = pd.DataFrame([])
    for filename in tqdm(os.listdir(input_folder), desc="Processing new images"):
        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)  # Read image in BGR format
        if img is None:
            print(f"Skipping {filename} (unable to read)")
            continue
        
        # Convert to RGB and scale pixel values to [0,1]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0

        # Apply normalization using precomputed mean & std
        img_normalized = (img - mean) / std  # Standardize the image

        # Rescale to 0-255 for saving
        img_normalized = ((img_normalized - img_normalized.min()) / (img_normalized.max() - img_normalized.min()) * 255).astype(np.uint8)
        df = pd.concat([df, pd.DataFrame({"img_name": [filename], "min": [img_normalized.min()], "max": [img_normalized.max()]})], ignore_index=True)

        # Convert back to BGR before saving (OpenCV expects BGR format)
        output_path = os.path.join(output_folder, filename)
        cv2.imwrite(output_path, cv2.cvtColor(img_normalized, cv2.COLOR_RGB2BGR))
    df.to_csv(input_folder+".csv")
    print(f"✅ Normalized images saved in: {output_folder}")

# Valeurs trouvées:
mean = (0.83977205, 0.8524061, 0.55467314)  # Dataset's mean
std = (0.2027646, 0.18541439, 0.18301369)  # Dataset's std

input_folder = "premier_perso"
output_folder = "normalized_premier_perso"
apply_existing_normalization(input_folder, output_folder, mean, std)

input_folder = "second_perso"
output_folder = "normalized_second_perso"
apply_existing_normalization(input_folder, output_folder, mean, std)


Processing new images: 100%|██████████| 316/316 [00:00<00:00, 1474.27it/s]


✅ Normalized images saved in: normalized_premier_perso


Processing new images: 100%|██████████| 316/316 [00:00<00:00, 1164.04it/s]

✅ Normalized images saved in: normalized_second_perso





In [None]:
import numpy as np
def encode_image(img, original_min, original_max):
    """
    Reverts the min-max scaling applied to an image.

    Args:
        img (numpy.ndarray): The encoded image (after min-max scaling to 255).
        original_min (float): The minimum pixel value before encoding.
        original_max (float): The maximum pixel value before encoding.

    Returns:
        numpy.ndarray: The encoded image with values standardized
    """
    img = img.astype(np.float32)  # Convert to float32 for precision

    # Reverse min-max scaling: 
    img_encoded = img / 255.0 * (original_max - original_min) + original_min

    return img_encoded


In [51]:
import pandas as pd

def standardize_columns(df, columns, output_file="mean_std.txt"):
    """
    Standardizes the selected columns of a DataFrame and saves the mean and std for each column in a text file.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list): List of column names to standardize.
        output_file (str): File name to save the mean and std values.

    Returns:
        pd.DataFrame: DataFrame with standardized columns.
    """
    mean_std_values = {}

    for col in columns:
        mean = df[col].mean()
        std = df[col].std()

        if std == 0:  # Avoid division by zero
            std = 1  

        df[col] = (df[col] - mean) / std
        mean_std_values[col] = {"mean": mean, "std": std}

    # Save mean and std to a text file
    with open(output_file, "w") as f:
        for col, values in mean_std_values.items():
            f.write(f"{col} mean: {values['mean']}, std: {values['std']}\n")

    return df

df = pd.read_csv("truncated_labels.csv")
df = standardize_columns(df,df.columns[1:])
df.to_csv("std_truncated_labels.csv", index=False)
df

Unnamed: 0,img_name,x1,y1,x2,y2
0,captcha_1.png,-1.033635,0.793751,-0.911822,-1.717953
1,captcha_10.png,-0.877801,0.750201,0.950419,-0.778652
2,captcha_100.png,0.101729,1.708298,-1.505283,1.612295
3,captcha_101.png,0.880900,0.380028,-0.809501,1.099949
4,captcha_102.png,-0.054105,0.488902,0.950419,1.206688
...,...,...,...,...,...
311,captcha_95.png,-0.298988,-1.383742,0.336493,1.441513
312,captcha_96.png,-0.989111,-1.274867,-1.095999,1.249383
313,captcha_97.png,-1.055897,1.272800,0.643456,-0.287654
314,captcha_98.png,-1.055897,-0.011921,1.339238,-1.120216


In [53]:
extract_dirs = ['normalized_images',"normalized_premier_perso","normalized_second_perso"]
for extract_dir in extract_dirs:
    image_path = f"{extract_dir}/captcha_8.png"
    image = cv2.imread(image_path)
    print(image.shape)

(210, 210, 3)
(50, 50, 3)
(50, 50, 3)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import cv2
import pandas as pd
import os
import numpy as np

class CaptchaDataset(Dataset):
    def __init__(self, extract_dirs, label_file, transform=None):
        """
        Args:
            extract_dirs (list of str): List of directories where images are stored.
            label_file (str): Path to the CSV file containing labels.
            transform (callable, optional): Transformations to apply on images.
        """
        self.extract_dirs = extract_dirs  # List of directories
        self.labels = pd.read_csv(label_file)
        self.transform = transform

    def normalize_image(self,image, mean=(0.83977205, 0.8524061, 0.55467314), std=(0.2027646, 0.18541439,0.18301369)):
        """
        Normalize the image with given mean and std for each channel.
        
        Args:
            image (np.array): The input image to be normalized.
            mean (tuple): A tuple containing the mean for each channel (R, G, B).
            std (tuple): A tuple containing the standard deviation for each channel (R, G, B).
            
        Returns:
            np.array: The normalized image.
        """
        # Normalize image by subtracting mean and dividing by std for each channel (RGB)
        image = (image - mean) / std
        return image

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Get image ID
        img_id = self.labels.iloc[idx]['img_name']
        
        # Initialize paths for the three parts
        large_img_path = None
        small_img1_path = None
        small_img2_path = None

        # Construct the paths for the images
        large_img_path = os.path.join(self.extract_dirs[0], img_id)
        small_img1_path = os.path.join(self.extract_dirs[1], img_id)
        small_img2_path = os.path.join(self.extract_dirs[2], img_id)

        # Check if we found all parts
        if not os.path.exists(large_img_path) or not os.path.exists(small_img1_path) or not os.path.exists(small_img2_path):
            print(f"Missing images for {img_id}!")
            return None

        # Load images (big and small)
        large_img = cv2.imread(large_img_path)
        small_img1 = cv2.imread(small_img1_path)
        small_img2 = cv2.imread(small_img2_path)

        # Convert to RGB (OpenCV loads as BGR by default)
        large_img = cv2.cvtColor(large_img, cv2.COLOR_BGR2RGB)
        small_img1 = cv2.cvtColor(small_img1, cv2.COLOR_BGR2RGB)
        small_img2 = cv2.cvtColor(small_img2, cv2.COLOR_BGR2RGB)

        # Apply normalization using the provided mean and std
        large_img = self.normalize_image(large_img)
        small_img1 = self.normalize_image(small_img1)
        small_img2 = self.normalize_image(small_img2)

        # Transpose to PyTorch format (C, H, W)
        large_img = np.transpose(large_img, (2, 0, 1))
        small_img1 = np.transpose(small_img1, (2, 0, 1))
        small_img2 = np.transpose(small_img2, (2, 0, 1))

        # Get labels (x1, y1, x2, y2)
        labels = self.labels.iloc[idx][['x1', 'y1', 'x2', 'y2']].values.astype(np.float32)

        # Convert to tensors
        large_img = torch.tensor(large_img)
        small_img1 = torch.tensor(small_img1)
        small_img2 = torch.tensor(small_img2)
        labels = torch.tensor(labels)

        return large_img, small_img1, small_img2, labels

# Example Usage
extract_dirs = ["normalized_images", "normalized_premier_perso", "normalized_second_perso"]
label_file = "std_truncated_labels.csv"

dataset = CaptchaDataset(extract_dirs, label_file)
train_loader = DataLoader(dataset, batch_size=8,shuffle=True)

# Test data loading
for large_img, small_img1, small_img2, labels in train_loader:
    if large_img is not None:
        print("Large image shape:", large_img.shape)   # Expected: (8, 3, 210, 210)
        print("Small image 1 shape:", small_img1.shape)  # Expected: (8, 3, 50, 50)
        print("Small image 2 shape:", small_img2.shape)  # Expected: (8, 3, 50, 50)
        print("Labels:", labels.shape)  # Expected: (8, 4)
    break  # Just checking one batch


Large image shape: torch.Size([8, 3, 210, 210])
Small image 1 shape: torch.Size([8, 3, 50, 50])
Small image 2 shape: torch.Size([8, 3, 50, 50])
Labels: torch.Size([8, 4])


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class CaptchaClickCNN(nn.Module):
    def __init__(self):
        super(CaptchaClickCNN, self).__init__()

        # Define the CNN layers for the large image (210, 210, 3)
        self.large_img_cnn = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Define the CNN layers for the small image (50, 50, 3)
        self.small_img_cnn = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # We'll calculate the exact feature dimensions in the forward pass
        # and lazily initialize the fully connected layers later
        self.fc1 = None
        self.fc2 = None
        self.fc3 = None
        self.initialized = False

    def _initialize_fc_layers(self, input_size):
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 4)
        self.initialized = True

    def forward(self, large_img, small_img1, small_img2):
        # Ensure input images are float32 for consistency
        large_img = large_img.float()
        small_img1 = small_img1.float()
        small_img2 = small_img2.float()

        # Pass large image through its CNN layers
        large_img_features = self.large_img_cnn(large_img)
        
        # Pass small images through their CNN layers
        small_img1_features = self.small_img_cnn(small_img1)
        small_img2_features = self.small_img_cnn(small_img2)

        # Flatten the CNN outputs
        large_img_features = large_img_features.view(large_img_features.size(0), -1)
        small_img1_features = small_img1_features.view(small_img1_features.size(0), -1)
        small_img2_features = small_img2_features.view(small_img2_features.size(0), -1)

        # Concatenate the features from all images
        combined_features = torch.cat((large_img_features, small_img1_features, small_img2_features), dim=1)
        
        # Lazy initialization of FC layers on first forward pass
        if not self.initialized:
            self._initialize_fc_layers(combined_features.shape[1])
            
        # Pass through fully connected layers
        x = F.relu(self.fc1(combined_features))
        x = F.dropout(x, 0.5, training=self.training)
        x = F.relu(self.fc2(x))
        output = self.fc3(x)
        
        return output


# Example usage with dimension tracking
def test_dimensions():
    # Create sample inputs
    batch_size = 8
    large_img = torch.randn(batch_size, 3, 210, 210)
    small_img1 = torch.randn(batch_size, 3, 50, 50)
    small_img2 = torch.randn(batch_size, 3, 50, 50)
    
    # Initialize the model
    model = CaptchaClickCNN()
    
    # Forward pass with print statements to track dimensions
    print(f"Large image input shape: {large_img.shape}")
    large_features = model.large_img_cnn(large_img)
    print(f"Large image after CNN shape: {large_features.shape}")
    large_flattened = large_features.view(large_features.size(0), -1)
    print(f"Large image flattened shape: {large_flattened.shape}")
    
    print(f"\nSmall image input shape: {small_img1.shape}")
    small_features = model.small_img_cnn(small_img1)
    print(f"Small image after CNN shape: {small_features.shape}")
    small_flattened = small_features.view(small_features.size(0), -1)
    print(f"Small image flattened shape: {small_flattened.shape}")
    
    # Calculate total feature size
    total_feature_size = large_flattened.shape[1] + small_flattened.shape[1] * 2
    print(f"\nTotal feature size for FC layer: {total_feature_size}")
    
    # Complete forward pass
    output = model(large_img, small_img1, small_img2)
    print(f"\nFinal output shape: {output.shape}")
    
    return model

# Define the training function
def train_model(model, train_loader, num_epochs=10, lr=1e-4):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    for epoch in range(num_epochs):
        total_loss = 0
        for large_img, small_img1, small_img2, labels in train_loader:
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(large_img, small_img1, small_img2)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            # Backpropagation and optimization
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}")
    
    return model

# To use the model with your existing train_loader:
# model = CaptchaClickCNN()
# trained_model = train_model(model, train_loader)

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import os
import time
import cv2
# Assuming CaptchaClickCNN is defined as in the previous artifact

def save_model(model, path="captcha_model.pth", save_optimizer=False, optimizer=None, epoch=None):
    """
    Save the trained model along with optional training state.
    
    Args:
        model: The trained CaptchaClickCNN model
        path: Path to save the model
        save_optimizer: Whether to save optimizer state for resuming training
        optimizer: The optimizer used during training
        epoch: Current epoch number
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'initialized': model.initialized,
    }
    
    if save_optimizer and optimizer is not None:
        checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    
    if epoch is not None:
        checkpoint['epoch'] = epoch
    
    print(f"Saving model to {path}...")
    torch.save(checkpoint, path)
    print("Model saved successfully!")


def load_model(path="captcha_model.pth", device="cuda" if torch.cuda.is_available() else "cpu", 
               load_optimizer=False, optimizer=None):
    """
    Load a saved CaptchaClickCNN model.
    
    Args:
        path: Path to the saved model
        device: Device to load the model on ('cuda' or 'cpu')
        load_optimizer: Whether to load optimizer state
        optimizer: The optimizer to load state into
        
    Returns:
        model: The loaded model
        epoch: The epoch at which the model was saved (if saved)
        optimizer: The loaded optimizer (if requested)
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"No model found at {path}")
    
    # Load checkpoint
    checkpoint = torch.load(path, map_location=device)
    
    # Create a new model instance
    model = CaptchaClickCNN()
    
    # Load model state
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # Set initialization flag
    model.initialized = checkpoint.get('initialized', False)
    
    # Move model to device
    model = model.to(device)
    model.eval()  # Set to evaluation mode
    
    # Load optimizer if requested
    if load_optimizer and 'optimizer_state_dict' in checkpoint and optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    epoch = checkpoint.get('epoch', None)
    
    print(f"Model loaded successfully from {path}")
    if epoch is not None:
        print(f"Model was saved after epoch {epoch}")
    
    return model, epoch, optimizer if load_optimizer else model


def test_model(model, test_loader, device="cuda" if torch.cuda.is_available() else "cpu"):
    """
    Test the model on the test dataset and compute metrics.
    
    Args:
        model: The trained CaptchaClickCNN model
        test_loader: DataLoader containing test data
        device: Device to run testing on
        
    Returns:
        average_loss: Mean Square Error on test set
        predictions: Model predictions
        ground_truth: True labels
    """
    model.eval()  # Set the model to evaluation mode
    model = model.to(device)
    
    criterion = nn.MSELoss()
    total_loss = 0.0
    all_predictions = []
    all_labels = []
    
    print("Testing model...")
    start_time = time.time()
    
    with torch.no_grad():  # No need to track gradients during testing
        for large_img, small_img1, small_img2, labels in test_loader:
            # Move data to device
            large_img = large_img.to(device)
            small_img1 = small_img1.to(device)
            small_img2 = small_img2.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(large_img, small_img1, small_img2)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            total_loss += loss.item() * large_img.size(0)
            
            # Store predictions and labels
            all_predictions.extend(outputs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate average loss
    average_loss = total_loss / len(test_loader.dataset)
    
    # Convert to numpy arrays
    predictions = np.array(all_predictions)
    ground_truth = np.array(all_labels)
    
    test_time = time.time() - start_time
    print(f"Testing completed in {test_time:.2f} seconds")
    print(f"Average MSE Loss: {average_loss:.4f}")
    
    # Calculate Euclidean distance error for click predictions
    if predictions.shape[1] == 4:  # If we have x1, y1, x2, y2 coordinates
        # Calculate Euclidean distance for first point (x1, y1)
        dist1 = np.sqrt(np.square(predictions[:, 0] - ground_truth[:, 0]) + 
                        np.square(predictions[:, 1] - ground_truth[:, 1]))
        
        # Calculate Euclidean distance for second point (x2, y2)
        dist2 = np.sqrt(np.square(predictions[:, 2] - ground_truth[:, 2]) + 
                        np.square(predictions[:, 3] - ground_truth[:, 3]))
        
        # Average distance error
        avg_dist_error = (np.mean(dist1) + np.mean(dist2)) / 2
        print(f"Average Distance Error: {avg_dist_error:.2f} pixels")
    
    return average_loss, predictions, ground_truth


def visualize_predictions(model, sample_data, num_samples=5, device="cuda" if torch.cuda.is_available() else "cpu"):
    """
    Visualize model predictions on sample images.
    
    Args:
        model: Trained CaptchaClickCNN model
        sample_data: Sample data for visualization (large_img, small_img1, small_img2, coords)
        num_samples: Number of samples to visualize
        device: Device to run inference on
    """
    model.eval()
    model = model.to(device)
    
    plt.figure(figsize=(15, num_samples * 5))
    
    with torch.no_grad():
        for i in range(min(num_samples, len(sample_data))):
            large_img, small_img1, small_img2, true_coords = sample_data[i]
            
            # Add batch dimension
            large_img = large_img.unsqueeze(0).to(device)
            small_img1 = small_img1.unsqueeze(0).to(device)
            small_img2 = small_img2.unsqueeze(0).to(device)
            
            # Get model prediction
            pred_coords = model(large_img, small_img1, small_img2).cpu().numpy()[0]
            
            # Convert large image tensor to numpy for display
            large_img_np = large_img[0].cpu().permute(1, 2, 0).numpy()
            
            # Normalize image for display if needed
            if large_img_np.max() > 1.0:
                large_img_np = large_img_np / 255.0
            
            # Plot the image
            plt.subplot(num_samples, 3, i*3 + 1)
            plt.imshow(large_img_np)
            plt.title(f"Large Image {i+1}")
            
            # Plot true coordinates
            plt.plot(true_coords[0], true_coords[1], 'go', markersize=10, label='True Point 1')
            plt.plot(true_coords[2], true_coords[3], 'bo', markersize=10, label='True Point 2')
            
            # Plot predicted coordinates  
            plt.plot(pred_coords[0], pred_coords[1], 'rx', markersize=10, label='Pred Point 1')
            plt.plot(pred_coords[2], pred_coords[3], 'mx', markersize=10, label='Pred Point 2')
            
            if i == 0:
                plt.legend()
            
            # Display the small images
            plt.subplot(num_samples, 3, i*3 + 2)
            small_img1_np = small_img1[0].cpu().permute(1, 2, 0).numpy()
            if small_img1_np.max() > 1.0:
                small_img1_np = small_img1_np / 255.0
            plt.imshow(small_img1_np)
            plt.title(f"Small Image 1")
            
            plt.subplot(num_samples, 3, i*3 + 3)
            small_img2_np = small_img2[0].cpu().permute(1, 2, 0).numpy()
            if small_img2_np.max() > 1.0:
                small_img2_np = small_img2_np / 255.0
            plt.imshow(small_img2_np)
            plt.title(f"Small Image 2")
    
    plt.tight_layout()
    plt.savefig("captcha_predictions.png")
    plt.show()


def inference_on_new_images(model, large_img_path, small_img1_path, small_img2_path, 
                            device="cuda" if torch.cuda.is_available() else "cpu"):
    """
    Run inference on new images loaded from disk.
    
    Args:
        model: Trained CaptchaClickCNN model
        large_img_path: Path to large image file
        small_img1_path: Path to first small image file
        small_img2_path: Path to second small image file
        device: Device to run inference on
        
    Returns:
        predicted_coords: Predicted coordinates [x1, y1, x2, y2]
    """
    model.eval()
    model = model.to(device)

    def normalize_image(image, mean=(0.83977205, 0.8524061, 0.55467314), std=(0.2027646, 0.18541439, 0.18301369)):
        # Convert the image to a NumPy array
        image = np.array(image).astype(np.float32)
        
        # Normalize image by subtracting mean and dividing by std for each channel (RGB)
        image = (image - mean) / std
        
        return image

    # Load and preprocess images
    def load_and_preprocess(img_path, target_size):
        # Open the image, convert it to RGB
        img = Image.open(img_path).convert('RGB')
        
        # Resize the image to the target size
        img = img.resize(target_size)
        
        # Normalize the image
        img = normalize_image(img)
        
        # Convert to torch tensor (channel-first format: C x H x W)
        img_tensor = torch.FloatTensor(img).permute(2, 0, 1)  # Convert from HWC to CHW
        return img_tensor
    
    # Load and preprocess all images
    large_img = load_and_preprocess(large_img_path, (210, 210))
    small_img1 = load_and_preprocess(small_img1_path, (50, 50))
    small_img2 = load_and_preprocess(small_img2_path, (50, 50))
    
    # Add batch dimension (make them 4D tensors)
    large_img = large_img.unsqueeze(0).to(device)
    small_img1 = small_img1.unsqueeze(0).to(device)
    small_img2 = small_img2.unsqueeze(0).to(device)
    
    # Run inference
    with torch.no_grad():
        predicted_coords = model(large_img, small_img1, small_img2).cpu().numpy()[0]
    
    print("Predicted coordinates:", predicted_coords)
    return predicted_coords


# Example usage:
if __name__ == "__main__":
    # Create a dummy model for demonstration
    model = CaptchaClickCNN()
    
    # Create optimizer for demonstration
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    
    # Example: Save model after training
    # save_model(model, path="captcha_model.pth", save_optimizer=True, optimizer=optimizer, epoch=10)
    
    # Example: Load model for inference
    model = load_model(path="captcha_model.pth")
    
    # Example: Test model (assuming you have a test_loader)
    # test_loss, predictions, ground_truth = test_model(model, test_loader)
    
    # Example: Run inference on new images
    predicted_coords = inference_on_new_images(
        model[0], 
        large_img_path="normalized_images/captcha_1.png",
        small_img1_path="normalized_premier_perso/captcha_1.png",
        small_img2_path="normalized_second_perso/captcha_1.png"
    )
    
    print("Script executed successfully!")

Model loaded successfully from captcha_model.pth
Model was saved after epoch 10
Predicted coordinates: [-5.791699  10.908616   4.1962066 48.42848  ]
Script executed successfully!


: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms as transforms
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image

# First import your CaptchaClickCNN model definition
# from captcha_model import CaptchaClickCNN

# Sample Dataset class - replace with your actual dataset
class CaptchaDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        """
        Dummy dataset - replace with your actual dataset loading logic
        
        Args:
            data_dir: Directory containing your data
            transform: Optional transforms to apply to images
        """
        self.data_dir = data_dir
        self.transform = transform
        
        # In a real scenario, you would load your data here
        # For demonstration, we'll create dummy data
        self.data_size = 100  # Number of samples
        
    def __len__(self):
        return self.data_size
        
    def __getitem__(self, idx):
        # In a real scenario, you would load your images and labels here
        # For demonstration, we'll create random data
        
        # Create dummy images
        large_img = torch.randn(3, 210, 210)
        small_img1 = torch.randn(3, 50, 50)
        small_img2 = torch.randn(3, 50, 50)
        
        # Create dummy target coordinates
        # Coordinates should be within the image dimensions
        coords = torch.tensor([
            np.random.randint(0, 210),  # x1
            np.random.randint(0, 210),  # y1
            np.random.randint(0, 210),  # x2
            np.random.randint(0, 210)   # y2
        ], dtype=torch.float32)
        
        return large_img, small_img1, small_img2, coords


# Main workflow function
def captcha_model_workflow():
    # Set random seed for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Create dataset and dataloaders
    # Replace with your actual dataset path
    dataset = CaptchaDataset(data_dir="./data")
    
    # Split dataset into train, validation, and test
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size]
    )
    
    # Create dataloaders
    batch_size = 8
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Create model
    model = CaptchaClickCNN().to(device)
    
    # Define loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    
    # Training parameters
    num_epochs = 20
    best_val_loss = float('inf')
    model_save_path = "captcha_model_best.pth"
    
    # Training loop
    print("Starting training...")
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        
        for large_img, small_img1, small_img2, labels in train_loader:
            # Move data to device
            large_img = large_img.to(device)
            small_img1 = small_img1.to(device)
            small_img2 = small_img2.to(device)
            labels = labels.to(device)
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(large_img, small_img1, small_img2)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * large_img.size(0)
        
        train_loss = train_loss / len(train_loader.dataset)
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for large_img, small_img1, small_img2, labels in val_loader:
                # Move data to device
                large_img = large_img.to(device)
                small_img1 = small_img1.to(device)
                small_img2 = small_img2.to(device)
                labels = labels.to(device)
                
                # Forward pass
                outputs = model(large_img, small_img1, small_img2)
                
                # Calculate loss
                loss = criterion(outputs, labels)
                val_loss += loss.item() * large_img.size(0)
        
        val_loss = val_loss / len(val_loader.dataset)
        
        # Print epoch statistics
        print(f"Epoch {epoch+1}/{num_epochs} - "
              f"Train Loss: {train_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}")
        
        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_model(model, model_save_path, save_optimizer=True, 
                      optimizer=optimizer, epoch=epoch)
            print(f"Saved best model so far with validation loss: {val_loss:.4f}")
    
    print("Training completed!")
    
    # Load the best model for testing
    model, _, _ = load_model(model_save_path, device=device)
    
    # Test the model
    test_loss, predictions, ground_truth = test_model(model, test_loader, device=device)
    
    # Save some sample test data for visualization
    sample_data = []
    for i, (large_img, small_img1, small_img2, labels) in enumerate(test_loader):
        if i >= 5:  # Get 5 samples
            break
        sample_data.append((large_img[0], small_img1[0], small_img2[0], labels[0]))
    
    # Visualize predictions
    visualize_predictions(model, sample_data, device=device)
    
    print("Workflow completed successfully!")


if __name__ == "__main__":
    captcha_model_workflow()