In [35]:
import cv2
import numpy as np
import torch
import pandas as pd


def draw_labels_on_image(image_path, labels_file):
    """
    Display an image with its two ground-truth points drawn on it.

    The labels file is read with ``pandas.read_csv``. When it has an
    ``img_name`` column, the first row whose ``img_name`` equals the file name
    of ``image_path`` is used; otherwise the first row of the file is used.
    ``(x1, y1)`` is drawn as a filled green circle and ``(x2, y2)`` as a filled
    red circle. The image is shown in an OpenCV window until a key is pressed.

    Args:
        image_path (str): Path to the image. Forward and backward slashes are
            both accepted as separators when extracting the file name.
        labels_file (str): Path to a CSV-formatted labels file with columns
            x1, y1, x2, y2 (and optionally img_name).

    Returns:
        None. An unreadable image or a missing label is reported with
        ``print`` and the function returns early.
    """
    # cv2.imread returns None (it does not raise) when the file cannot be read
    image = cv2.imread(image_path)
    if image is None:
        print(f"Unable to load image: {image_path}")
        return

    labels_df = pd.read_csv(labels_file)

    # Normalize separators first so Windows-style paths also yield the bare file name
    image_name = image_path.replace("\\", "/").rsplit("/", 1)[-1]

    if 'img_name' in labels_df.columns:
        matches = labels_df[labels_df['img_name'] == image_name]
    else:
        # No file-name column: fall back to the first row. Slicing keeps a
        # DataFrame (possibly empty) so both branches are handled alike.
        matches = labels_df.iloc[:1]

    if matches.empty:
        print(f"No labels found for {image_name}")
        return

    # Only the first match is used; duplicated img_name rows would otherwise
    # yield more than four values to unpack.
    x1, y1, x2, y2 = (int(value) for value in matches.iloc[0][['x1', 'y1', 'x2', 'y2']])
    print(f"Ground truth coordinates: x1={x1}, y1={y1}, x2={x2}, y2={y2}")

    # Draw on a copy so the loaded image itself is left untouched (colors are BGR)
    image_drawn = image.copy()
    cv2.circle(image_drawn, (x1, y1), radius=5, color=(0, 255, 0), thickness=-1)  # Green: first point
    cv2.circle(image_drawn, (x2, y2), radius=5, color=(0, 0, 255), thickness=-1)  # Red: second point

    # Blocks until a key is pressed, then closes the window
    cv2.imshow("img",image_drawn)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# Show captcha 8 twice: the original image with the original labels, then the
# cropped image with the labels shifted into the cropped frame.
extract_dir = 'extracted_captchas'
processed_images_dir = "processed_captchas"
for image_path, labels_file in (
    (f"{extract_dir}/captchas_saved/captcha_8.png", "labels.txt"),
    ("truncated_captchas/captcha_8.png", "truncated_labels.csv"),
):
    draw_labels_on_image(image_path, labels_file)


Ground truth coordinates: x1=105, y1=175, x2=180, y2=207
Ground truth coordinates: x1=40, y1=75, x2=115, y2=107


In [22]:
import cv2
import numpy as np

def show_pixels(image_path,nbPixelsHaut,nbPixelsBas,nbPixelsGauche,nbPixelsDroite):
    """
    Display a rectangular crop of an image in an OpenCV window.

    The crop is ``image[nbPixelsBas:nbPixelsHaut, nbPixelsGauche:nbPixelsDroite]``,
    i.e. rows first, then columns, with exclusive end indices.

    Args:
        image_path (str): Path to the image file.
        nbPixelsHaut (int): Row index at which the crop stops (exclusive).
        nbPixelsBas (int): Row index at which the crop starts.
        nbPixelsGauche (int): Column index at which the crop starts.
        nbPixelsDroite (int): Column index at which the crop stops (exclusive).

    Returns:
        None. An unreadable image or an empty crop is reported with ``print``.
    """
    image = cv2.imread(image_path)

    # cv2.imread returns None (it does not raise) when the file cannot be read
    if image is None:
        print(f"Error: Unable to load image at {image_path}")
        return

    truncated_image = image[nbPixelsBas:nbPixelsHaut, nbPixelsGauche:nbPixelsDroite]

    # Slicing never fails: bounds outside the image or in the wrong order give
    # an empty array, which cv2.imshow cannot display.
    if truncated_image.size == 0:
        print(
            f"Error: crop rows [{nbPixelsBas}:{nbPixelsHaut}], "
            f"columns [{nbPixelsGauche}:{nbPixelsDroite}] is empty for image of shape {image.shape}"
        )
        return

    # Blocks until a key is pressed, then closes the window
    cv2.imshow("img",truncated_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


# Image used to preview the crop window
extract_dir = 'extracted_captchas'
processed_images_dir = "processed_captchas"
labels_file = "labels.txt"
image_path = f"{extract_dir}/captchas_saved/captcha_315.png"

# Crop bounds, in the order (nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite).
# Alternative bounds kept for reference:
#   first character:  55, 5, 185, 235
#   second character: 50, 10, 265, 305
nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite = 310, 100, 65, 275  # truncated images
show_pixels(
    image_path,
    nbPixelsHaut=nbPixelsHaut,
    nbPixelsBas=nbPixelsBas,
    nbPixelsGauche=nbPixelsGauche,
    nbPixelsDroite=nbPixelsDroite,
)

In [34]:
import pandas as pd

def adjust_x_coordinates(label, nbPixelsGauche):
    """Shift an x coordinate into the cropped frame by subtracting the left offset."""
    shifted = label - nbPixelsGauche
    return shifted

def adjust_y_coordinates(label, nbPixelsBas):
    """Shift a y coordinate into the cropped frame by subtracting the first kept row."""
    shifted = label - nbPixelsBas
    return shifted

# Read the labels expressed in the coordinates of the original images
df = pd.read_csv("labels.txt")
print(df.columns)

# Coerce the four coordinate columns to numeric dtypes
df[['x1', 'y1', 'x2', 'y2']] = df[['x1', 'y1', 'x2', 'y2']].apply(pd.to_numeric)

# Crop bounds of the truncated images
nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite = 310, 100, 65, 275

# Move every point into the cropped frame; the helpers only subtract, so they
# are applied to whole columns at once.
df = df.assign(
    x1=adjust_x_coordinates(df['x1'], nbPixelsGauche),
    x2=adjust_x_coordinates(df['x2'], nbPixelsGauche),
    y1=adjust_y_coordinates(df['y1'], nbPixelsBas),
    y2=adjust_y_coordinates(df['y2'], nbPixelsBas),
)

# Persist the shifted labels next to the cropped images
df.to_csv('truncated_labels.csv', index=False)

# Show the resulting table
print(df)


Index(['img_name', 'x1', 'y1', 'x2', 'y2'], dtype='object')
            img_name          x1          y1          x2          y2
0      captcha_1.png   46.816017  146.209957   54.581169   18.639610
1     captcha_10.png   54.581169  143.991342  155.528139   67.449134
2    captcha_100.png  103.390693  192.800866   22.411255  191.691558
3    captcha_101.png  142.216450  125.133117   60.127706  165.068182
4    captcha_102.png   95.625541  130.679654  155.528139  170.614719
..               ...         ...         ...         ...         ...
311   captcha_95.png   83.423160   35.279221  122.248918  182.817100
312   captcha_96.png   49.034632   40.825758   44.597403  172.833333
313   captcha_97.png   45.706710  170.614719  138.888528   92.963203
314   captcha_98.png   45.706710  105.165584  176.604978   49.700216
315   captcha_99.png   42.378788  187.254329   23.520563  117.367965

[316 rows x 5 columns]


In [32]:
import os
import cv2

def crop_and_save_images(input_folder, output_folder="truncated_captchas",nbPixelsHaut= 330, nbPixelsBas= 55, nbPixelsGauche= 20, nbPixelsDroite = 320):
    """
    Crop every readable image of a folder and save the crops under the same file names.

    Each image is cropped to
    ``image[nbPixelsBas:nbPixelsHaut, nbPixelsGauche:nbPixelsDroite]``
    (rows first, then columns, exclusive end indices).

    Args:
        input_folder (str): Path to the folder containing the original images.
        output_folder (str): Path where the cropped images will be saved;
            created if it does not exist.
        nbPixelsHaut (int): Row index at which the crop stops (exclusive).
        nbPixelsBas (int): Row index at which the crop starts.
        nbPixelsGauche (int): Column index at which the crop starts.
        nbPixelsDroite (int): Column index at which the crop stops (exclusive).

    Returns:
        None. One line is printed per file (saved, skipped or failed).
    """
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        # cv2.imread returns None for anything it cannot decode (including non-image files)
        image = cv2.imread(input_path)
        if image is None:
            print(f"Skipping {filename} (could not load image)")
            continue

        cropped_image = image[nbPixelsBas:nbPixelsHaut, nbPixelsGauche:nbPixelsDroite]

        # Out-of-range bounds silently produce an empty array, which cannot be written
        if cropped_image.size == 0:
            print(f"Skipping {filename} (crop window is empty for image of shape {image.shape})")
            continue

        # cv2.imwrite signals failure through its return value
        if not cv2.imwrite(output_path, cropped_image):
            print(f"Failed to save cropped image: {output_path}")
            continue
        print(f"Saved cropped image: {output_path}")

# Crop presets, in the order (nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite):
#   truncated captcha field -> 310, 100, 65, 275  (folder "truncated_captchas")
#   first character         -> 55, 5, 185, 235    (folder "premier_perso"; earlier try: 50, 10, 190, 230)
#   second character        -> 55, 5, 260, 310    (folder "second_perso"; earlier try: 50, 10, 265, 305)
# Only the truncated-field preset is run here.
nbPixelsHaut, nbPixelsBas, nbPixelsGauche, nbPixelsDroite = 310, 100, 65, 275
crop_and_save_images(
    "extracted_captchas/captchas_saved",
    "truncated_captchas",
    nbPixelsHaut=nbPixelsHaut,
    nbPixelsBas=nbPixelsBas,
    nbPixelsGauche=nbPixelsGauche,
    nbPixelsDroite=nbPixelsDroite,
)


Saved cropped image: truncated_captchas\captcha_1.png
Saved cropped image: truncated_captchas\captcha_10.png
Saved cropped image: truncated_captchas\captcha_100.png
Saved cropped image: truncated_captchas\captcha_101.png
Saved cropped image: truncated_captchas\captcha_102.png
Saved cropped image: truncated_captchas\captcha_103.png
Saved cropped image: truncated_captchas\captcha_104.png
Saved cropped image: truncated_captchas\captcha_105.png
Saved cropped image: truncated_captchas\captcha_106.png
Saved cropped image: truncated_captchas\captcha_107.png
Saved cropped image: truncated_captchas\captcha_108.png
Saved cropped image: truncated_captchas\captcha_109.png
Saved cropped image: truncated_captchas\captcha_11.png
Saved cropped image: truncated_captchas\captcha_110.png
Saved cropped image: truncated_captchas\captcha_111.png
Saved cropped image: truncated_captchas\captcha_112.png
Saved cropped image: truncated_captchas\captcha_113.png
Saved cropped image: truncated_captchas\captcha_114.

In [38]:
import cv2
import numpy as np
import os

# Folder containing the cropped captcha images
image_folder = "truncated_captchas"  # Change this to your actual folder path

# Only .png files are considered
image_files = [f for f in os.listdir(image_folder) if f.endswith('.png')]  # Change extension if needed

num_images = len(image_files)
if num_images == 0:
    raise ValueError("No images found in the specified folder.")

# Sum the images in float32 so the running total cannot overflow uint8.
# The accumulator is created from the first readable image, and only images
# that were actually added are counted, so the mean stays correct when some
# files are skipped.
mean_image = None
num_loaded = 0
for file in image_files:
    img = cv2.imread(os.path.join(image_folder, file), cv2.IMREAD_COLOR)
    if img is None:
        print(f"Warning: Could not read {file}")
        continue
    if mean_image is None:
        mean_image = np.zeros(img.shape, dtype=np.float32)
    elif img.shape != mean_image.shape:
        print(f"Warning: Skipping {file} (shape {img.shape} differs from {mean_image.shape})")
        continue
    mean_image += img.astype(np.float32)
    num_loaded += 1

if num_loaded == 0:
    raise ValueError("None of the images in the specified folder could be read.")

h, w, c = mean_image.shape

# Divide by the number of images that were summed, not the number of listed files
mean_image /= num_loaded

# Back to uint8 for saving and display (the cast truncates the fractional part)
mean_image = np.clip(mean_image, 0, 255).astype(np.uint8)

# Save the mean image
cv2.imwrite("mean_image.png", mean_image)

# Display the mean image until a key is pressed
cv2.imshow("Mean Image", mean_image)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [39]:
import cv2
import numpy as np
import os

# Paths
image_folder = "truncated_captchas"  # Change this to your actual folder
output_folder = "filtered_truncated_captchas"  # Folder to save cleaned images
mean_image_path = "mean_image.png"  # Path to saved mean image

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Load the mean image; cv2.imread returns None instead of raising on failure
mean_image = cv2.imread(mean_image_path, cv2.IMREAD_COLOR)
if mean_image is None:
    raise FileNotFoundError(f"Could not read mean image: {mean_image_path}")
mean_image = mean_image.astype(np.float32)

# Only .png files are considered
image_files = [f for f in os.listdir(image_folder) if f.endswith('.png')]  # Change extension if needed

for file in image_files:
    img_path = os.path.join(image_folder, file)
    output_path = os.path.join(output_folder, file)

    img = cv2.imread(img_path, cv2.IMREAD_COLOR)
    if img is None:
        print(f"Warning: Could not read {file}")
        continue
    if img.shape != mean_image.shape:
        print(f"Warning: Skipping {file} (shape {img.shape} differs from mean image {mean_image.shape})")
        continue

    # Subtract the mean image in float so negative differences are representable
    cleaned_img = img.astype(np.float32) - mean_image

    # Clip to 0-255: pixels darker than the mean become 0, nothing is rescaled
    cleaned_img = np.clip(cleaned_img, 0, 255).astype(np.uint8)

    # Save the cleaned image
    cv2.imwrite(output_path, cleaned_img)

print(f"Cleaned images saved in {output_folder}")


Cleaned images saved in filtered_truncated_captchas


In [1]:
import cv2
import numpy as np
import torch
import pandas as pd


def draw_labels_on_image(image_path, labels_file):
    """
    Display an image with its two ground-truth points drawn on it.

    The labels file is read with ``pandas.read_csv``. When it has an
    ``img_name`` column, the first row whose ``img_name`` equals the file name
    of ``image_path`` is used; otherwise the first row of the file is used.
    ``(x1, y1)`` is drawn as a filled green circle and ``(x2, y2)`` as a filled
    red circle. The image is shown in an OpenCV window until a key is pressed.

    Args:
        image_path (str): Path to the image. Forward and backward slashes are
            both accepted as separators when extracting the file name.
        labels_file (str): Path to a CSV-formatted labels file with columns
            x1, y1, x2, y2 (and optionally img_name).

    Returns:
        None. An unreadable image or a missing label is reported with
        ``print`` and the function returns early.
    """
    # cv2.imread returns None (it does not raise) when the file cannot be read
    image = cv2.imread(image_path)
    if image is None:
        print(f"Unable to load image: {image_path}")
        return

    labels_df = pd.read_csv(labels_file)

    # Normalize separators first so Windows-style paths also yield the bare file name
    image_name = image_path.replace("\\", "/").rsplit("/", 1)[-1]

    if 'img_name' in labels_df.columns:
        matches = labels_df[labels_df['img_name'] == image_name]
    else:
        # No file-name column: fall back to the first row. Slicing keeps a
        # DataFrame (possibly empty) so both branches are handled alike.
        matches = labels_df.iloc[:1]

    if matches.empty:
        print(f"No labels found for {image_name}")
        return

    # Only the first match is used; duplicated img_name rows would otherwise
    # yield more than four values to unpack.
    x1, y1, x2, y2 = (int(value) for value in matches.iloc[0][['x1', 'y1', 'x2', 'y2']])
    print(f"Ground truth coordinates: x1={x1}, y1={y1}, x2={x2}, y2={y2}")

    # Draw on a copy so the loaded image itself is left untouched (colors are BGR)
    image_drawn = image.copy()
    cv2.circle(image_drawn, (x1, y1), radius=5, color=(0, 255, 0), thickness=-1)  # Green: first point
    cv2.circle(image_drawn, (x2, y2), radius=5, color=(0, 0, 255), thickness=-1)  # Red: second point

    # Blocks until a key is pressed, then closes the window
    cv2.imshow("img",image_drawn)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

# Overlay the cropped-frame labels on the mean-subtracted version of captcha 8
labels_file = "truncated_labels.csv"
image_path = "filtered_truncated_captchas/captcha_8.png"
draw_labels_on_image(image_path, labels_file=labels_file)

Ground truth coordinates: x1=40, y1=75, x2=115, y2=107


In [4]:
import os
import cv2
import numpy as np
from tqdm import tqdm  # Progress bar

def compute_mean_std(input_folder):
    """
    Compute the per-channel mean and standard deviation over all images of a folder.

    Every file that OpenCV can decode is converted from BGR to RGB and stacked
    into one float32 array of shape (N, H, W, C); the statistics are taken
    over the N, H and W axes, on the raw 0-255 pixel scale.

    Args:
        input_folder (str): Folder containing the images.

    Returns:
        tuple: ``(mean, std)``, two arrays with one value per RGB channel.

    Raises:
        ValueError: If the folder contains no readable image, or if the
            readable images do not all share the same shape.
    """
    image_list = []

    for filename in tqdm(os.listdir(input_folder), desc="Computing mean & std"):
        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)  # BGR; None when the file cannot be decoded
        if img is not None:
            image_list.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # Fail with an explicit message instead of the generic np.stack errors
    if not image_list:
        raise ValueError(f"No readable images found in {input_folder}")
    shapes = {img.shape for img in image_list}
    if len(shapes) > 1:
        raise ValueError(f"Images in {input_folder} do not share one shape: {sorted(shapes)}")

    # (N, H, W, C) in float32
    image_array = np.stack(image_list, axis=0).astype(np.float32)

    # Reduce over images, rows and columns, leaving one value per channel
    mean = np.mean(image_array, axis=(0, 1, 2))
    std = np.std(image_array, axis=(0, 1, 2))
    print(f"moyenne = {mean} \n std={std}")
    return mean, std

def normalize_images(input_folder, output_folder):
    """
    Standardize the RGB channels of all images in a folder and save the results.

    The per-channel mean and std are computed over ``input_folder`` with
    ``compute_mean_std``. Each image is standardized as ``(pixel - mean) / std``
    and then min-max rescaled, per image, to 0-255 so that it can be written
    as an 8-bit file under the same name in ``output_folder``.

    Args:
        input_folder (str): Folder containing the source images.
        output_folder (str): Destination folder; created if missing.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    mean, std = compute_mean_std(input_folder)
    print(mean,std)

    # A channel that is constant over the whole dataset has std == 0; dividing
    # by 1 leaves it at zero instead of producing inf/NaN.
    std = np.asarray(std)
    std = np.where(std == 0, 1, std).astype(std.dtype)

    for filename in tqdm(os.listdir(input_folder), desc="Normalizing images"):
        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)  # BGR; None when the file cannot be decoded
        if img is None:
            print(f"Skipping {filename} (unable to read)")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img.astype(np.float32)

        # Standardize: (pixel - mean) / std
        img = (img - mean) / std

        # Rescale this image's own value range to 0-255 for saving; a constant
        # image has no range to stretch and is written as all zeros.
        value_range = img.max() - img.min()
        if value_range == 0:
            img = np.zeros(img.shape, dtype=np.uint8)
        else:
            img = ((img - img.min()) / value_range * 255).astype(np.uint8)

        # OpenCV writes BGR, so convert back before saving
        output_path = os.path.join(output_folder, filename)
        cv2.imwrite(output_path, cv2.cvtColor(img, cv2.COLOR_RGB2BGR))

    print(f"✅ Normalized images saved in: {output_folder}")


# Standardize the cropped captcha fields and write them to a separate folder
input_folder, output_folder = "truncated_captchas", "normalized_images"
normalize_images(input_folder=input_folder, output_folder=output_folder)


Computing mean & std: 100%|██████████| 316/316 [00:00<00:00, 2012.76it/s]


moyenne = [214.72969 217.93405 141.7747 ] 
 std=[51.95303  47.542946 46.740856]
[214.72969 217.93405 141.7747 ] [51.95303  47.542946 46.740856]


Normalizing images: 100%|██████████| 316/316 [00:00<00:00, 367.85it/s]

✅ Normalized images saved in: normalized_images





In [5]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import pandas as pd
def apply_existing_normalization(input_folder, output_folder, mean, std):
    """
    Standardize images with precomputed channel statistics and save them as 8-bit files.

    Each readable image is converted to RGB, scaled to [0, 1], standardized as
    ``(pixel - mean) / std`` and min-max rescaled, per image, to 0-255 before
    being written under the same name in ``output_folder``. The minimum and
    maximum of the standardized image (before the 0-255 rescaling) are written
    to ``<input_folder>.csv`` so that the rescaling can be reverted later.

    Args:
        input_folder (str): Path to the folder containing new images.
        output_folder (str): Path to save the normalized images; created if missing.
        mean (tuple): Mean values of the original dataset (R, G, B), on the [0, 1] scale.
        std (tuple): Standard deviation values of the original dataset (R, G, B),
            on the [0, 1] scale.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Collected as plain records and turned into a DataFrame once at the end
    records = []
    for filename in tqdm(os.listdir(input_folder), desc="Processing new images"):
        img_path = os.path.join(input_folder, filename)
        img = cv2.imread(img_path)  # BGR; None when the file cannot be decoded
        if img is None:
            print(f"Skipping {filename} (unable to read)")
            continue

        # Convert to RGB and scale pixel values to [0,1]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0

        # Standardize with the precomputed statistics
        img_normalized = (img - mean) / std

        # Record the range BEFORE rescaling: after rescaling it is always 0..255
        # and carries no information for undoing the min-max step.
        original_min = float(img_normalized.min())
        original_max = float(img_normalized.max())
        records.append({"img_name": filename, "min": original_min, "max": original_max})

        # Rescale to 0-255 for saving; a constant image has no range to stretch
        if original_max == original_min:
            img_normalized = np.zeros(img_normalized.shape, dtype=np.uint8)
        else:
            img_normalized = ((img_normalized - original_min) / (original_max - original_min) * 255).astype(np.uint8)

        # OpenCV writes BGR, so convert back before saving
        output_path = os.path.join(output_folder, filename)
        cv2.imwrite(output_path, cv2.cvtColor(img_normalized, cv2.COLOR_RGB2BGR))

    df = pd.DataFrame(records, columns=["img_name", "min", "max"])
    df.to_csv(input_folder+".csv")
    print(f"✅ Normalized images saved in: {output_folder}")

# Channel statistics found for the dataset (passed as the R, G, B mean and std)
mean = (0.83977205, 0.8524061, 0.55467314)  # Dataset's mean
std = (0.2027646, 0.18541439, 0.18301369)  # Dataset's std

# Normalize both character crops with the same statistics
for input_folder, output_folder in (
    ("premier_perso", "normalized_premier_perso"),
    ("second_perso", "normalized_second_perso"),
):
    apply_existing_normalization(input_folder, output_folder, mean, std)


Processing new images: 100%|██████████| 316/316 [00:00<00:00, 880.68it/s]


✅ Normalized images saved in: normalized_premier_perso


Processing new images: 100%|██████████| 316/316 [00:00<00:00, 889.32it/s]

✅ Normalized images saved in: normalized_second_perso





In [None]:
import numpy as np
def encode_image(img, original_min, original_max):
    """
    Map an 8-bit min-max-scaled image back onto its original value range.

    Pixel value 0 maps to ``original_min`` and 255 maps to ``original_max``,
    with a linear mapping in between.

    Args:
        img (numpy.ndarray): Image whose values were min-max scaled to 0-255.
        original_min (float): Minimum value of the image before that scaling.
        original_max (float): Maximum value of the image before that scaling.

    Returns:
        numpy.ndarray: float32 array with values in [original_min, original_max].
    """
    # Work in float32 so the division below is not an integer operation
    unit_scaled = img.astype(np.float32) / 255.0

    # Stretch the [0, 1] values over the original span, then shift to its minimum
    value_span = original_max - original_min
    img_encoded = unit_scaled * value_span + original_min

    return img_encoded


In [51]:
import pandas as pd

def standardize_columns(df, columns, output_file="mean_std.txt"):
    """
    Standardize selected columns of a DataFrame and record the statistics used.

    Each selected column is replaced by ``(value - mean) / std`` in a copy of
    ``df``; the input DataFrame itself is left unchanged. The mean and std of
    every column are written to ``output_file``, one line per column.

    Args:
        df (pd.DataFrame): Input DataFrame.
        columns (list): Column names to standardize.
        output_file (str): File name to save the mean and std values.

    Returns:
        pd.DataFrame: New DataFrame with the selected columns standardized.
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect
    standardized = df.copy()
    mean_std_values = {}

    for col in columns:
        mean = standardized[col].mean()
        std = standardized[col].std()

        # std is 0 for a constant column and NaN for a single-row column;
        # dividing by 1 keeps the centered values finite in both cases.
        if std == 0 or pd.isna(std):
            std = 1

        standardized[col] = (standardized[col] - mean) / std
        mean_std_values[col] = {"mean": mean, "std": std}

    # Save mean and std to a text file
    with open(output_file, "w") as f:
        for col, values in mean_std_values.items():
            f.write(f"{col} mean: {values['mean']}, std: {values['std']}\n")

    return standardized

# Standardize every column after the first one and persist the result
df = pd.read_csv("truncated_labels.csv")
df = standardize_columns(df, columns=df.columns[1:])
df.to_csv("std_truncated_labels.csv", index=False)
df

Unnamed: 0,img_name,x1,y1,x2,y2
0,captcha_1.png,-1.033635,0.793751,-0.911822,-1.717953
1,captcha_10.png,-0.877801,0.750201,0.950419,-0.778652
2,captcha_100.png,0.101729,1.708298,-1.505283,1.612295
3,captcha_101.png,0.880900,0.380028,-0.809501,1.099949
4,captcha_102.png,-0.054105,0.488902,0.950419,1.206688
...,...,...,...,...,...
311,captcha_95.png,-0.298988,-1.383742,0.336493,1.441513
312,captcha_96.png,-0.989111,-1.274867,-1.095999,1.249383
313,captcha_97.png,-1.055897,1.272800,0.643456,-0.287654
314,captcha_98.png,-1.055897,-0.011921,1.339238,-1.120216


In [53]:
# Print the shape of captcha 8 in each of the three normalized folders
extract_dirs = ['normalized_images',"normalized_premier_perso","normalized_second_perso"]
for extract_dir in extract_dirs:
    image_path = f"{extract_dir}/captcha_8.png"
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None (it does not raise) when the file cannot be read
        print(f"Unable to load image: {image_path}")
        continue
    print(image.shape)

(210, 210, 3)
(50, 50, 3)
(50, 50, 3)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import cv2
import pandas as pd
import os
import numpy as np

class CaptchaDataset(Dataset):
    """
    Dataset returning, for each labelled captcha, three image tensors and four coordinates.

    For row ``idx`` of the label file, the image named by its ``img_name``
    column is loaded from each of the first three directories of
    ``extract_dirs``. Every image is converted to RGB, scaled to [0, 1],
    standardized with ``normalize_image`` and returned as a float32 tensor in
    (C, H, W) layout, together with the ``x1, y1, x2, y2`` values of the row.
    """

    def __init__(self, extract_dirs, label_file, transform=None):
        """
        Args:
            extract_dirs (list of str): Directories where images are stored;
                indices 0, 1 and 2 are used, in that order.
            label_file (str): Path to the CSV file containing labels.
            transform (callable, optional): Stored on the instance.
        """
        self.extract_dirs = extract_dirs  # List of directories
        self.labels = pd.read_csv(label_file)
        # NOTE(review): kept for callers, but nothing in this class applies it.
        self.transform = transform

    def normalize_image(self,image, mean=(0.83977205, 0.8524061, 0.55467314), std=(0.2027646, 0.18541439,0.18301369)):
        """
        Normalize the image with given mean and std for each channel.

        Args:
            image (np.array): The input image, channels on the last axis.
            mean (tuple): A tuple containing the mean for each channel (R, G, B).
            std (tuple): A tuple containing the standard deviation for each channel (R, G, B).

        Returns:
            np.array: The normalized image, ``(image - mean) / std``.
        """
        image = (image - mean) / std
        return image

    def _load_rgb(self, path):
        """Read an image file as an RGB array; raise if it is missing or unreadable."""
        bgr = cv2.imread(path)  # returns None instead of raising on failure
        if bgr is None:
            raise FileNotFoundError(f"Missing or unreadable image: {path}")
        return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    def _to_model_input(self, rgb_image):
        """Turn an 8-bit RGB (H, W, C) array into a standardized float32 (C, H, W) tensor."""
        # The default mean/std are fractions of 1, so bring pixels to [0, 1]
        # before standardizing.
        scaled = rgb_image.astype(np.float32) / 255.0
        # Subtracting a tuple of Python floats promotes to float64; cast back
        normalized = self.normalize_image(scaled).astype(np.float32)
        channels_first = np.ascontiguousarray(np.transpose(normalized, (2, 0, 1)))
        return torch.from_numpy(channels_first)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Return ``(large_img, small_img1, small_img2, labels)`` for row ``idx``.

        Raises:
            FileNotFoundError: If one of the three images is missing or
                unreadable. Raising (rather than returning None) gives a clear
                error instead of a collation failure inside a DataLoader.
        """
        row = self.labels.iloc[idx]
        img_id = row['img_name']

        large_img = self._to_model_input(self._load_rgb(os.path.join(self.extract_dirs[0], img_id)))
        small_img1 = self._to_model_input(self._load_rgb(os.path.join(self.extract_dirs[1], img_id)))
        small_img2 = self._to_model_input(self._load_rgb(os.path.join(self.extract_dirs[2], img_id)))

        # Labels (x1, y1, x2, y2) as a float32 tensor
        labels = torch.tensor(row[['x1', 'y1', 'x2', 'y2']].values.astype(np.float32))

        return large_img, small_img1, small_img2, labels

# Build the dataset from the three image folders and the standardized labels
extract_dirs = ["normalized_images", "normalized_premier_perso", "normalized_second_perso"]
label_file = "std_truncated_labels.csv"

dataset = CaptchaDataset(extract_dirs, label_file)
train_loader = DataLoader(dataset, batch_size=8,shuffle=True)

# Smoke test: look at the first batch only
for large_img, small_img1, small_img2, labels in train_loader:
    if large_img is None:
        break
    print("Large image shape:", large_img.shape)   # Expected: (8, 3, 210, 210)
    print("Small image 1 shape:", small_img1.shape)  # Expected: (8, 3, 50, 50)
    print("Small image 2 shape:", small_img2.shape)  # Expected: (8, 3, 50, 50)
    print("Labels:", labels.shape)  # Expected: (8, 4)
    break


Large image shape: torch.Size([8, 3, 210, 210])
Small image 1 shape: torch.Size([8, 3, 50, 50])
Small image 2 shape: torch.Size([8, 3, 50, 50])
Labels: torch.Size([8, 4])


In [6]:
import pandas as pd 
# Keep only the first point of each captcha and name its coordinates x / y
df = pd.read_csv("truncated_labels.csv").drop(columns=["x2", "y2"])
df.columns = ["img_name","x","y"]

# Scale both coordinates by 210 (assumed maximum width and height of the images)
df[["x", "y"]] = df[["x", "y"]] / 210.0

df.to_csv("labels.csv",index=False)
df

Unnamed: 0,img_name,x,y
0,captcha_1.png,0.222933,0.696238
1,captcha_10.png,0.259910,0.685673
2,captcha_100.png,0.492337,0.918099
3,captcha_101.png,0.677221,0.595872
4,captcha_102.png,0.455360,0.622284
...,...,...,...
311,captcha_95.png,0.397253,0.167996
312,captcha_96.png,0.233498,0.194408
313,captcha_97.png,0.217651,0.812451
314,captcha_98.png,0.217651,0.500788


In [1]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
import pandas as pd
from torchvision import transforms
class CaptchaDetectionDataset(Dataset):
    """Pairs a field image and a draw image of the same file name with an (x, y) label."""

    def __init__(self, field_folder, draw_folder, label_csv, 
                 transform_field=None, transform_draw=None, transform_label=None):
        """
        Args:
            field_folder (str): Folder containing the field images.
            draw_folder (str): Folder containing the draw images.
            label_csv (str): CSV file with the columns "img_name", "x", "y".
            transform_field (callable, optional): Applied to each field image.
            transform_draw (callable, optional): Applied to each draw image.
            transform_label (callable, optional): Applied to each label tensor.
        """
        # Where the two kinds of images live
        self.field_folder, self.draw_folder = field_folder, draw_folder

        # Label table; its row order defines the sample order
        self.labels_df = pd.read_csv(label_csv)

        # Optional per-item transforms
        self.transform_field = transform_field
        self.transform_draw = transform_draw
        self.transform_label = transform_label

        # File names, one per sample
        self.img_names = self.labels_df["img_name"].tolist()

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        """Return ``(field_img, draw_img, label)`` for sample ``idx``."""
        img_name = self.img_names[idx]

        # Both images share the file name and are read as RGB
        field_img = Image.open(os.path.join(self.field_folder, img_name)).convert('RGB')
        draw_img = Image.open(os.path.join(self.draw_folder, img_name)).convert('RGB')

        if self.transform_field:
            field_img = self.transform_field(field_img)
        if self.transform_draw:
            draw_img = self.transform_draw(draw_img)

        # The label is the (x, y) pair of the matching CSV row, as float32
        row = self.labels_df.iloc[idx]
        coordinates = [float(row["x"]), float(row["y"])]
        label = torch.tensor(coordinates, dtype=torch.float32)
        if self.transform_label:
            label = self.transform_label(label)

        return field_img, draw_img, label

# Example usage:
if __name__ == '__main__':
    from torch.utils.data import DataLoader

    # Input locations
    field_folder = "filtered_truncated_captchas"   # e.g., images of size 210x210
    draw_folder = "normalized_premier_perso"       # e.g., images of size 50x50
    label_csv = "labels.csv"          # CSV with columns: "img_name", "x", "y"

    # Image pipelines: resize, then convert to a tensor scaled to [0, 1]
    transform_field = transforms.Compose([transforms.Resize((210, 210)), transforms.ToTensor()])
    transform_draw = transforms.Compose([transforms.Resize((50, 50)), transforms.ToTensor()])

    # Dataset and a shuffling loader with batches of 8
    dataset = CaptchaDetectionDataset(
        field_folder,
        draw_folder,
        label_csv,
        transform_field=transform_field,
        transform_draw=transform_draw,
    )
    data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Print the shapes of the first batch only
    for field_img, draw_img, label in data_loader:
        print("Field image batch shape:", field_img.shape)  # Expected: (B, 3, 210, 210)
        print("Draw image batch shape:", draw_img.shape)    # Expected: (B, 3, 50, 50)
        print("Label batch shape:", label.shape)            # Expected: (B, 2)
        break


Field image batch shape: torch.Size([8, 3, 210, 210])
Draw image batch shape: torch.Size([8, 3, 50, 50])
Label batch shape: torch.Size([8, 2])


In [None]:
from torch.utils.data import random_split, DataLoader

import torch  # needed here only for the seeded generator below

# Define train-test split ratio
train_ratio = 0.8  # 80% train, 20% test
test_ratio = 1 - train_ratio

# Compute sizes; the test size is the remainder so every sample is used
dataset_size = len(dataset)
train_size = int(train_ratio * dataset_size)
test_size = dataset_size - train_size

# Seed the split so the same samples land in each subset on every run
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Print dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 252
Test dataset size: 64


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import random_split, DataLoader

# Define train-test split ratio
train_ratio = 0.8  # 80% train, 20% test
test_ratio = 1 - train_ratio

# Compute sizes; the test size is the remainder so every sample is used
dataset_size = len(dataset)
train_size = int(train_ratio * dataset_size)
test_size = dataset_size - train_size

# Seed the split so that re-running this cell trains and validates on the
# same samples every time.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): DrawLocatorNet is not defined in the visible part of this
# notebook; the cell that defines it must have been run before this one.
model = DrawLocatorNet().to(device)

# Mean squared error between predicted and target coordinates
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 10

for epoch in range(num_epochs):
    # Training phase
    model.train()
    running_loss = 0.0
    for field_img, draw_img, labels in train_loader:
        # Move data to device and ensure dtype is float32
        field_img = field_img.to(device).float()
        draw_img = draw_img.to(device).float()
        labels = labels.to(device).float()

        optimizer.zero_grad()

        # Forward pass
        outputs = model(field_img, draw_img)

        # Compute loss and update the weights
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch value is a
        # per-sample average even when the last batch is smaller.
        running_loss += loss.item() * field_img.size(0)

    epoch_train_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_train_loss:.4f}")

    # Validation phase: no gradient tracking, no weight updates
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for field_img, draw_img, labels in val_loader:
            field_img = field_img.to(device).float()
            draw_img = draw_img.to(device).float()
            labels = labels.to(device).float()

            outputs = model(field_img, draw_img)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * field_img.size(0)

    epoch_val_loss = running_val_loss / len(val_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {epoch_val_loss:.4f}")

# Save the trained weights
torch.save(model.state_dict(), "draw_locator_net.pth")
