In [21]:
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torchvision import transforms
from PIL import Image

class DrawLocatorNet(nn.Module):
    """Two-branch CNN that predicts where a small "draw" image sits in a "field" image.

    The field branch turns the field image into a spatial feature map with 128
    channels. The draw branch turns the draw image into a single 128-dimensional
    feature vector. The dot product between that vector and every spatial
    location of the field map gives a correlation map, which is converted into
    (x, y) coordinates with a soft-argmax.
    """

    def __init__(self):
        super().__init__()

        # Field branch: three conv stages, each followed by a 2x2 max-pool.
        # For a 210x210 input the spatial size goes 210 -> 105 -> 52 -> 26.
        self.field_cnn = nn.Sequential(
            *self._conv_stage(3, 32),
            nn.MaxPool2d(2),       # (B,32,105,105)
            *self._conv_stage(32, 64),
            nn.MaxPool2d(2),       # (B,64,52,52)
            *self._conv_stage(64, 128),
            nn.MaxPool2d(2),       # (B,128,26,26)
        )

        # Draw branch: same three conv stages, but the last one is followed by
        # a global average pool so the output is one vector per image.
        # For a 50x50 input the spatial size goes 50 -> 25 -> 12 -> 1.
        self.draw_cnn = nn.Sequential(
            *self._conv_stage(3, 32),
            nn.MaxPool2d(2),                 # (B,32,25,25)
            *self._conv_stage(32, 64),
            nn.MaxPool2d(2),                 # (B,64,12,12)
            *self._conv_stage(64, 128),
            nn.AdaptiveAvgPool2d((1, 1)),    # (B,128,1,1)
        )

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the [Conv3x3 (same padding), BatchNorm, ReLU] layers of one stage."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]

    def forward(self, field_img, draw_img):
        """
        Args:
            field_img: Tensor of shape (B,3,210,210)
            draw_img: Tensor of shape (B,3,50,50)
        Returns:
            Tensor of shape (B,2) holding the predicted (x, y) position. Each
            value is the expected feature-map index divided by the feature-map
            width (x) or height (y), so it lies in [0, 1).
        """
        field_map = self.field_cnn(field_img)               # (B,128,H,W)
        draw_vec = self.draw_cnn(draw_img).flatten(1)       # (B,128)

        # Dot product between the draw vector and the field features at every
        # spatial location: broadcast the vector over (H, W), sum over channels.
        scores = (field_map * draw_vec[:, :, None, None]).sum(dim=1)  # (B,H,W)
        batch, height, width = scores.shape

        # Softmax over all H*W locations -> probability map of shape (B,H,W).
        weights = F.softmax(scores.reshape(batch, -1), dim=1).reshape(batch, height, width)

        # Row / column index of every cell of the map, on the same device.
        rows, cols = torch.meshgrid(
            torch.arange(height, device=scores.device),
            torch.arange(width, device=scores.device),
            indexing='ij',
        )

        # Soft-argmax: probability-weighted mean of the cell indices.
        expected_col = (weights * cols.float()).flatten(1).sum(dim=1)
        expected_row = (weights * rows.float()).flatten(1).sum(dim=1)

        # Normalise by the feature-map size and pack as (x, y).
        return torch.stack([expected_col / width, expected_row / height], dim=1)


def get_information_zone(img):
    """Crop the three fixed regions of interest out of a captcha image.

    Args:
        img: Image array indexed as img[row, column, ...]. It must be at least
            310 rows by 305 columns for the crops to have their full size.

    Returns:
        (field_img, draw1_img, draw2_img), each a slice of ``img``:
            field_img: rows 100-309, columns 65-274  (210x210)
            draw1_img: rows 5-54,    columns 185-234 (50x50)
            draw2_img: rows 10-49,   columns 265-304 (40x40)
    """
    # Each region is described by a (row slice, column slice) pair.
    field_rows, field_cols = slice(100, 310), slice(65, 275)
    draw1_rows, draw1_cols = slice(5, 55), slice(185, 235)
    draw2_rows, draw2_cols = slice(10, 50), slice(265, 305)

    return (
        img[field_rows, field_cols],
        img[draw1_rows, draw1_cols],
        img[draw2_rows, draw2_cols],
    )


def normalize_img(img, mean = (0.83977205, 0.8524061, 0.55467314), std = (0.2027646, 0.18541439, 0.18301369)):
    """Standardise an image per channel, then rescale it to the 0-255 uint8 range.

    Args:
        img: BGR image as accepted by ``cv2.cvtColor`` (e.g. from ``cv2.imread``).
        mean: Per-channel mean, in RGB order, subtracted from every pixel.
            The default comes from a study of the dataset used to train the model.
        std: Per-channel standard deviation, in RGB order, each pixel is
            divided by. Same origin as ``mean``.

    Returns:
        RGB uint8 array with the same height and width as ``img``. Values are
        min-max rescaled over the whole image, so the smallest standardised
        value maps to 0 and the largest to 255. If every standardised value is
        identical, an all-zero image is returned.
    """
    # NOTE(review): the default mean/std look like statistics of [0,1]-scaled
    # pixels, but they are applied here to 0-255 values. Confirm this matches
    # the preprocessing used at training time before changing it.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)

    # Per-channel standardisation: (pixel - mean) / std
    standardised = (rgb - mean) / std

    lowest, highest = standardised.min(), standardised.max()
    if highest == lowest:
        # Zero dynamic range: the rescale below would compute 0/0 and cast
        # NaN to uint8. Return a well-defined all-zero image instead.
        return np.zeros(standardised.shape, dtype=np.uint8)

    # Min-max rescale back to the 0-255 range.
    return ((standardised - lowest) / (highest - lowest) * 255).astype(np.uint8)


def make_predictions(field_img, draw1_img, draw2_img, model_path, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Predict where each of the two draw images is located in the full captcha.

    Args:
        field_img: Field crop, in a format accepted by ``transforms.ToTensor``
            (presumably an HxWx3 uint8 array; the model documents 210x210).
        draw1_img: First draw crop, same format.
        draw2_img: Second draw crop, same format.
        model_path: Path to a ``DrawLocatorNet`` state dict saved with ``torch.save``.
        device: Device used both to load the weights and to run inference.

    Returns:
        (x1, y1, x2, y2) as ints: the predicted position for draw 1 and draw 2
        in pixels, shifted by the field crop origin (65 px from the left,
        100 px from the top).
    """
    to_tensor = transforms.ToTensor()  # (H, W, C) -> (C, H, W) tensor
    field_tensor = to_tensor(field_img).unsqueeze(0).to(device)  # (1, 3, H, W)
    draw1_tensor = to_tensor(draw1_img).unsqueeze(0).to(device)  # (1, 3, h1, w1)
    draw2_tensor = to_tensor(draw2_img).unsqueeze(0).to(device)  # (1, 3, h2, w2)

    # The model returns coordinates normalised by the field size, so the pixel
    # scale is taken from the field tensor itself instead of a hard-coded 210.
    field_h, field_w = field_tensor.shape[-2:]

    model = DrawLocatorNet()
    # Load the weights onto the requested device; previously map_location was
    # derived independently of the `device` argument.
    # NOTE(security): torch.load unpickles the file. Only load checkpoints from
    # a trusted source (consider weights_only=True if the installed torch
    # version supports it).
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Run inference once per draw image against the same field image.
    with torch.no_grad():
        pred1_coords = model(field_tensor, draw1_tensor).cpu().numpy()[0]  # Shape: (2,)
        pred2_coords = model(field_tensor, draw2_tensor).cpu().numpy()[0]  # Shape: (2,)

    # Normalised (x, y) -> pixels inside the field crop.
    pred_x1, pred_y1 = pred1_coords[0] * field_w, pred1_coords[1] * field_h
    pred_x2, pred_y2 = pred2_coords[0] * field_w, pred2_coords[1] * field_h
    print(pred_x1, pred_y1, pred_x2, pred_y2)

    # Shift by the field crop origin to get coordinates in the full image.
    field_top, field_left = 100, 65
    x1, x2 = int(pred_x1) + field_left, int(pred_x2) + field_left
    y1, y2 = int(pred_y1) + field_top, int(pred_y2) + field_top
    return x1, y1, x2, y2


def run(img, model_path = "Dev/training/e100_m20_g50.pth", *, show=True):
    """Locate both draw images of a captcha and return their coordinates.

    The image is cropped with ``get_information_zone``, each crop is passed
    through ``normalize_img`` and the results are given to ``make_predictions``.

    Args:
        img: Full captcha image in BGR order (e.g. from ``cv2.imread``).
        model_path: Path to the trained model weights.
        show: When True (the default), display each preprocessed crop in an
            OpenCV window and wait for a key press before continuing. Pass
            False to skip the previews, e.g. when no display is available.

    Returns:
        (x1, y1, x2, y2) as returned by ``make_predictions``.
    """
    crops = get_information_zone(img)

    # normalize_img returns RGB; convert back to BGR so the arrays are in
    # OpenCV's channel order for display and prediction.
    field_img, draw1_img, draw2_img = (
        cv2.cvtColor(normalize_img(crop), cv2.COLOR_RGB2BGR) for crop in crops
    )

    if show:
        # `preview` rather than `img`, so the function argument is not shadowed.
        for preview in (field_img, draw1_img, draw2_img):
            cv2.imshow("img", preview)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

    return make_predictions(field_img, draw1_img, draw2_img, model_path)


# Example: run the locator on one saved captcha and print the coordinates.
# Raw string so the backslashes are never read as escape sequences
# ('\e' and '\c' are invalid escapes in a normal string literal).
image_path = r'Dev\extracted_captchas\captchas_saved\captcha_87.png'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError(f"Could not read image: {image_path}")
print(run(img=image))


8.082991 84.090775 50.159946 87.09453
(73, 184, 115, 187)
