In [1]:
import os
import os.path
import cv2
import glob
import imutils
import numpy as np

### Converting sequences of characters into single characters

In [2]:
# Input folders: the captcha images to be split into letters
SIMPLE_CAPTCHA_FOLDER = 'generated_captcha_images'
HARD_CAPTCHA_FOLDER = 'kaggle-captchas'

# Output folders: receive one sub-folder per extracted letter
SIMPLE_LETTERS_FOLDER = 'extracted_letter_images'
HARD_LETTERS_FOLDER = 'kaggle-letters'

In [3]:
def simple_captcha_preprocess(input_folder, output_folder):
    """Split every 4-letter captcha image in ``input_folder`` into single-letter images.

    The captcha text is taken from the file name (e.g. "2A2X.png" holds the
    text "2A2X"). Each extracted letter is written as a grayscale PNG to
    ``output_folder/<letter>/<6-digit counter>.png``. The per-letter counter
    starts at 1 on every call, so files left by a previous run under the same
    names are overwritten.

    Files that OpenCV cannot decode are skipped with a warning, and captchas
    for which exactly four letter regions are not found are skipped silently.

    Args:
        input_folder: Directory containing the captcha images.
        output_folder: Directory that receives one sub-folder per letter
            (created on demand).

    Returns:
        None. The result of the function is the set of files written to disk.
    """
    # Get a list of all the captcha images we need to process
    captcha_image_files = glob.glob(os.path.join(input_folder, "*"))
    counts = {}

    # loop over the image paths
    for (i, captcha_image_file) in enumerate(captcha_image_files):
        print("[INFO] processing image {}/{}".format(i + 1, len(captcha_image_files)))

        # Since the filename contains the captcha text (i.e. "2A2X.png" has the text "2A2X"),
        # grab the base filename as the text
        filename = os.path.basename(captcha_image_file)
        captcha_correct_text = os.path.splitext(filename)[0]

        # Load the image; cv2.imread returns None (it does not raise) when the
        # path is not a decodable image, which would crash cvtColor below
        image = cv2.imread(captcha_image_file)
        if image is None:
            print("[WARN] could not read image {}, skipping".format(captcha_image_file))
            continue

        # Convert it to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Add some extra padding around the image
        gray = cv2.copyMakeBorder(gray, 8, 8, 8, 8, cv2.BORDER_REPLICATE)

        # threshold the image (convert it to pure black and white)
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

        # find the contours (continuous blobs of pixels) in the image
        contours = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # Hack for compatibility with different OpenCV versions
        contours = contours[1] if imutils.is_cv3() else contours[0]

        letter_image_regions = []

        # Now we can loop through each of the contours and extract the letter
        # inside of each one
        for contour in contours:
            # Get the rectangle that contains the contour
            (x, y, w, h) = cv2.boundingRect(contour)

            # Compare the width and height of the contour to detect letters that
            # are conjoined into one chunk
            if w / h > 1.25:
                # This contour is too wide to be a single letter!
                # Split it in half into two letter regions!
                half_width = int(w / 2)
                letter_image_regions.append((x, y, half_width, h))
                letter_image_regions.append((x + half_width, y, half_width, h))
            else:
                # This is a normal letter by itself
                letter_image_regions.append((x, y, w, h))

        # If we found more or less than 4 letters in the captcha, our letter extraction
        # didn't work correctly. Skip the image instead of saving bad training data!
        if len(letter_image_regions) != 4:
            continue

        # Sort the detected letter images based on the x coordinate to make sure
        # we are processing them from left-to-right so we match the right image
        # with the right letter
        letter_image_regions.sort(key=lambda region: region[0])

        # Save out each letter as a single image
        for letter_bounding_box, letter_text in zip(letter_image_regions, captcha_correct_text):
            # Grab the coordinates of the letter in the image
            x, y, w, h = letter_bounding_box

            # Extract the letter from the padded grayscale image with a 2-pixel margin
            # around the edge. The start indices are clamped to 0: a negative start
            # would be interpreted as "count from the end" and yield an empty or
            # wrong crop. End indices past the image size are clipped by the slice.
            top = max(y - 2, 0)
            left = max(x - 2, 0)
            letter_image = gray[top:y + h + 2, left:x + w + 2]

            # Get the folder to save the image in, creating it if needed
            save_path = os.path.join(output_folder, letter_text)
            os.makedirs(save_path, exist_ok=True)

            # write the letter image to a file
            count = counts.get(letter_text, 1)
            p = os.path.join(save_path, "{}.png".format(str(count).zfill(6)))
            cv2.imwrite(p, letter_image)

            # increment the count for the current key
            counts[letter_text] = count + 1

In [4]:
# Split the simple 4-letter captchas into per-letter training images
simple_captcha_preprocess(
    input_folder=SIMPLE_CAPTCHA_FOLDER,
    output_folder=SIMPLE_LETTERS_FOLDER,
)

[INFO] processing image 1/9955
[INFO] processing image 2/9955
[INFO] processing image 3/9955
[INFO] processing image 4/9955
[INFO] processing image 5/9955
[INFO] processing image 6/9955
[INFO] processing image 7/9955
[INFO] processing image 8/9955
[INFO] processing image 9/9955
[INFO] processing image 10/9955
[INFO] processing image 11/9955
[INFO] processing image 12/9955
[INFO] processing image 13/9955
[INFO] processing image 14/9955
[INFO] processing image 15/9955
[INFO] processing image 16/9955
[INFO] processing image 17/9955
[INFO] processing image 18/9955
[INFO] processing image 19/9955
[INFO] processing image 20/9955
[INFO] processing image 21/9955
[INFO] processing image 22/9955
[INFO] processing image 23/9955
[INFO] processing image 24/9955
[INFO] processing image 25/9955
[INFO] processing image 26/9955
[INFO] processing image 27/9955
[INFO] processing image 28/9955
[INFO] processing image 29/9955
[INFO] processing image 30/9955
[INFO] processing image 31/9955
[INFO] processing

In [5]:
def captcha_kaggle_preprocess(input_folder, output_folder):
    """Split every 10-character captcha image in ``input_folder`` into single-letter images.

    The captcha text is taken from the file name (without its extension).
    Letters are located on a mask of the pixels that are NOT in the colour
    range [220, 255] on all three channels, and each letter is cropped from
    the grayscale version of the image and written as a PNG to
    ``output_folder/<letter>/<6-digit counter>.png``. The per-letter counter
    starts at 1 on every call, so files left by a previous run under the same
    names are overwritten.

    Files that OpenCV cannot decode are skipped with a warning, and captchas
    for which exactly ten letter regions are not found are skipped silently.

    Args:
        input_folder: Directory containing the captcha images.
        output_folder: Directory that receives one sub-folder per letter
            (created on demand).

    Returns:
        None. The result of the function is the set of files written to disk.
    """
    # Get a list of all the captcha images we need to process
    captcha_image_files = glob.glob(os.path.join(input_folder, "*"))
    counts = {}

    # Colour range selected by the custom mask; identical for every image,
    # so it is built once outside the loop
    lower = np.array([220, 220, 220])
    upper = np.array([255, 255, 255])

    for (i, captcha_image_file) in enumerate(captcha_image_files):
        print(f"INFO: processing image {i+1}/{len(captcha_image_files)}")

        # Extract name of the file since it contains the captcha characters
        filename = os.path.basename(captcha_image_file)
        captcha_characters = os.path.splitext(filename)[0]

        # cv2.imread returns None (it does not raise) when the path is not a
        # decodable image, which would crash cvtColor below
        img = cv2.imread(captcha_image_file)
        if img is None:
            print(f"WARN: could not read image {captcha_image_file}, skipping")
            continue

        # convert image to gray; cv2.imread yields channels in BGR order,
        # so the BGR conversion code is the matching one
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # threshold the image with a custom mask because it is the best way to
        # process these specific images: the mask marks the pixels inside the
        # colour range and the inverted threshold turns everything else white
        my_mask = cv2.inRange(img, lower, upper)
        thresh = cv2.threshold(my_mask, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

        # find letters contours
        img_contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Hack for compatibility with different OpenCV versions
        img_contours = img_contours[1] if imutils.is_cv3() else img_contours[0]

        letter_image_regions = []

        # loop through contours and extract letter inside of each one
        for contour in img_contours:
            # ignore small blobs (contour area below 90)
            area = cv2.contourArea(contour)
            if area < 90:
                continue

            # get the rectangle that contains the contour
            (x, y, w, h) = cv2.boundingRect(contour)

            # When a contour is including more than one letter (wide relative to
            # its height, or taller than 28 px), split it into a left and right half
            if (((w / h) > 1.35) and (w > 22)) or (h > 28):
                half_width = int(w / 2)
                letter_image_regions.append((x, y, half_width, h))
                letter_image_regions.append((x + half_width, y, half_width, h))
            else:
                letter_image_regions.append((x, y, w, h))

        # if we did not find 10 letters then letter extraction did not work and we skip it
        if len(letter_image_regions) != 10:
            continue

        # sort regions from left to right
        letter_image_regions.sort(key=lambda region: region[0])

        # save each letter on a separate image
        for region, letter in zip(letter_image_regions, captcha_characters):
            x, y, w, h = region

            # extract the letter from the grayscale image with a 2 pixels margin
            # around the edges. The start indices are clamped to 0: a negative
            # start would be interpreted as "count from the end" and yield an
            # empty or wrong crop. End indices past the image size are clipped.
            # NOTE: nothing is drawn on img_gray here - the crop is a view of it,
            # so drawing a bounding box in place would end up in the saved letter.
            top = max(y - 2, 0)
            left = max(x - 2, 0)
            letter_image = img_gray[top:y + h + 2, left:x + w + 2]

            # folder for this letter, created if it does not exist
            save_path = os.path.join(output_folder, letter)
            os.makedirs(save_path, exist_ok=True)

            # for numeration of images
            count = counts.get(letter, 1)

            p = os.path.join(save_path, "{}.png".format(str(count).zfill(6)))
            cv2.imwrite(p, letter_image)

            # update number of times each character was seen
            counts[letter] = count + 1

In [7]:
# Split the harder 10-character Kaggle captchas into per-letter training images
captcha_kaggle_preprocess(
    input_folder=HARD_CAPTCHA_FOLDER,
    output_folder=HARD_LETTERS_FOLDER,
)

INFO: processing image 1/10000
INFO: processing image 2/10000
INFO: processing image 3/10000
INFO: processing image 4/10000
INFO: processing image 5/10000
INFO: processing image 6/10000
INFO: processing image 7/10000
INFO: processing image 8/10000
INFO: processing image 9/10000
INFO: processing image 10/10000
INFO: processing image 11/10000
INFO: processing image 12/10000
INFO: processing image 13/10000
INFO: processing image 14/10000
INFO: processing image 15/10000
INFO: processing image 16/10000
INFO: processing image 17/10000
INFO: processing image 18/10000
INFO: processing image 19/10000
INFO: processing image 20/10000
INFO: processing image 21/10000
INFO: processing image 22/10000
INFO: processing image 23/10000
INFO: processing image 24/10000
INFO: processing image 25/10000
INFO: processing image 26/10000
INFO: processing image 27/10000
INFO: processing image 28/10000
INFO: processing image 29/10000
INFO: processing image 30/10000
INFO: processing image 31/10000
INFO: processing 