<a href="https://colab.research.google.com/github/kartika-nair/CAPTCHA-Solver/blob/master/DistortedText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Distorted Text CAPTCHA solver - using PyTorch

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet18
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

import string
from tqdm.notebook import tqdm
import cv2
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import multiprocessing as mp

import cv2
import imutils
import tqdm as tq
import pickle
import os.path
from imutils import paths
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.core import Flatten, Dense
from google.colab.patches import cv2_imshow

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [2]:
# Mount Google Drive into the Colab VM so the folders under /content/drive
# referenced by the path constants in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**PROJECT DATA**

In [3]:
# Number of CPU cores reported by multiprocessing.
cpu_count = mp.cpu_count()

# Folder with the raw CAPTCHA images; each file's base name (without
# extension) is read as that CAPTCHA's text.
IMAGES_FOLDER = "/content/drive/MyDrive/Colab Notebooks/Images"

EXTRACT_IMAGES = 0 # Set this to a truthy value to generate the per-letter images readable by the rest of the code
# Number of letter regions a CAPTCHA must split into to be kept.
NUMBER_OF_LETTERS = 4

# Folder the single-letter crops are written to and later loaded from.
EXTRACTED_FOLDER = "/content/drive/MyDrive/Colab Notebooks/EXTRACTED"

# Default side length, in pixels, of the square letter images.
IMAGE_SIZE = 32

**EXTRACT SINGLE-LETTER IMAGES FROM CAPTCHA IMAGES**

In [4]:
def extract_image(path = IMAGES_FOLDER, output = EXTRACTED_FOLDER):
  """Split every CAPTCHA image in `path` into single-letter crops under `output`.

  The base name of each file (without extension) is taken as the CAPTCHA's
  text. A CAPTCHA is kept only if it splits into exactly NUMBER_OF_LETTERS
  regions; regions are ordered left to right and paired with the characters
  of the text. Each crop is written as "<letter>_<6-digit counter>.png".

  Args:
    path: Folder containing the CAPTCHA image files.
    output: Folder that receives the letter crops (created if missing).

  Files that cannot be read or that make OpenCV raise are reported and
  skipped, so one bad image does not abort the whole run.
  """
  captcha_image_files = glob.glob(os.path.join(path, "*"))
  counts = {}

  # All crops share one flat folder; the label lives in the file name.
  save_path = os.path.join(output, "")
  os.makedirs(save_path, exist_ok=True)

  # Iterating the list directly lets tqdm show a total and an ETA.
  for captcha_image_file in tq.tqdm(captcha_image_files):
    filename = os.path.basename(captcha_image_file)
    captcha_correct_text = os.path.splitext(filename)[0]

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(captcha_image_file)
    if image is None:
      tq.tqdm.write(f"[WARN] skipping unreadable file: {captcha_image_file}")
      continue

    try:
      gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

      # Pad so contours touching the image edge still get a margin.
      gray = cv2.copyMakeBorder(gray, 8, 8, 8, 8, cv2.BORDER_REPLICATE)

      thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

      contours = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

      # The position of the contour list in the result depends on the OpenCV version.
      contours = contours[1] if imutils.is_cv3() else contours[0]

      letter_image_regions = []

      for contour in contours:
        (x, y, w, h) = cv2.boundingRect(contour)

        if w / h > 1.25:
          # Too wide for a single letter: treat it as two letters joined
          # together and split the box down the middle.
          half_width = int(w / 2)
          letter_image_regions.append((x, y, half_width, h))
          letter_image_regions.append((x + half_width, y, half_width, h))
        else:
          letter_image_regions.append((x, y, w, h))

      # Segmentation did not yield the expected letter count; the regions
      # cannot be matched to the label reliably, so drop this CAPTCHA.
      if len(letter_image_regions) != NUMBER_OF_LETTERS:
        continue

      # Left-to-right order so regions line up with the characters of the text.
      letter_image_regions.sort(key=lambda region: region[0])

      for (x, y, w, h), letter_text in zip(letter_image_regions, captcha_correct_text):
        # Clamp the 2px margin at 0: a negative start index would wrap
        # around and produce an empty crop.
        letter_image = gray[max(y - 2, 0):y + h + 2, max(x - 2, 0):x + w + 2]

        # Save EVERY letter (this must happen inside the per-letter loop).
        count = counts.get(letter_text, 1)
        p = os.path.join(save_path, "{}_{}.png".format(letter_text, str(count).zfill(6)))
        if cv2.imwrite(p, letter_image):
          counts[letter_text] = count + 1
        else:
          tq.tqdm.write(f"[WARN] could not write {p}")
    except cv2.error as e:
      # Best-effort batch job: report the failure and move on to the next file.
      tq.tqdm.write(f"[WARN] skipping {captcha_image_file}: {e}")

In [5]:
# Only (re)generate the letter crops when explicitly enabled via EXTRACT_IMAGES.
if EXTRACT_IMAGES:
  extract_image()

9955it [1:05:53,  2.52it/s]


In [6]:
def resize_to_fit(image, width = IMAGE_SIZE, height = IMAGE_SIZE):
  """Return `image` scaled and padded to exactly `width` x `height` pixels.

  The image is first resized with imutils along its larger dimension, then
  padded on both sides of each axis by replicating its border pixels, and
  finally resized to the exact target size.
  """
  src_h, src_w = image.shape[:2]

  # Fit the larger dimension to the target; imutils derives the other one.
  fit_kwargs = {"width": width} if src_w > src_h else {"height": height}
  scaled = imutils.resize(image, **fit_kwargs)

  # Split the remaining space evenly between the two sides of each axis.
  pad_x = int((width - scaled.shape[1]) / 2.0)
  pad_y = int((height - scaled.shape[0]) / 2.0)
  padded = cv2.copyMakeBorder(scaled, pad_y, pad_y, pad_x, pad_x, cv2.BORDER_REPLICATE)

  # Integer padding can leave the result a pixel short; force the exact size.
  return cv2.resize(padded, (width, height))

**CUSTOM DATASET CLASS**

In [7]:
class CAPTCHADataset(Dataset):
  """Dataset of single-letter CAPTCHA crops stored as image files.

  Each item is `(image_tensor, text)`, where `text` is the part of the file
  name before the first underscore.
  """

  # Built once for the whole class instead of on every sample.
  # NOTE(review): cv2.imread returns channels in BGR order, while these
  # mean/std triples look like the usual RGB ImageNet statistics — confirm
  # whether a BGR->RGB conversion is wanted before normalising.
  _transform_ops = transforms.Compose([
      transforms.ToTensor(),
      transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
  ])

  def __init__(self, images, data_dir = EXTRACTED_FOLDER):
    """
    Args:
      images: Sequence of image file names located inside `data_dir`.
      data_dir: Folder that contains the image files.
    """
    self.data_dir = data_dir
    self.images = images

  def __len__(self):
    """Return the number of image files in the dataset."""
    return len(self.images)

  def __getitem__(self, index):
    """Load, resize and normalise the image at `index`.

    Returns:
      Tuple `(image, text)` of the transformed image tensor and its label.

    Raises:
      FileNotFoundError: If the file is missing or cannot be decoded.
    """
    image_fn = self.images[index]
    image_path = os.path.join(self.data_dir, image_fn)
    image = cv2.imread(image_path)
    # cv2.imread returns None on failure; fail here with a clear message
    # rather than with an AttributeError inside resize_to_fit.
    if image is None:
      raise FileNotFoundError(f"Could not read image file: {image_path}")
    image = resize_to_fit(image)
    image = self.transform(image)
    text = image_fn.split("_")[0]
    return image, text

  def transform(self, image):
    """Convert an image array to a normalised tensor."""
    return self._transform_ops(image)

**ENCODE DATASET**

In [8]:
# Every entry in the extraction folder is treated as one labelled sample.
images = os.listdir(EXTRACTED_FOLDER)
image_fns_train, image_fns_test = train_test_split(images, random_state=0)

# A file's label is the part of its name before the first underscore;
# concatenating all labels gives every character present in the data.
image_ns = "".join(image.split("_")[0] for image in images)
letters = sorted(set(image_ns))

# Index 0 is the extra "-" symbol; the characters found in the data follow
# in sorted order.
vocabulary = ["-", *letters]
idx2char = dict(enumerate(vocabulary))
char2idx = {char: idx for idx, char in idx2char.items()}

**MODEL PARAMETERS**

In [9]:
# Training hyperparameters.
batch_size = 6      # samples per DataLoader batch
num_epochs = 50     # number of train/test passes over the data
lr = 0.001          # learning rate passed to the optimizer
log_interval = 100  # print the training loss every this many batches
gamma = 0.7         # decay factor passed to the StepLR scheduler
# Number of output classes: one per vocabulary entry (including "-").
num_chars = len(char2idx)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**DEFINE TEST AND TRAIN LOADER**

In [10]:
# Wrap the train/test file-name splits in datasets reading from EXTRACTED_FOLDER.
trainset = CAPTCHADataset(images = image_fns_train, data_dir =  EXTRACTED_FOLDER) 
testset = CAPTCHADataset(images = image_fns_test, data_dir = EXTRACTED_FOLDER)
# One loader worker per CPU core; only the training data is shuffled.
train_loader = DataLoader(trainset, batch_size=batch_size, num_workers=cpu_count, shuffle=True)
test_loader = DataLoader(testset, batch_size=batch_size, num_workers=cpu_count, shuffle=False)

In [11]:
# Pull a single batch for inspection / smoke-testing the model.
# Use the built-in next(): DataLoader iterators implement __next__, and the
# Python-2-style `.next()` method is not available on current PyTorch versions.
image_batch, text_batch = next(iter(train_loader))
num_chars = len(char2idx)

In [12]:
class Net(nn.Module):
  """Small CNN that classifies one letter image into `num_chars` classes.

  Two 3x3 convolutions, one 2x2 max-pool and two fully connected layers.
  `fc1` expects 12544 flattened features, i.e. 64 channels x 14 x 14, which
  is what a 32x32 input yields after the two unpadded convolutions and the
  pooling step.
  """

  def __init__(self, num_chars = num_chars):
    super().__init__()
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1)
    self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
    self.dropout1 = nn.Dropout(p=0.25)
    self.dropout2 = nn.Dropout(p=0.5)
    self.fc1 = nn.Linear(in_features=12544, out_features=128)
    self.fc2 = nn.Linear(in_features=128, out_features=num_chars)

  def forward(self, x):
    """Return per-class log-probabilities for a batch of images."""
    # Convolutional feature extractor.
    features = F.relu(self.conv1(x))
    features = F.relu(self.conv2(features))
    features = self.dropout1(F.max_pool2d(features, 2))

    # Classifier head on the flattened feature maps.
    hidden = F.relu(self.fc1(torch.flatten(features, 1)))
    logits = self.fc2(self.dropout2(hidden))
    return F.log_softmax(logits, dim=1)

In [13]:
# Smoke test: push the sample batch through a freshly initialised model to
# check that the layer shapes line up. Neither the model nor the batch is
# moved to `device` here.
modelTest = Net()
modelTest(image_batch)

tensor([[-3.7295, -3.7067, -3.5759, -3.8884, -3.7532, -3.7660, -3.8893, -3.9657,
         -3.8400, -3.6846, -3.8432, -3.8274, -3.7876, -3.7650, -4.0837, -4.1099,
         -3.6069, -3.8272, -3.8621, -3.6646, -4.0093, -3.7186, -4.1738, -4.0952,
         -3.7059, -3.8672, -4.0765, -3.8492, -3.7806, -3.9071, -3.8525, -3.8461,
         -3.7133, -3.5364, -3.7088, -3.9179, -3.8034, -3.7278, -3.9314, -3.8192,
         -3.7876, -3.6111, -3.7608, -3.5669, -3.8232],
        [-3.9640, -3.8047, -3.5731, -3.7277, -3.7980, -3.6339, -3.8768, -3.7852,
         -3.7918, -3.8851, -3.8808, -3.8785, -3.6579, -3.5244, -3.8954, -3.9758,
         -3.7337, -3.7925, -3.7934, -3.6685, -3.9604, -3.9075, -3.8709, -3.9319,
         -3.8031, -3.9915, -4.0029, -4.1132, -3.7749, -3.9844, -3.8587, -3.8530,
         -3.8359, -3.8541, -3.6955, -3.9491, -3.6976, -3.7705, -3.9908, -3.6807,
         -3.4974, -3.6970, -3.9059, -3.6375, -3.7998],
        [-3.9097, -3.6195, -3.7800, -3.9016, -3.9628, -3.7504, -3.8237, -4.1111,

In [14]:
# The model that is actually trained, placed on the selected device.
model = Net().to(device)

In [15]:
def encode_text_batch(text_batch, device):
  """Turn a batch of label strings into index tensors on `device`.

  Returns:
    Tuple `(targets, lengths)`: `targets` holds the `char2idx` index of
    every character of every label, concatenated in batch order; `lengths`
    holds the number of characters of each label.
  """
  # Per-label character counts.
  label_lengths = torch.LongTensor(list(map(len, text_batch)))

  # All labels glued together and mapped character by character.
  flat_indices = torch.LongTensor([char2idx[ch] for ch in "".join(text_batch)])

  return flat_indices.to(device), label_lengths.to(device)

In [16]:
# Adadelta over all model parameters, using the learning rate set above.
# NOTE(review): lr=0.001 is far below Adadelta's usual default of 1.0 — confirm this is intended.
optimizer = optim.Adadelta(model.parameters(), lr=lr)
# Multiplies the learning rate by `gamma` each time scheduler.step() is called.
scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

In [17]:
# Classification loss with the default reduction (mean over the batch).
# NOTE(review): Net.forward already ends in log_softmax and CrossEntropyLoss
# applies log-softmax itself; F.nll_loss would express the same thing more directly.
loss_func = nn.CrossEntropyLoss()

In [18]:
def train(epoch, train_loader = train_loader, device = device, log_interval = log_interval, model = model):
  """Run one training epoch over `train_loader`.

  Args:
    epoch: Epoch number, used only in the progress message.
    train_loader: Loader yielding `(images, label_strings)` batches.
    device: Device the batches are moved to.
    log_interval: Print the loss every this many batches.
    model: Model to optimise (updated in place via the global `optimizer`).
  """
  model.train()
  for batch_idx, (data, target) in enumerate(train_loader):
    data = data.to(device)
    # Encode the labels of THIS batch. Encoding a fixed, previously fetched
    # batch would train every step against the same labels.
    target, _ = encode_text_batch(target, device)
    # The classifier predicts one class per image, so skip batches whose
    # labels do not map to exactly one index per sample.
    if target.size(0) != data.size(0):
      continue
    optimizer.zero_grad()
    output = model(data)
    loss = loss_func(output, target)
    loss.backward()
    optimizer.step()
    if batch_idx % log_interval == 0:
      print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({round(100. * batch_idx / len(train_loader), 2)}%)]\tLoss: {round(loss.item(), 6)}')

In [19]:
def test(model = model, device = device, test_loader = test_loader):
  """Evaluate `model` on `test_loader` and print average loss and accuracy.

  Args:
    model: Model to evaluate (switched to eval mode).
    device: Device the batches are moved to.
    test_loader: Loader yielding `(images, label_strings)` batches.
  """
  model.eval()
  test_loss = 0.0
  correct = 0
  evaluated = 0
  with torch.no_grad():
    for data, target in test_loader:
      data = data.to(device)
      # Encode the labels of THIS batch, not of a previously fetched one.
      target, _ = encode_text_batch(target, device)
      # One predicted class per image: skip batches whose labels do not
      # map to exactly one index per sample.
      if target.size(0) != data.size(0):
        continue
      output = model(data)
      # loss_func returns the batch mean; weight it by the batch size so the
      # final division yields a true per-sample average.
      test_loss += loss_func(output, target).item() * data.size(0)
      pred = output.argmax(dim=1)  # index of the max log-probability
      correct += pred.eq(target).sum().item()
      evaluated += data.size(0)

  # Guard against an empty loader (or every batch being skipped).
  denominator = max(evaluated, 1)
  test_loss /= denominator

  print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, evaluated,
    100. * correct / denominator))

In [20]:
# Train and evaluate once per epoch.
for epoch in range(1, num_epochs + 1):
  train(epoch)
  test()
  # Advance the learning-rate schedule; a StepLR that is never stepped
  # leaves the learning rate unchanged for the whole run.
  scheduler.step()



error: ignored