We use model v5 to solve real captcha images. We then check whether each image was solved correctly, i.e. whether the site accepts the submitted answer. If it was, we add the image, labelled with the predicted text, to a new training dataset.

In [10]:
%matplotlib inline

import sys
sys.path.append("../../")

from pathlib import Path
import io
import urllib.parse as urlparse
import string
import random

import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds

from tqdm.auto import trange
from tqdm.auto import tqdm
import requests
from scrapy.http import TextResponse
from PIL import Image

import captchanet

# Root folder holding every captchanet artefact (datasets and models).
data_dir = Path('/home/hadim/.data/Neural_Network/captchanet')

# Layout of the new dataset: one folder per split, tokenizer stored alongside.
dataset_dir = data_dir / 'dataset_v6'
train_data_dir = dataset_dir / 'training'
val_data_dir = dataset_dir / 'validation'
tokenizer_path = dataset_dir / "tokenizer.json"

# Create the folders up front; existing ones are left untouched.
for directory in (dataset_dir, train_data_dir, val_data_dir):
  directory.mkdir(exist_ok=True, parents=True)

In [8]:
# Load the tokenizer and the trained model used to solve the captchas.

model_dir = Path("/home/hadim/.data/Neural_Network/captchanet/model/")
model_name = 'v5'
model_path = model_dir / model_name

# The tokenizer is restored through keras_preprocessing; the equivalent
# tf.keras.preprocessing.text.tokenizer_from_json call was left disabled
# in the original cell.
from keras_preprocessing import text

tokenizer_json = (model_path / 'tokenizer.json').read_text()
tokenizer = text.tokenizer_from_json(tokenizer_json)

model = tf.keras.models.load_model(str(model_path / 'model'))

def captcha_solver(image):
  """Predict the text shown in a single captcha image.

  The image is converted to an array, wrapped in a batch of one,
  standardized per image (`tf.image.per_image_standardization`), resized
  to 224x224 and passed through the global `model`. The predicted token
  ids are decoded with the global `tokenizer`.

  Args:
    image: anything `np.array` can convert to a pixel array (the notebook
      passes a PIL image).

  Returns:
    The decoded word, with the spaces inserted by the tokenizer removed.
  """
  target_size = (224, 224)

  # Build a batch holding only this image.
  pixels = np.array(image)
  single_batch = np.expand_dims(pixels, axis=0)

  # Preprocess: float cast, per-image standardization, then resize.
  batch = tf.image.resize(
    tf.image.per_image_standardization(tf.cast(single_batch, 'float32')),
    target_size,
  )

  # Run inference.
  predictions = model(batch)

  # Pick the best token along axis 2
  # (assumes output is (batch, position, vocabulary) — TODO confirm).
  token_ids = tf.argmax(predictions, axis=2).numpy()

  # Decode the ids and drop the separators between characters.
  decoded = tokenizer.sequences_to_texts(token_ids)
  return decoded[0].replace(' ', '')

In [11]:
root_url = "https://www.referendum.interieur.gouv.fr/consultation_publique/8/"

n_images = 5000
n = n_images * 3
success = 0
rate = 0
train_size = 0.8
word_length_max = 10

# Seconds to wait for the server before giving up on a request, so that a
# stalled connection cannot block the whole loop forever.
request_timeout = 30

# NOTE(review): hardcoded cookie sent with every request. It looks like a
# session-specific Incapsula value — presumably it expires and has to be
# refreshed by hand; consider loading it from the environment instead.
incapsula_cookies = {
  'incap_ses_700_2043128': 'z00hQBP2mS515CRSzvK2CarcKV0AAAAA54RCSzqGm0bptUV4OJjvlA==',
}


def fetch_page(session, url, form_data=None, timeout=request_timeout):
  """Fetch `url` and return it as a scrapy `TextResponse`.

  Args:
    session: the `requests.Session` used to send the request.
    url: page to request.
    form_data: when given, the dict is sent as the body of a POST;
      otherwise a GET is issued.
    timeout: seconds passed to requests as the request timeout.

  Returns:
    A `TextResponse` built from the final URL and the body text.

  Raises:
    RuntimeError: if the page embeds an iframe whose `src` mentions
      Incapsula.
  """
  if form_data is None:
    req = session.get(url, timeout=timeout)
  else:
    req = session.post(url, data=form_data, timeout=timeout)
  response = TextResponse(str(req.url), body=req.text, encoding='utf-8')

  iframe_src = response.css('iframe').xpath('@src').get()
  if iframe_src and 'Incapsula' in iframe_src:
    raise RuntimeError("Incapsula issue.")
  return response


# Store the tokenizer next to the dataset it was used to encode.
with open(tokenizer_path, 'w') as f:
  f.write(tokenizer.to_json())

# Continue numbering after the files already present in each split folder.
train_file_index = len(list(train_data_dir.iterdir()))
val_file_index = len(list(val_data_dir.iterdir()))

for i in trange(n):

  # Request a random page of the site: "<letter>/<letter><letter>".
  letter1, letter2, letter3 = (random.choice(string.ascii_uppercase) for _ in range(3))
  url = urlparse.urljoin(root_url, f'{letter1}/{letter2}{letter3}')

  # One fresh session per attempt, closed as soon as the attempt is over so
  # connection pools do not pile up across the iterations.
  with requests.Session() as session:
    requests.utils.add_dict_to_cookiejar(session.cookies, incapsula_cookies)

    # Get the page presenting the captcha.
    response = fetch_page(session, url)

    # Get the image captcha URL
    captcha_uri = response.css('img#captcha').xpath('@src').get()
    if not captcha_uri:
      raise RuntimeError("Captcha not here.")
    captcha_url = urlparse.urljoin(root_url, captcha_uri)

    # Get the token
    token = response.css('#form__token').xpath('@value').get()

    # Download the image
    req = session.get(captcha_url, timeout=request_timeout)
    captcha_image = Image.open(io.BytesIO(req.content))

    # Solve the captcha
    captcha_solution = captcha_solver(captcha_image)

    # Send captcha solution and get the actual page.
    form_data = {
      'form[captcha]': captcha_solution,
      'form[_token]': token,
    }
    response = fetch_page(session, url, form_data=form_data)

  # A page without the captcha image is treated as a correct solution.
  captcha = response.css('img#captcha').xpath('@src').get()

  if not captcha:
    success += 1

    # Encode first: if encoding fails, no file is created and no index is consumed.
    example = captchanet.encode_data(np.asarray(captcha_image), captcha_solution, tokenizer, word_length_max)

    # Randomly assign the example to the training or validation split.
    if np.random.random() < train_size:
      writer_path = train_data_dir / f"{train_file_index:08d}.tfrecord"
      train_file_index += 1
    else:
      writer_path = val_data_dir / f"{val_file_index:08d}.tfrecord"
      val_file_index += 1

    # Save data to TFRecord; the context manager closes the writer even on error.
    with tf.io.TFRecordWriter(str(writer_path)) as writer:
      writer.write(example.SerializeToString())

  rate = (success / (i+1)) * 100
  tqdm.write(f"{rate:.1f} ({success}/{i+1})")

HBox(children=(IntProgress(value=0, max=15000), HTML(value='')))

0.0 (0/1)
50.0 (1/2)
33.3 (1/3)
25.0 (1/4)
20.0 (1/5)
16.7 (1/6)
14.3 (1/7)
12.5 (1/8)


KeyboardInterrupt: 