In [1]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")

In [4]:
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

### Dataset building

In [5]:
#Dataset building
f = open("values_captchat.txt", "r")
f_test = open("values_captchat_test.txt", "r")
class CustomImageDataset(Dataset):
    def __init__(self, file_for_labels, img_dir, processor, max_length = 14):
        self.img_labels = [label for label in file_for_labels]
        self.img_dir = img_dir
        self.max_length = max_length
        self.processor = processor
    
    def __len__(self):
        return len(self.img_labels)
    
    def __getitem__(self, idx):
        #transform to integer all character
        characters_labels = self.img_labels[idx]
        labels = self.processor.tokenizer(characters_labels, padding="max_length", max_length=self.max_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]
        img = Image.open(f"{self.img_dir}/captcha_image_{idx}.png").convert("RGB")
        pixel_values = self.processor(img, return_tensors="pt").pixel_values
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

dataset = CustomImageDataset(f, "images_from_sous_pref", processor, 14)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

dataset_test = CustomImageDataset(f_test, "images_from_sous_pref_test", processor, 14)
dataloader_test = DataLoader(dataset_test, batch_size=4, shuffle=True)
f.close()
f_test.close()

### Config model

In [6]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 14
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [7]:
from evaluate import load
cer_metric = load("cer")

Using the latest cached version of the module from C:\Users\kouas\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--cer\9cb90b752d5f15fb41161efdbefd13570adb3f32fa157290d8a55093c47428e1 (last modified on Fri Jan  3 22:36:09 2025) since it couldn't be found locally at evaluate-metric--cer, or remotely on the Hugging Face Hub.


In [8]:
def compute_cer(pred_ids, label_ids):
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return cer

In [9]:
model = model.to(device)

In [10]:
from transformers import AdamW
from tqdm.notebook import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(8):  # loop over the dataset multiple times
   # train
   model.train()
   train_loss = 0.0
   for batch in tqdm(dataloader):
      # get the inputs
      for k,v in batch.items():
        batch[k] = v.to(device)

      # forward + backward + optimize
      outputs = model(**batch)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      train_loss += loss.item()

   print(f"Loss after epoch {epoch}:", train_loss/len(dataloader))
  
   # evaluate
   model.eval()
   valid_cer = 0.0
   with torch.no_grad():
     for batch in tqdm(dataloader_test):
       # run batch generation
       outputs = model.generate(batch["pixel_values"].to(device))
       # compute metrics
       cer = compute_cer(pred_ids=outputs, label_ids=batch["labels"])
       valid_cer += cer 

   print("Validation CER:", valid_cer / len(dataloader_test))

model.save_pretrained(".")



  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 0: 2.0246742797803274


  0%|          | 0/3 [00:00<?, ?it/s]



Validation CER: 0.36115246760408054


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 1: 1.2664605154644084


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.21147421931735658


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 2: 0.9095523945892914


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.2692031982354563


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 3: 0.805818394015107


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.24107142857142852


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 4: 0.7872593256680271


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.18768561187916025


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 5: 0.5949077980380647


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.25900383141762456


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 6: 0.7695975538484657


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.19534050179211468


  0%|          | 0/158 [00:00<?, ?it/s]

Loss after epoch 7: 0.31828259033021294


  0%|          | 0/3 [00:00<?, ?it/s]

Validation CER: 0.11123470522803114




In [21]:
model.save_pretrained("./model_save/model_trocr_finetuned_v0")

In [22]:
#attempt to load the model
model_v0 = VisionEncoderDecoderModel.from_pretrained("./model_save/model_trocr_finetuned_v0")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

In [26]:
image = Image.open("images_from_sous_pref/captcha_image_404.png").convert("RGB")
pixel_values = processor(image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values.to(device))
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Predicted text:", generated_text)

Predicted text: R4666S49U



In [2]:
import base64
print(base64.b64encode(b"kajo225@gmx.com").decode())  # Replace with your email
print(base64.b64encode(b"2IED5CEXNIHMJJ7IJB2Z").decode())  # Replace with your password


a2FqbzIyNUBnbXguY29t
MklFRDVDRVhOSUhNSko3SUpCMlo=


In [15]:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

# Ethereal credentials (replace with your own details)
smtp_server = 'smtp.ethereal.email'
smtp_port = 587
smtp_user = 'herman.ward@ethereal.email'
smtp_password = 'gCTQ2K4nEWE8dbprht'

# Email content
sender_email = 'herman.ward@ethereal.email'
receiver_email = 'akpossan.kouassi@centrale-casablanca.ma'
subject = 'Test Email'
body = 'This is a test email sent via Ethereal server using Python.'

# Create the email message
message = MIMEMultipart()
message['From'] = sender_email
message['To'] = receiver_email
message['Subject'] = subject

# Attach the body to the email
message.attach(MIMEText(body, 'plain'))

# Send the email using the SMTP server
try:
    server = smtplib.SMTP(smtp_server, smtp_port)
    server.starttls()  # Secure connection
    server.login(smtp_user, smtp_password)
    server.sendmail(sender_email, receiver_email, message.as_string())
    print('Email sent successfully!')
except Exception as e:
    print(f'Error sending email: {e}')
finally:
    server.quit()


Email sent successfully!


In [5]:
import base64
#5CMUOU2ZQWDWX45RP64C
username = "kajo225@gmx.com"
password = "2IED5CEXNIHMJJ7IJB2Z"  # Replace with your app-specific password
auth_plain = f"\0{username}\0{password}".encode()
print("AUTH PLAIN:", base64.b64encode(auth_plain).decode())
print("AUTH LOGIN username:", base64.b64encode(username.encode()).decode())
print("AUTH LOGIN password:", base64.b64encode(password.encode()).decode())


AUTH PLAIN: AGtham8yMjVAZ214LmNvbQAySUVENUNFWE5JSE1KSjdJSkIyWg==
AUTH LOGIN username: a2FqbzIyNUBnbXguY29t
AUTH LOGIN password: MklFRDVDRVhOSUhNSko3SUpCMlo=
