# PIPELINE COMPLET DE FINE-TUNING TrOCR — MÊME STRATÉGIE QUE LE MODÈLE FROM SCRATCH

In [None]:
# =========================================================
# TrOCR CAPTCHA PIPELINE — ONE BLOCK (CPU friendly, HF-safe)
# Train/Val split + Augmentation + Phase A (fast) + Phase B (benchmark)
# Metrics: CER + Exact (Phase B only)
# =========================================================

import os, random, shutil
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter

import torch
from torch.utils.data import Dataset

from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator,
    GenerationConfig
)

# -------------------------
# 0) CONFIG
# -------------------------
# Seed the Python, NumPy and PyTorch random generators with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Root folder holding the raw captcha images; train/ and val/ live directly below it.
# NOTE(review): the accented characters in this path look mis-encoded
# (e.g. "UniversitÃ©") — verify that it matches the folder name on disk.
DATA_DIR = r"C:\Users\jbche\OneDrive - UniversitÃ© Paris 1 PanthÃ©on-Sorbonne\MOSEF\projets\webscrapping\projet\data\data_OCR_Captcha-20260117T105614Z-1-001\finetune_russie\data_russie"
TRAIN_DIR = os.path.join(DATA_DIR, "train")
VAL_DIR   = os.path.join(DATA_DIR, "val")

TRAIN_FRAC = 0.9  # share of the images that goes to the training split
MAX_LEN = 6  # captchas are usually 4-6 characters -> speeds up generation
# NOTE(review): MAX_LEN is used both as the tokenizer max_length (with
# truncation=True) and as the generation max_length. If the tokenizer adds
# special tokens or splits a captcha into more than MAX_LEN tokens, labels are
# silently truncated — confirm against the tokenized length of real labels.

MODEL_CKPT = "microsoft/trocr-small-printed"  # pretrained checkpoint to fine-tune
OUT_DIR_A = "./trocr_phaseA"  # Phase A checkpoints
OUT_DIR_B = "./trocr_phaseB"  # Phase B checkpoints
FINAL_DIR = "./trocr_final"   # final model + processor

# -------------------------
# 1) TRAIN / VAL SPLIT (COPY)
# -------------------------
# Create both split folders up front so the emptiness check below never fails.
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(VAL_DIR, exist_ok=True)

# File extensions (compared case-insensitively) that count as captcha images.
image_exts = (".png", ".jpg", ".jpeg")

# Split only once: if either folder already has content, the existing split is reused.
if not os.listdir(TRAIN_DIR) and not os.listdir(VAL_DIR):
    # os.listdir() returns entries in arbitrary order, so sort before the
    # seeded shuffle; otherwise the same SEED can yield a different split
    # on another machine or file system.
    files = sorted(f for f in os.listdir(DATA_DIR) if f.lower().endswith(image_exts))
    random.shuffle(files)
    n_train = int(TRAIN_FRAC * len(files))

    train_files = files[:n_train]
    val_files   = files[n_train:]

    # Copy (not move) so the raw images stay untouched in DATA_DIR.
    for f in train_files:
        shutil.copy(os.path.join(DATA_DIR, f), os.path.join(TRAIN_DIR, f))
    for f in val_files:
        shutil.copy(os.path.join(DATA_DIR, f), os.path.join(VAL_DIR, f))

# Report how many images each split contains.
print("Train:", sum(f.lower().endswith(image_exts) for f in os.listdir(TRAIN_DIR)))
print("Val  :", sum(f.lower().endswith(image_exts) for f in os.listdir(VAL_DIR)))

# -------------------------
# 2) BUILD LABEL DFS (label = filename stem)
# -------------------------
def build_df(folder):
    """Return a DataFrame describing the captcha images found in *folder*.

    Only entries whose name ends with .png, .jpg or .jpeg (case-insensitive)
    are kept. The result has two columns:

    * ``file_name`` - the entry name exactly as listed in the folder;
    * ``text``      - the label: the name without its last extension, lower-cased.
    """
    image_exts = (".png", ".jpg", ".jpeg")

    names = []
    for entry in os.listdir(folder):
        if entry.lower().endswith(image_exts):
            names.append(entry)

    frame = pd.DataFrame(names, columns=["file_name"])
    # Everything before the last dot is the captcha text.
    frame["text"] = frame["file_name"].map(lambda name: name.rpartition(".")[0].lower())
    return frame

# One label table per split: a row per image, label derived from the file name.
train_df = build_df(TRAIN_DIR)
val_df   = build_df(VAL_DIR)

# -------------------------
# 3) AUGMENTATION (captcha-ish)
# -------------------------
def augment_image(img: Image.Image) -> Image.Image:
    """Return a randomly perturbed version of *img*.

    Each perturbation is drawn independently, in this order:

    * brightness factor in [0.8, 1.2]        (probability 0.5)
    * contrast factor in [0.8, 1.3]          (probability 0.5)
    * additive Gaussian noise, sigma = 5     (probability 0.2)
    * Gaussian blur, radius in [0.3, 0.8]    (probability 0.15)
    """
    apply_brightness = random.random() < 0.5
    if apply_brightness:
        factor = random.uniform(0.8, 1.2)
        img = ImageEnhance.Brightness(img).enhance(factor)

    apply_contrast = random.random() < 0.5
    if apply_contrast:
        factor = random.uniform(0.8, 1.3)
        img = ImageEnhance.Contrast(img).enhance(factor)

    apply_noise = random.random() < 0.2
    if apply_noise:
        # Work in float so the noise can go negative, then clamp back to 8-bit.
        pixels = np.asarray(img, dtype=np.float32)
        noisy = pixels + np.random.normal(0, 5, pixels.shape)
        img = Image.fromarray(np.clip(noisy, 0, 255).astype(np.uint8))

    apply_blur = random.random() < 0.15
    if apply_blur:
        radius = random.uniform(0.3, 0.8)
        img = img.filter(ImageFilter.GaussianBlur(radius=radius))

    return img

# -------------------------
# 4) PROCESSOR
# -------------------------
# Load the processor shipped with the base checkpoint; it is used below both to
# turn images into pixel_values and, via its tokenizer, to encode/decode text.
processor = TrOCRProcessor.from_pretrained(MODEL_CKPT)

# -------------------------
# 5) DATASET (robust pixel_values indexing)
# -------------------------
class CaptchaDataset(Dataset):
    """Map-style dataset yielding model inputs for captcha images.

    Args:
        folder: Directory containing the image files.
        df: DataFrame with a ``file_name`` column and a ``text`` column
            holding the label of each image.
        processor: Callable that converts a PIL image into ``pixel_values``
            and exposes a ``tokenizer`` attribute used to encode the labels.
        max_length: Number of token ids every label is padded/truncated to.
        augment: When true, ``augment_image`` is applied to each loaded image.
    """

    def __init__(self, folder, df, processor, max_length=6, augment=False):
        self.folder = folder
        # Re-index to 0..n-1 so the integer idx passed to __getitem__ always
        # resolves through .loc, whatever index the caller's frame had.
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.max_length = max_length
        self.augment = augment

    def __len__(self):
        """Return the number of samples (rows of the label table)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample *idx*.

        Returns:
            dict with ``pixel_values`` (first item of the processor output for
            the image) and ``labels`` (a ``torch.long`` tensor of
            ``max_length`` token ids with padding replaced by -100).
        """
        file_name = self.df.loc[idx, "file_name"]
        text = self.df.loc[idx, "text"]

        img_path = os.path.join(self.folder, file_name)
        # Image.open is lazy and keeps the file open; converting inside the
        # context manager loads the pixels and closes the handle immediately
        # instead of leaving it to the garbage collector.
        with Image.open(img_path) as raw:
            image = raw.convert("RGB")
        if self.augment:
            image = augment_image(image)

        # The processor returns a batch of one; take its only element.
        pixel_values = self.processor(image, return_tensors="pt").pixel_values[0]

        tokenizer = self.processor.tokenizer
        token_ids = tokenizer(
            text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        ).input_ids
        # -100 is the ignore index of PyTorch's cross-entropy loss, so padded
        # positions do not contribute to the training loss.
        pad_id = tokenizer.pad_token_id
        labels = [tok if tok != pad_id else -100 for tok in token_ids]

        return {"pixel_values": pixel_values, "labels": torch.tensor(labels, dtype=torch.long)}

# Augmentation is enabled for the training split only; validation images are used as-is.
train_ds = CaptchaDataset(TRAIN_DIR, train_df, processor, max_length=MAX_LEN, augment=True)
val_ds   = CaptchaDataset(VAL_DIR, val_df, processor, max_length=MAX_LEN, augment=False)

# -------------------------
# 6) METRICS (Phase B only)
# -------------------------
def levenshtein(a: str, b: str) -> int:
    """Return the edit distance between *a* and *b*.

    Insertions, deletions and substitutions each cost 1. Only two rows of the
    dynamic-programming table are kept in memory at any time.
    """
    if not a:
        return len(b)
    if not b:
        return len(a)

    # row[j] = distance between the already consumed prefix of a and b[:j]
    row = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        nxt = [i]
        for j, ch_b in enumerate(b, start=1):
            deletion = row[j] + 1
            insertion = nxt[j - 1] + 1
            substitution = row[j - 1] + (0 if ch_a == ch_b else 1)
            nxt.append(min(deletion, insertion, substitution))
        row = nxt
    return row[-1]

def compute_metrics(eval_pred):
    """Compute exact-match accuracy and character error rate (CER).

    Args:
        eval_pred: Pair ``(predictions, label_ids)`` of integer token-id
            arrays. Label padding is marked with -100.

    Returns:
        dict with ``exact_acc`` (share of predictions equal to their label)
        and ``cer`` (total edit distance divided by the total number of label
        characters, with a floor of 1 on the denominator).
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = processor.tokenizer.pad_token_id

    # Generated sequences can be padded with negative ids (-100); the
    # tokenizer cannot decode those ("can't convert negative int to
    # unsigned"), so map them to the pad token, which skip_special_tokens
    # drops from the decoded text.
    preds = np.where(preds < 0, pad_id, preds)
    pred_str = processor.batch_decode(preds, skip_special_tokens=True)

    # np.where builds a new array, so the caller's labels are left untouched.
    labels = np.where(labels == -100, pad_id, labels)
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    pairs = list(zip(pred_str, label_str))
    exact = float(np.mean([p == l for p, l in pairs]))

    edits = sum(levenshtein(p, l) for p, l in pairs)
    chars = sum(len(l) for _, l in pairs)

    cer = edits / max(1, chars)
    return {"exact_acc": exact, "cer": cer}

# -------------------------
# 7) MODEL + HF-SAFE GENERATION CONFIG
# -------------------------
# Load the pretrained weights of the base checkpoint.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CKPT)

# Take the special-token ids used by the model from the processor's tokenizer,
# so training and decoding agree on start, padding and end tokens.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.eos_token_id = processor.tokenizer.sep_token_id

# Derive the generation settings from the updated model config, then cap the
# generated length at MAX_LEN and disable beam search.
model.generation_config = GenerationConfig.from_model_config(model.config)
model.generation_config.max_length = MAX_LEN
model.generation_config.num_beams = 1  # greedy

# =========================================================
# 8) PHASE A — FAST ADAPT (freeze encoder, loss only, NO generate)
# =========================================================
# Freeze every encoder parameter: during Phase A only the rest of the model is updated.
for p in model.encoder.parameters():
    p.requires_grad = False

# Phase A settings: evaluate, log and checkpoint once per epoch, then reload
# the checkpoint with the lowest validation loss when training ends.
args_A = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR_A,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    num_train_epochs=4,
    warmup_steps=20,

    eval_strategy="epoch",
    save_strategy="epoch",

    predict_with_generate=False,     # evaluation reports the loss only; no text generation
    logging_strategy="epoch",

    fp16=False,
    report_to="none",

    # Lower eval_loss is better, hence greater_is_better=False.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args_A,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=default_data_collator,
    compute_metrics=None,            # no CER / exact-match in this phase
)

print("\n===== PHASE A (freeze encoder, fast) =====")
trainer.train()
print("\n===== VAL RESULTS (Phase A) =====")
print(trainer.evaluate())

# =========================================================
# 9) PHASE B — BENCHMARK (unfreeze, generate ON, CER/Exact)
# =========================================================
# Make the encoder trainable again: Phase B updates the whole model.
for p in model.encoder.parameters():
    p.requires_grad = True

# Phase B settings: lower learning rate than Phase A, generation enabled during
# evaluation so compute_metrics can score decoded text, and best checkpoint
# chosen by CER.
args_B = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR_B,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    num_train_epochs=3,
    warmup_steps=10,

    eval_strategy="epoch",
    save_strategy="epoch",

    predict_with_generate=True,      # evaluation generates token ids for the metrics
    logging_strategy="epoch",

    fp16=False,
    report_to="none",

    # "cer" is the key returned by compute_metrics; lower is better.
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args_B,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,  # exact-match accuracy + CER
)

print("\n===== PHASE B (unfreeze encoder, benchmark) =====")
trainer.train()
print("\n===== FINAL VAL RESULTS (TrOCR) =====")
print(trainer.evaluate())

# Store the model and the processor in the same folder.
trainer.save_model(FINAL_DIR)
processor.save_pretrained(FINAL_DIR)
print(f"\nSaved model+processor to: {FINAL_DIR}")


Train: 527
Val  : 65


Loading weights: 100%|█| 360/360 [00:01<00:00, 188.99it/s, Materializing param=encoder
VisionEncoderDecoderModel LOAD REPORT from: microsoft/trocr-small-printed
Key                         | Status  | 
----------------------------+---------+-
encoder.pooler.dense.bias   | MISSING | 
encoder.pooler.dense.weight | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



===== PHASE A (freeze encoder, fast) =====


  super().__init__(loader)


Epoch,Training Loss,Validation Loss
1,4.533872,2.683761
2,2.584024,2.363312
3,2.242824,2.15906
4,2.049827,2.101222


Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
  super().__init__(loader)
Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
  super().__init__(loader)
Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.05it/s]
  super().__init__(loader)
Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.26it/s]



===== VAL RESULTS (Phase A) =====


  super().__init__(loader)


{'eval_loss': 2.1012215614318848, 'eval_runtime': 50.4296, 'eval_samples_per_second': 1.289, 'eval_steps_per_second': 0.337, 'epoch': 4.0}

===== PHASE B (unfreeze encoder, benchmark) =====


Epoch,Training Loss,Validation Loss


OverflowError: can't convert negative int to unsigned

In [None]:
# ===============================
# PHASE B — continue finetuning from Phase A
# ===============================

import numpy as np
import torch
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    default_data_collator,
    GenerationConfig
)

# -------- CONFIG --------
BASE_MODEL = "microsoft/trocr-small-printed"  # source of the processor
MODEL_A_DIR = "trocr_phaseA/checkpoint-528"   # your Phase A checkpoint
OUT_DIR_B = "./trocr_phaseB"
MAX_LEN = 6  # cap on the generated sequence length

# -------- LOAD MODEL + PROCESSOR --------
# The processor comes from the base checkpoint; the weights come from the
# Phase A checkpoint directory.
processor = TrOCRProcessor.from_pretrained(BASE_MODEL)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_A_DIR)

# Derive the generation settings from the loaded model config, then cap the
# generated length and disable beam search.
model.generation_config = GenerationConfig.from_model_config(model.config)
model.generation_config.max_length = MAX_LEN
model.generation_config.num_beams = 1

# -------- UNFREEZE ENCODER --------
# Mark every encoder parameter as trainable for this phase.
for p in model.encoder.parameters():
    p.requires_grad = True

# -------- METRICS --------
def levenshtein(a, b):
    """Return the edit distance between the sequences *a* and *b*.

    Insertions, deletions and substitutions each cost 1. A single row of the
    dynamic-programming table is updated in place.
    """
    len_a, len_b = len(a), len(b)
    if len_a == 0:
        return len_b
    if len_b == 0:
        return len_a

    dist = list(range(len_b + 1))
    for i, item_a in enumerate(a, 1):
        diagonal = dist[0]      # value of the previous row at column j - 1
        dist[0] = i
        for j, item_b in enumerate(b, 1):
            above = dist[j]     # value of the previous row at column j
            step = 0 if item_a == item_b else 1
            dist[j] = min(above + 1, dist[j - 1] + 1, diagonal + step)
            diagonal = above
    return dist[len_b]

def compute_metrics(eval_pred):
    """Compute exact-match accuracy and character error rate (CER).

    Args:
        eval_pred: Pair ``(predictions, label_ids)`` of integer token-id
            arrays. Label padding is marked with -100.

    Returns:
        dict with ``exact_acc`` (share of predictions equal to their label,
        as a plain float) and ``cer`` (total edit distance divided by the
        total number of label characters, with a floor of 1 on the
        denominator).
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    tokenizer = processor.tokenizer
    pad_id = tokenizer.pad_token_id

    # Generated sequences can be padded with -100, which the tokenizer cannot
    # decode. Map every id outside [0, vocab_size) to the pad token, which
    # skip_special_tokens removes, instead of clipping: clipping turns padding
    # into token id 0 and overly large ids into the last vocabulary entry.
    out_of_range = (preds < 0) | (preds >= tokenizer.vocab_size)
    preds = np.where(out_of_range, pad_id, preds)
    pred_str = processor.batch_decode(preds, skip_special_tokens=True)

    # np.where builds a new array, so the caller's labels are left untouched.
    labels = np.where(labels == -100, pad_id, labels)
    label_str = processor.batch_decode(labels, skip_special_tokens=True)

    pairs = list(zip(pred_str, label_str))
    exact = float(np.mean([p == l for p, l in pairs]))

    edits = sum(levenshtein(p, l) for p, l in pairs)
    chars = sum(len(l) for _, l in pairs)

    cer = edits / max(1, chars)
    return {"exact_acc": exact, "cer": cer}

# -------- TRAINING ARGS --------
# Phase B settings: evaluate, log and checkpoint once per epoch, generate text
# during evaluation, and reload the checkpoint with the lowest CER at the end.
args_B = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR_B,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    num_train_epochs=3,
    warmup_steps=10,

    eval_strategy="epoch",
    save_strategy="epoch",

    predict_with_generate=True,  # evaluation generates token ids for the metrics
    logging_strategy="epoch",

    fp16=False,
    report_to="none",

    # "cer" is the key returned by compute_metrics; lower is better.
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
)

# train_ds / val_ds are not created in this cell: they must already exist in
# the session (built by the dataset section of the pipeline cell).
trainer = Seq2SeqTrainer(
    model=model,
    args=args_B,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

print("===== PHASE B =====")
trainer.train()

print("===== FINAL RESULTS =====")
print(trainer.evaluate())

# Save the processor next to the model: the trainer was not given the
# processor, so save_model() alone would leave the folder without it.
trainer.save_model("./trocr_final")
processor.save_pretrained("./trocr_final")
print("Model saved in ./trocr_final")


Loading weights: 100%|█| 362/362 [00:01<00:00, 195.90it/s, Materializing param=encoder


===== PHASE B =====


Epoch,Training Loss,Validation Loss,Exact Acc,Cer
1,1.40679,0.995753,0.261538,0.290196
2,0.855281,0.869637,0.4,0.239216
3,0.600352,0.807607,0.446154,0.243137


Writing model shards: 100%|█████████████████████████████| 1/1 [00:01<00:00,  1.44s/it]
  super().__init__(loader)
Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]
  super().__init__(loader)
Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.16it/s]


===== FINAL RESULTS =====


  super().__init__(loader)


{'eval_loss': 0.8696371912956238, 'eval_exact_acc': 0.4, 'eval_cer': 0.23921568627450981, 'eval_runtime': 105.9611, 'eval_samples_per_second': 0.613, 'eval_steps_per_second': 0.16, 'epoch': 3.0}


Writing model shards: 100%|█████████████████████████████| 1/1 [00:00<00:00,  1.23it/s]

Model saved in ./trocr_final



