In [4]:
!git clone https://github.com/htrnguyen/compare_ocr_benchmark.git

fatal: destination path 'compare_ocr_benchmark' already exists and is not an empty directory.


In [5]:
!pip install python-Levenshtein jiwer



In [6]:
import sys, os, time
import pandas as pd
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

# Đảm bảo sys.path tới các hàm utils và metrics chung
sys.path.append('/kaggle/working/compare_ocr_benchmark/common')
from metrics import compute_metrics
from utils import save_results

In [7]:
# Đường dẫn data/model
DATA_ROOT = '/kaggle/input/nckh-2425-crops'
CSV_ANN = '/kaggle/input/nckh-2425-crops/crops_gt.csv'
MODEL_PATH = '/kaggle/input/trocr_fineturning/pytorch/default/1/final_model'

# Load model và processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

model = VisionEncoderDecoderModel.from_pretrained(MODEL_PATH).to(device)
processor = TrOCRProcessor.from_pretrained(MODEL_PATH)
model.eval()

Device: cuda


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

In [8]:
def load_and_correct_image(img_path):
    image = Image.open(img_path).convert("RGB")
    try:
        exif = image._getexif()
        orientation_key = 274
        if exif and orientation_key in exif:
            orientation = exif[orientation_key]
            if orientation == 3:
                image = image.rotate(180, expand=True)
            elif orientation == 6:
                image = image.rotate(270, expand=True)
            elif orientation == 8:
                image = image.rotate(90, expand=True)
    except:
        pass
    return image

def trocr_predict(img_path):
    image = load_and_correct_image(img_path)
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)
        prediction = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return prediction

In [9]:
df = pd.read_csv(CSV_ANN)
results = []

for idx, row in df.iterrows():
    fname = row['filename']
    desc_gt = row['description_gt']
    label = row.get('label', '')
    img_path = os.path.join(DATA_ROOT, fname)

    try:
        t1 = time.perf_counter()
        pred = trocr_predict(img_path)
        t2 = time.perf_counter()
        infer_time = round(t2 - t1, 3)
    except Exception as e:
        pred = f"OCR_Error: {e}"
        infer_time = 0.0

    metrics = compute_metrics(desc_gt, pred)

    results.append({
        "filename": fname,
        "label": label,
        "ground_truth": desc_gt,
        "predicted_text": pred,
        "cer": metrics["cer"],
        "wer": metrics["wer"],
        "lev": metrics["lev"],
        "acc": metrics["acc"],
        "time": infer_time
    })
    if idx % 50 == 0:
        print(f"Processed {idx}/{len(df)}")

Processed 0/2284
Processed 50/2284
Processed 100/2284
Processed 150/2284
Processed 200/2284
Processed 250/2284
Processed 300/2284
Processed 350/2284
Processed 400/2284
Processed 450/2284
Processed 500/2284
Processed 550/2284
Processed 600/2284
Processed 650/2284
Processed 700/2284
Processed 750/2284
Processed 800/2284
Processed 850/2284
Processed 900/2284
Processed 950/2284
Processed 1000/2284
Processed 1050/2284
Processed 1100/2284
Processed 1150/2284
Processed 1200/2284
Processed 1250/2284
Processed 1300/2284
Processed 1350/2284
Processed 1400/2284
Processed 1450/2284
Processed 1500/2284
Processed 1550/2284
Processed 1600/2284
Processed 1650/2284
Processed 1700/2284
Processed 1750/2284
Processed 1800/2284
Processed 1850/2284
Processed 1900/2284
Processed 1950/2284
Processed 2000/2284
Processed 2050/2284
Processed 2100/2284
Processed 2150/2284
Processed 2200/2284
Processed 2250/2284


In [10]:
import os
OUT_CSV = '/kaggle/working/compare_ocr_benchmark/results/trocr_pretrain_results.csv'
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)
save_results(results, OUT_CSV)

Lưu thành công: /kaggle/working/compare_ocr_benchmark/results/trocr_pretrain_results.csv
