In [1]:
!pip install fuzzywuzzy[speedup]


Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.25.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import cv2
import pytesseract
import time
import pandas as pd
from tqdm.notebook import tqdm
import os
import re

# Đường dẫn thư mục chứa ảnh
image_dir = './valid/'  # Thay bằng đường dẫn thư mục chứa ảnh của bạn

# Hàm để tính khoảng cách Levenshtein
def levenshtein_distance(a, b):
    """Calculates the Levenshtein distance between a and b."""
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n, m)) space
        a, b = b, a
        n, m = m, n

    current_row = range(n + 1)  # Keep current and previous row, not full matrix
    for i in range(1, m + 1):
        previous_row, current_row = current_row, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete, change = previous_row[j] + 1, current_row[j - 1] + 1, previous_row[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current_row[j] = min(add, delete, change)

    return current_row[n]

# Hàm để nhận diện biển số xe từ ảnh
def recognize_license_plate(image_path):
    image = cv2.imread(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Preprocessing có thể thay đổi tùy theo dữ liệu
    result = pytesseract.image_to_string(gray, config='--psm 8')  # config '--psm 8' tốt cho nhận diện vùng nhỏ
    # Sử dụng regex để chỉ giữ lại chữ cái và số
    result = re.sub(r'[^A-Za-z0-9]', '', result)
    return result.strip()

results = []
times = []

# Lấy danh sách file ảnh
image_files = [f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]

# Duyệt qua từng ảnh trong thư mục
for image_file in tqdm(image_files):
    image_path = os.path.join(image_dir, image_file)
    # Lấy các ký tự 1, 2, 3, 5, 6, 7, 8, 9 từ tên file làm ground truth
    expected_text = ''.join([os.path.splitext(image_file)[0][i] for i in [0, 1, 2, 4, 5, 6, 7, 8]])

    start_time = time.time()
    detected_text = recognize_license_plate(image_path)
    end_time = time.time()

    processing_time = end_time - start_time
    # Tính Levenshtein Distance
    lev_distance = levenshtein_distance(expected_text, detected_text)
    is_correct = expected_text == detected_text

    # In kết quả nhận diện
    print(f"Image: {image_file}")
    print(f"Expected: {expected_text}")
    print(f"Detected: {detected_text}")
    print(f"Levenshtein Distance: {lev_distance}")
    print(f"Correct: {is_correct}")
    print(f"Processing Time: {processing_time:.4f} seconds\n")

    results.append({
        'image_name': image_file,
        'expected_text': expected_text,
        'detected_text': detected_text,
        'is_correct': is_correct,
        'levenshtein_distance': lev_distance,
        'processing_time': processing_time
    })

# Chuyển kết quả thành DataFrame
results_df = pd.DataFrame(results)

# Tính toán các giá trị trung bình, bao gồm tất cả các khoảng cách Levenshtein
accuracy = results_df['is_correct'].mean()
total_levenshtein_distance = results_df['levenshtein_distance'].sum()
average_levenshtein_distance = total_levenshtein_distance / len(results_df)
average_processing_time = results_df['processing_time'].mean()

print(f'Overall Accuracy: {accuracy * 100:.2f}%')
print(f'Average Levenshtein Distance: {average_levenshtein_distance:.2f}')
print(f'Average Processing Time: {average_processing_time:.4f} seconds')

# Lưu kết quả vào file CSV
results_df.to_csv('Tesseract_VietNam_Result.csv', index=False)


  0%|          | 0/1000 [00:00<?, ?it/s]

Image: 51A_72444_02.jpg
Expected: 51A72444
Detected: 51A72444
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1312 seconds

Image: 92A_06625_28.jpg
Expected: 92A06625
Detected: 92A06625
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1204 seconds

Image: 51A_35059_11.jpg
Expected: 51A35059
Detected: 51A35059
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1203 seconds

Image: 75A_16868_01.jpg
Expected: 75A16868
Detected: 75A16868
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1260 seconds

Image: 98A_17339_14.jpg
Expected: 98A17339
Detected: 98A17339
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1239 seconds

Image: 92A_08922_07.jpg
Expected: 92A08922
Detected: 92A08922
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1236 seconds

Image: 51A_89714_10.jpg
Expected: 51A89714
Detected: 51A89714
Levenshtein Distance: 0
Correct: True
Processing Time: 0.1257 seconds

Image: 30E_64379_30.jpg
Expected: 30E64379
Detected: 30E64379
Levensh