## Import libraries

In [1]:
import os
import json 
import pandas as pd
import numpy as np
from PIL import Image

## Install requirements

In [2]:
!pip install google-cloud-bigquery[bqstorage,pandas]==3.10.0
!pip install google-cloud-storage>=2.0.0
!pip install pandas<2.1.4,>=1.5.0
!pip install numpy<3.0,>=1.20
!pip install pydantic~=1.10.0
!pip install albumentations==1.4.10
!pip install scipy<1.14,>=1.4.1
!pip install -q paddleocr paddlepaddle
!pip install -q vietocr
!pip install -q easyocr
!pip install einops 
!pip install --quiet scikit-learn==1.2.2

  pid, fd = os.forkpty()


Collecting google-cloud-bigquery==3.10.0 (from google-cloud-bigquery[bqstorage,pandas]==3.10.0)
  Downloading google_cloud_bigquery-3.10.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting google-cloud-bigquery-storage<3.0.0dev,>=2.6.0 (from google-cloud-bigquery[bqstorage,pandas]==3.10.0)
  Downloading google_cloud_bigquery_storage-2.27.0-py2.py3-none-any.whl.metadata (5.6 kB)
Downloading google_cloud_bigquery-3.10.0-py2.py3-none-any.whl (218 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.4/218.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_cloud_bigquery_storage-2.27.0-py2.py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-bigquery-storage, google-cloud-bigquery
  Attempting uninstall: google-cloud-bigquery
    Found existing installation: google-cloud-bigquery 3.25.0
    Uninstall

In [6]:
from paddleocr import PaddleOCR
import easyocr

## OCR code

In [7]:
def read_json(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
        data_list = [{**value} for key, value in data.items()]
        
        df = pd.DataFrame(data_list)
        
    return df

In [9]:
class OCRProcessor:
    def __init__(self):
        # Initializes the OCRProcessor class with PaddleOCR and EasyOCR.
        # PaddleOCR is set to use GPU, angle classification, and support for Vietnamese language.
        # EasyOCR is initialized to handle English and Vietnamese languages.
        self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='vi', use_gpu=True)
        self.reader = easyocr.Reader(['en', 'vi'])

    def process_images(self, directory, output_csv_file_path="/kaggle/working/ocr_text.csv"):
        """
        Processes images in the specified directory to extract text using OCR tools.
        The extracted text is saved to a CSV file.
        
        Args:
            directory (str): Path to the directory containing the images.
            output_csv_file_path (str): Path where the resulting CSV file will be saved.
        
        Returns:
            pd.DataFrame: A DataFrame containing image names and the corresponding OCR-extracted combined text.
        """
        df = pd.DataFrame(columns=["image_name", "combined_text"])  # Initialize an empty DataFrame for results
        total_images = len(os.listdir(directory))  # Count total images in the directory

        # Iterate through each file in the directory
        for i, filename in enumerate(os.listdir(directory), 1):
            image_path = os.path.join(directory, filename)

            # Skip files that are not images
            if not os.path.isfile(image_path) or not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                continue

            print(f"Processing image {i}/{total_images} ({filename})", end='\r')

            combined_text = None  # Placeholder for combined OCR text of the current image
            try:
                # Perform OCR using PaddleOCR to detect text boxes and text content
                ocr_results = self.paddle_ocr.ocr(image_path, cls=True)
                recognized_texts = []  # List to store text from cropped regions
                boxes = []  # List to store bounding boxes for detected text regions

                image = Image.open(image_path)  # Load the image using PIL

                # Parse the OCR results to extract bounding boxes and text
                for line in ocr_results[0]:
                    if len(line) == 2:
                        bbox, text = line  # For results without confidence values
                        confidence = None
                    elif len(line) == 3:
                        bbox, text, confidence = line  # For results with confidence values
                    else:
                        continue

                    # Calculate padding around the detected text box
                    padding_ratio_y = 0.25  # Percentage of padding added to height
                    padding_ratio_x = 0.015  # Percentage of padding added to width
                    width = bbox[2][0] - bbox[0][0]  # Width of the bounding box
                    height = bbox[2][1] - bbox[0][1]  # Height of the bounding box

                    # Adjust the bounding box with calculated padding
                    padding_width = int(width * padding_ratio_x)
                    padding_height = int(height * padding_ratio_y)

                    x_min = max(0, bbox[0][0] - padding_width)
                    y_min = max(0, bbox[0][1] - padding_height)
                    x_max = bbox[2][0] + padding_width
                    y_max = bbox[2][1] + padding_height

                    boxes.append([x_min, y_min, x_max, y_max])  # Append adjusted bounding box to the list

                merged_boxes = boxes  # Retain the same boxes without additional merging for this implementation

                recognized_texts = []  # Reset list to store text from individual boxes

                # Use EasyOCR to extract text from cropped regions based on merged bounding boxes
                for idx, box in enumerate(merged_boxes):
                    x_min, y_min, x_max, y_max = map(int, box)
                    cropped_region = image.crop((x_min, y_min, x_max, y_max))  # Crop the region from the image
                    cropped_region_np = np.array(cropped_region)  # Convert cropped region to NumPy array
                    recognized_text = self.reader.readtext(cropped_region_np)  # Perform OCR with EasyOCR
                    recognized_texts.append(' '.join([text for (_, text, _) in recognized_text]))  # Collect recognized text

                combined_text = "\n".join(recognized_texts)  # Combine texts from all regions into one string

            except Exception as ocr_error:
                # Handle errors during OCR processing for an image
                print(f"\nError extracting OCR text for image {filename}: {str(ocr_error)}")
                combined_text = ""  # Set an empty string if an error occurs

            # Create a new row with the image name and extracted text, then append it to the DataFrame
            new_row = pd.DataFrame({"image_name": [filename], "combined_text": [combined_text]})
            df = pd.concat([df, new_row], ignore_index=True)

        # Save the results to a CSV file
        df.to_csv(output_csv_file_path, index=False, encoding="utf-8")
        return df  # Return the DataFrame containing OCR results

In [10]:
ocr_processor = OCRProcessor()

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:17<00:00, 220.98it/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar to /root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/latin_PP-OCRv3_rec_infer.tar


100%|██████████| 9930/9930 [00:19<00:00, 500.15it/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:08<00:00, 264.66it/s]

[2025/01/19 15:41:03] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25,




In [11]:
ocr_processor.process_images('/kaggle/input/vimmsd-dataset/private-test-images/test-images', '/kaggle/working/private_test_ocr.csv')

[2025/01/19 15:41:08] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.17268085479736328
[2025/01/19 15:41:08] ppocr DEBUG: cls num  : 7, elapsed : 0.05406332015991211
[2025/01/19 15:41:08] ppocr DEBUG: rec_res num  : 7, elapsed : 0.28561925888061523
[2025/01/19 15:41:09] ppocr DEBUG: dt_boxes num : 12, elapsed : 0.1659996509552002
[2025/01/19 15:41:09] ppocr DEBUG: cls num  : 12, elapsed : 0.020054101943969727
[2025/01/19 15:41:10] ppocr DEBUG: rec_res num  : 12, elapsed : 0.47991299629211426
[2025/01/19 15:41:10] ppocr DEBUG: dt_boxes num : 13, elapsed : 0.17227864265441895
[2025/01/19 15:41:10] ppocr DEBUG: cls num  : 13, elapsed : 0.02773141860961914
[2025/01/19 15:41:11] ppocr DEBUG: rec_res num  : 13, elapsed : 0.2548859119415283
[2025/01/19 15:41:11] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.04970288276672363
[2025/01/19 15:41:11] ppocr DEBUG: cls num  : 8, elapsed : 0.035797834396362305
[2025/01/19 15:41:11] ppocr DEBUG: rec_res num  : 8, elapsed : 0.15406203269958496
[2025/01/19 15

Unnamed: 0,image_name,combined_text
0,3cf39586f606b05725d746603605d891c3a6d334116d55...,Bác sĩ: Vén á0 lên tôi tiêm cái nào\n'muhangxo...
1,13a5019ff17a2ab0c70b1712ab5b8a2e61356e1be2d9aa...,if\nSenior Member\n30 minutes ago\n#5\nNằm ở b...
2,8012aeb49536661782e4133c27b71f7e97a12ae24f5afe...,ĐlỀN TỪ vÀO CHÔ TRÔNG\nMUOT\nTkoi\nAPP MYYIB T...
3,c9a4ddaa4db1c5611050b3ea50523a10d33666a630686e...,chu hang xom\nMe\nCon\nBa\nYu Nèee\nbãi cỏ x ô...
4,ad007ec9f2f4794e27fd85f819e2f0bfd5610c06952739...,
...,...,...
1499,56025b2c8c6942300367dd3c50f8a951c1ad384c813fb7...,
1500,72d656aeec63be8692b97d8c4db472834dcdbecd1c9eeb...,Phuong Troi Xa\nNam mô a di Đà Phật\n78\n2h\nL...
1501,3cbede4d070312e7be8b4d934c9ca9af6c3f060d8e9504...,LANTOA
1502,2070fad4e6cfcac95fbceb22fb6f9388599514190789f3...,Thèng Huy\ntrollx\nAnh Huy\nÔng Huy


In [12]:
ocr_processor.process_images('/kaggle/input/vimmsd-dataset/public-test-images/dev-images', '/kaggle/working/public_test_ocr.csv')

[2025/01/19 15:54:53] ppocr DEBUG: dt_boxes num : 11, elapsed : 0.22444748878479004
[2025/01/19 15:54:53] ppocr DEBUG: cls num  : 11, elapsed : 0.031568050384521484
[2025/01/19 15:54:53] ppocr DEBUG: rec_res num  : 11, elapsed : 0.1654679775238037
[2025/01/19 15:54:54] ppocr DEBUG: dt_boxes num : 11, elapsed : 0.1561732292175293
[2025/01/19 15:54:54] ppocr DEBUG: cls num  : 11, elapsed : 0.026590585708618164
[2025/01/19 15:54:54] ppocr DEBUG: rec_res num  : 11, elapsed : 0.25307202339172363
[2025/01/19 15:54:54] ppocr DEBUG: dt_boxes num : 25, elapsed : 0.18297696113586426
[2025/01/19 15:54:54] ppocr DEBUG: cls num  : 25, elapsed : 0.07740235328674316
[2025/01/19 15:54:55] ppocr DEBUG: rec_res num  : 25, elapsed : 0.8742985725402832
[2025/01/19 15:54:56] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.16359925270080566
[2025/01/19 15:54:56] ppocr DEBUG: cls num  : 6, elapsed : 0.017157316207885742
[2025/01/19 15:54:56] ppocr DEBUG: rec_res num  : 6, elapsed : 0.13149523735046387
[2025/01/19

Unnamed: 0,image_name,combined_text
0,2d06d8c77c741d001916199346cc112847e6bcf61b3dce...,ĐÓM CON MÀY\nTHÌ NHỚ\nDA!\nCÚP CÁI\nRA NGOÀI Đ...
1,c981f23fc77cebd06ea872ea2c0ff6ec43a9d2517366ed...,"DUNG GALAXY FOLD3 56 HỌC BÀI\nĐÃ LẮM, ĐA NHIỆM..."
2,342c9a8f91adeacde0f2c26dee3e6b86861b43e948d10b...,Cuộc bâu cửcó ảnh hưởng thế nào?\nCác cuộc bầu...
3,2aa95c65c0a6444caff0657ed21e27fbc403af1727749a...,"Lục Mẫn Gia\n29 thg 12, 2020\nMuốn sang thì bắ..."
4,9d6ebb26087b8d6051f77ef7cbf3e9a0d750baa41b45d7...,"Với một bình nhiên liệu đây 1OOL, khi chạy ở t..."
...,...,...
1408,a2f9c7a5d3106637ea1ccec91d1e30fbc08e3a99c15b89...,UÍcH cỰ\nIPHÒNG cH\nBỌ GẬ
1409,6fc0ea8c729f6158ad1faf4ea3c7dc083b7e3826bd458d...,Lổi cúa em\n5O0k nhé\n450 thôi anh chốt nhanh\...
1410,d9c5f3253447e2f6e6e8660b4c462bebd7b9b232dcbf97...,
1411,244034c260154f60f14b7adb16c292c1b9127904d18262...,~~quannha~\nước đc reppp\n4 giờ\nTrả lời\n31\n...


In [13]:
ocr_processor.process_images('/kaggle/input/vimmsd-dataset/training-images/train-images', '/kaggle/working/train_ocr.csv')

[2025/01/19 16:07:50] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.052596092224121094
[2025/01/19 16:07:50] ppocr DEBUG: cls num  : 10, elapsed : 0.029742717742919922
[2025/01/19 16:07:50] ppocr DEBUG: rec_res num  : 10, elapsed : 0.2560582160949707
[2025/01/19 16:07:50] ppocr DEBUG: dt_boxes num : 0, elapsed : 0.1544628143310547
[2025/01/19 16:07:50] ppocr DEBUG: cls num  : 0, elapsed : 0
[2025/01/19 16:07:50] ppocr DEBUG: rec_res num  : 0, elapsed : 1.9073486328125e-06

Error extracting OCR text for image 733ef474a9c5a33c867728388aaf3f7247c90b34a86dec7ebb4bede260275871.jpg: 'NoneType' object is not iterable
[2025/01/19 16:07:50] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.04977607727050781
[2025/01/19 16:07:50] ppocr DEBUG: cls num  : 8, elapsed : 0.05086517333984375
[2025/01/19 16:07:51] ppocr DEBUG: rec_res num  : 8, elapsed : 0.26622462272644043
[2025/01/19 16:07:51] ppocr DEBUG: dt_boxes num : 6, elapsed : 0.050531625747680664
[2025/01/19 16:07:51] ppocr DEBUG: cls num  : 6, elapse

Unnamed: 0,image_name,combined_text
0,98c78da23f651d72af4ab7100d0f826fde43704c8273e7...,\nE\n1\nDORTMUND ĐÃ CÓ MÙA GIẢI RẤT HAY\nNHUNG...
1,733ef474a9c5a33c867728388aaf3f7247c90b34a86dec...,
2,9f32b0eee517243bf80a0b1f9c9e52a44af9589135360a...,NẾU BUÔC 1 SƠI DÂY THÙNG DÀI DƯÓI ĐUÔI\nCỦA CH...
3,c83c7c3f5936299e3d9f20952042e6c4454d8105b54683...,stercard\nPHÚT 87: REAL 0-1 BAYERN\nPHÚT 90+1:...
4,e32b566d3f41372526cc59d3a47ecbe4d72a38817416eb...,
...,...,...
10800,c1c1b4c07aa9224ddc1c949fac863523355d7545184302...,TOP NHÂN BIẾT NGƯỜl ĐÀN ÔNG\nTRƯỞNG THÀNH\n*TOPI:
10801,2b6b43705482d258ced6ff68ffc0deeafddd3a4a605a7f...,NŨ ĐlÊU DUÕNG BỊ ĐlÊU TRA VÌ\nQU.ẤY RỐ1 JIN (B...
10802,3d552d33ee924fd7e66027f79c843ca460d97d7f8386f5...,ĐAI DIÊN CCV ĐÃ LÊN TIÊNG XIN LÔl VÀ\nTẶNG 2 V...
10803,8dea7a70d36a0bb00a558d87c76dbe46c192bbc253bd92...,NÊU ĐUƠC XEM TRUC TIÊP PHA BÓNG NÀY\n\nTHÌXINT...
