In [50]:
from transformers import AutoTokenizer, VisionEncoderDecoderModel

tokenizer = AutoTokenizer.from_pretrained("kha-white/manga-ocr-base")
model = VisionEncoderDecoderModel.from_pretrained("kha-white/manga-ocr-base")

#Jap OCR

In [51]:
import paddle

print("CUDA:", paddle.device.is_compiled_with_cuda())
print("PaddlePaddle device:", paddle.device.get_device())

#Chi OCR

CUDA: False
PaddlePaddle device: cpu


In [55]:
from PIL import Image
import paddle
import os
from paddleocr import PaddleOCR
import matplotlib.pyplot as plt
import numpy as np

def resize_image(image_path, scale_factor=0.5):
    """
    resize image may help chinese ocr accuracy
    """
    with Image.open(image_path) as img:
        original_size = img.size
        new_size = (int(original_size[0] * scale_factor), int(original_size[1] * scale_factor))
        resized_img = img.resize(new_size, Image.LANCZOS)
        return resized_img

def chi_ocr(image_path):
    image = Image.open(image_path)
    result = ocr.ocr(np.array(image), cls=True)
    res = result[-1] #get last character
    if res is None: # need resize
        image1 = resize_image(image_path)
        image1_np = np.array(image1)
        res = ocr.ocr(image1_np, cls=True)[-1]#get last character
    # extract text info only
    if res is not None:
        for line in res:
            for word_info in line:
                if isinstance(word_info, tuple): 
                    text, confidence = word_info
                    return text
    else:
        return None

img_path = 'testocr/1.jpg'  
folder_path = 'testocr'
ocr = PaddleOCR(use_angle_cls=True, use_gpu=False, show_log=False)  
results1 = []
for i in os.listdir(folder_path):
    if i.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        results1.append(chi_ocr(os.path.join(folder_path,i)))
print(results1)


['敢', None, '豫', '八', '日', '十', '何', None, '何']


In [54]:
import os
from PIL import Image
import torch
import torchvision.transforms as transforms
from torchvision.transforms import Resize, ToTensor
from tqdm import tqdm
import openpyxl

def jap_ocr(image_path):
    image = Image.open(image_path)
    resize = Resize((224, 224))
    image = resize(image)
    image_tensor = transform(image)
    image_tensor = image_tensor.unsqueeze(0)
    
    generated_ids = model.generate(image_tensor)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text
    
dirpath = "testocr2/"
results2 = []
transform = transforms.Compose([
            transforms.ToTensor(),  # Convert PIL Image to tensor
            # Add any additional transformations as needed
        ])
for i in os.listdir(dirpath):
    if i.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        image_path = dirpath + i
        results2.append(jap_ocr(image_path))
print(results2)

['え', 'え', 'か', 'へ 、', 'で', 'に', 'は', 'は', 'へ', 'ち', 'w o r d s']


In [56]:
import pandas as pd
import glob

# Specify the path to your CSV files
path = "../../pilot data/classifypilot"  # Update this to the path of your CSV files
all_files = glob.glob(path + "/*.csv")

# List to hold data from each CSV
li = []

# Loop through all files and read them into a dataframe, then append to the list
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Concatenate all dataframes in the list into one dataframe
frame = pd.concat(li, axis=0, ignore_index=True)

# Optionally, you can save the merged dataframe to a new CSV file
frame.to_csv('merged.csv', index=False)

In [57]:
import pandas as pd

# image_folder_name = 'seg_letter'
image_folder_path = '../../pilot data/data/'# dir/dir/image
type_classifier = "merged.csv"

workbook = openpyxl.Workbook()
sheet = workbook.active
transform = transforms.Compose([
            transforms.ToTensor(),  # Convert PIL Image to tensor
            # Add any additional transformations as needed
        ])
label_map = pd.read_csv(type_classifier, index_col=0).to_dict()['Class']
result = []
for img_dir_name in tqdm(os.listdir(image_folder_path)): 
    for filename in os.listdir(os.path.join(image_folder_path, img_dir_name)): 
        if filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
            image_path = os.path.join(image_folder_path, img_dir_name, filename)
            # 根据label_map中的标签选择OCR函数
            label = label_map.get(filename)
            if label == 'Kana':
                text = jap_ocr(image_path)
            elif label == 'Kanji':
                text = chi_ocr(image_path)
            else:
                text = "No OCR due to unknown label"
            result.append((img_dir_name, filename, text))
    
for (img_dir_name,filename,text) in result:
    
    sheet.append((img_dir_name,filename,text))
        
workbook.save("../pilot_data_ocr.xlsx")
workbook.close()

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:39<00:00, 49.93s/it]


In [59]:
import pandas as pd
df = pd.read_excel('../pilot_data_ocr.xlsx', header=None)
df.columns = ['group', 'filename', 'ocr']
print(df.head())
df.to_excel('../pilot_data_ocr.xlsx', index=False)

   group     filename  ocr
0  group     filename  ocr
1      B    B_1_1.jpg    敢
2      B   B_1_10.jpg    增
3      B  B_1_100.jpg    /
4      B  B_1_101.jpg    國
