In [1]:
import re
import os
import json
from io import BytesIO
from dotenv import load_dotenv

import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
from PIL import Image
from minio import Minio
from paddleocr import PaddleOCR

In [2]:
# Load settings via python-dotenv before any os.environ lookups; the S3_*
# values read when the MinIO client is built are presumably defined there
# (TODO confirm against the project's .env file).
load_dotenv()

True

In [3]:
# One OCR engine instance, created once and shared by the cells that follow.
ocr = PaddleOCR(
    lang="korean",
    max_text_length=50,
    show_log=False,
)

In [4]:
# Object-store client used to download the review photos.
# The endpoint is mandatory: without this check a missing S3_ENDPOINTS value
# would be passed to Minio as None and fail with an error that does not name
# the missing setting.
s3_endpoint = os.environ.get('S3_ENDPOINTS')
if not s3_endpoint:
    raise RuntimeError(
        "Environment variable 'S3_ENDPOINTS' is not set; "
        "define it (e.g. in .env) before creating the MinIO client."
    )

client = Minio(
    s3_endpoint,
    # Credentials stay optional lookups, exactly as before.
    access_key=os.environ.get('S3_ACCESS_KEY'),
    secret_key=os.environ.get('S3_SECRET_KEY'),
    # NOTE(review): secure=False presumably means a non-TLS connection —
    # confirm this is intended for the target endpoint.
    secure=False,
)

In [5]:
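# NOTE(review): disabled alternative OCR reader, kept for reference only.
# `Reader` is not imported in the cells above (looks like EasyOCR's Reader — confirm).
# reader = Reader(['ko', 'en'], gpu=True)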

In [6]:
# Collect the input JSON files to process. The listing shown in the next cell
# is not in sorted order, so do not rely on any particular ordering here.
file_paths = glob('./data/semi-structured/*.json')

In [7]:
# Display the matched paths as a quick check of the glob result.
file_paths

['./data/semi-structured/홍대 회식 맛집.json',
 './data/semi-structured/홍대 데이트 맛집.json',
 './data/semi-structured/성수 데이트 맛집.json',
 './data/semi-structured/압구정 회식 맛집.json',
 './data/semi-structured/강남역 데이트 맛집.json',
 './data/semi-structured/영등포 회식 맛집.json',
 './data/semi-structured/성수 회식 맛집.json',
 './data/semi-structured/압구정 데이트 맛집.json',
 './data/semi-structured/강남역 회식 맛집.json',
 './data/semi-structured/범계 회식 맛집.json',
 './data/semi-structured/범계 데이트 맛집.json',
 './data/semi-structured/영등포 데이트 맛집.json']

In [8]:
# Bucket that holds the downloaded review photos.
bucket_name = 'jcwee.study'

# Object key of one photo inside the bucket; filled in with str.format()
# using the keyword fields `target`, `article_id` and `no`.
path_template = (
    'skku/capstone/fake_review_detect/data/photo/{target}/{article_id}/{no}.png'
)

In [15]:
def _decode_image(raw_bytes):
    """Decode downloaded image bytes into a colour image array, or return None.

    Returns None for an empty payload or when OpenCV cannot decode the bytes,
    so the caller can skip the image instead of handing None to the OCR engine.
    """
    buf = np.frombuffer(raw_bytes, np.uint8)
    if buf.size == 0:
        return None
    return cv2.imdecode(buf, cv2.IMREAD_COLOR)


def _join_ocr_text(ocr_result):
    """Flatten one OCR result into a single space-separated string.

    The first element of the result is treated as the list of detected lines
    for the image, and ``line[1][0]`` of each line as its recognised text
    (layout per the PaddleOCR result format — confirm for the installed
    version). A missing or empty result yields ''.
    """
    if not ocr_result or not ocr_result[0]:
        return ''
    return ' '.join(line[1][0] for line in ocr_result[0] if line)


# Create the output directory up front so that a missing folder cannot discard
# a long OCR pass at the moment the results are written.
output_dir = './data/semi_struct_ocr'
os.makedirs(output_dir, exist_ok=True)

for path in file_paths:
    with open(path, 'r', encoding='utf-8') as f:
        blog = json.load(f)

    # basename() also copes with OS-specific separators in glob results.
    file_nm = os.path.basename(path)
    # Folder name used in the object key: file name up to the first '.',
    # with spaces removed.
    target = file_nm.split('.')[0].replace(' ', '')
    fail = 0  # images that could not be downloaded or decoded

    for article in tqdm(blog, desc=target):
        article_id = article['id']

        for p in article['contents']:
            # Only 'img' and 'gif' entries are looked up in the bucket.
            if p['info'] not in ('img', 'gif'):
                continue

            res = None
            try:
                res = client.get_object(
                    bucket_name,
                    path_template.format(
                        target=target,
                        article_id=article_id,
                        no=p['no'],
                    ),
                )
                # Read the body inside the try so a failed transfer is counted
                # as a failure instead of aborting the whole run.
                raw_bytes = res.data
            except Exception:
                # Best effort: count the failure and leave this entry without
                # an 'ocr_text' key.
                fail += 1
                continue
            finally:
                # Always hand the HTTP connection back, on success or failure.
                if res is not None:
                    res.close()
                    res.release_conn()

            # `img` and `text` stay top-level names because later scratch
            # cells in this notebook reference them.
            img = _decode_image(raw_bytes)
            if img is None:
                fail += 1
                continue

            text = _join_ocr_text(ocr.ocr(img, cls=False))
            p['ocr_text'] = text

    print(target, 'fail img count:', fail)
    with open(f'{output_dir}/{file_nm}', 'w', encoding='utf-8') as f:
        json.dump(blog, f, ensure_ascii=False)









홍대회식맛집: 100%|██████████| 888/888 [35:04<00:00,  2.37s/it]


홍대회식맛집 fail img count: 61














홍대데이트맛집: 100%|██████████| 855/855 [35:07<00:00,  2.47s/it]


홍대데이트맛집 fail img count: 80






성수데이트맛집: 100%|██████████| 985/985 [35:37<00:00,  2.17s/it]


성수데이트맛집 fail img count: 51




















압구정회식맛집: 100%|██████████| 914/914 [35:00<00:00,  2.30s/it]


압구정회식맛집 fail img count: 31
















강남역데이트맛집: 100%|██████████| 858/858 [39:00<00:00,  2.73s/it]


강남역데이트맛집 fail img count: 21












영등포회식맛집: 100%|██████████| 932/932 [31:05<00:00,  2.00s/it]


영등포회식맛집 fail img count: 71










성수회식맛집: 100%|██████████| 981/981 [36:34<00:00,  2.24s/it]


성수회식맛집 fail img count: 49














압구정데이트맛집: 100%|██████████| 945/945 [40:00<00:00,  2.54s/it]


압구정데이트맛집 fail img count: 39
























강남역회식맛집: 100%|██████████| 950/950 [39:17<00:00,  2.48s/it]


강남역회식맛집 fail img count: 91








범계회식맛집: 100%|██████████| 945/945 [33:27<00:00,  2.12s/it]


범계회식맛집 fail img count: 40


범계데이트맛집: 100%|██████████| 902/902 [31:29<00:00,  2.10s/it]


범계데이트맛집 fail img count: 23








영등포데이트맛집: 100%|██████████| 927/927 [34:20<00:00,  2.22s/it]

영등포데이트맛집 fail img count: 22





In [None]:
# Scratch cell (it has no execution count): re-runs OCR on whichever image the
# loop cell last left in `img` and shows the raw result.
ocr.ocr(img, cls=False)

In [14]:
# NOTE(review): leftover debugging probe. Its execution count (14) precedes the
# OCR loop cell (15), so the recorded `False` reflects an earlier kernel state.
# By the end of the loop cell `text` holds a plain string, so on a
# top-to-bottom run this indexes a str and raises IndexError when it is empty.
bool(text[0])

False