# upstage ocr 사용


In [213]:
import requests
import os
import json
import pandas as pd
from dotenv import load_dotenv
from utils import load_yaml

load_dotenv()

# Run Upstage OCR on every image of the configured pattern and cache each
# result as <base_path>/<pattern>/<image name>.json.
api_key = os.getenv("UPSTAGE_API_KEY")
config = load_yaml("../config/ocr.yaml")
pattern = config["settings"]["pattern"]
base_path = config["settings"]["base_path"]
data_path = os.path.join(config["settings"]["png_path"], pattern)
# Skip hidden entries (names starting with ".") in the image directory.
file_list = [x for x in os.listdir(data_path) if not x.startswith(".")]

# The endpoint and auth header are identical for every image: build them once.
url = "https://api.upstage.ai/v1/document-ai/ocr"
headers = {"Authorization": f"Bearer {api_key}"}

# Create the output directory up front so a missing directory is not
# discovered only after an API call has already been spent.
os.makedirs(f"{base_path}/{pattern}", exist_ok=True)

for filename in file_list:
    json_filename = f"{base_path}/{pattern}/{filename.split('.png')[0]}.json"
    if os.path.exists(json_filename):
        print(f"{json_filename}이 이미 존재합니다. 건너뜁니다.")
        continue

    # Send the image to Upstage OCR; the context manager closes the file
    # handle as soon as the upload has finished.
    with open(f"{data_path}/{filename}", "rb") as image:
        response = requests.post(url, headers=headers, files={"document": image})

    # Never cache an error body: an existing JSON file makes later runs skip
    # this image, so a failed request must stay retryable.
    if not response.ok:
        print(
            f"OCR request for {filename} failed "
            f"(status {response.status_code}): {response.text}"
        )
        continue

    # Decode before opening the output file so an undecodable response cannot
    # leave an empty cache file behind.
    ocr_result = response.json()
    with open(json_filename, "w") as f:
        json.dump(ocr_result, f)

# OCR 텍스트를 테이블로 변환 using batch api


In [217]:
import yaml
from openai import OpenAI
import time
from dotenv import load_dotenv

load_dotenv()


class ConvertTable:
    """Convert cached OCR JSON files into one CSV table via the OpenAI Batch API.

    Typical use is ``process_batch()`` followed by ``convert_to_table()``:
    the first builds and submits a batch request file and downloads the
    result, the second merges the model answers into a single CSV.
    """

    # Terminal batch statuses after which the job can never become "completed".
    _FAILED_STATUSES = ("failed", "expired", "cancelled")

    def __init__(self, config):
        """Read settings and prompts from ``config`` and create the API client.

        Args:
            config: Mapping with a ``"settings"`` section (``base_path``,
                ``pattern``, ``csv_path``, ``model``) and a ``"prompts"``
                section (``prompt_1`` / ``prompt_2``).
        """
        settings = config["settings"]
        self.base_path = settings["base_path"]
        self.pattern = settings["pattern"]
        self.csv_path = settings["csv_path"]

        # Only pattern_1 / pattern_2 have a prompt. Any other pattern gets
        # None so that create_jsonl can fail with a clear message instead of
        # an AttributeError on a missing attribute.
        prompt_key = {"pattern_1": "prompt_1", "pattern_2": "prompt_2"}.get(
            self.pattern
        )
        self.prompt = config["prompts"][prompt_key] if prompt_key else None

        # Keep only the OCR result files and sort them so that custom_id
        # numbering is reproducible between runs (os.listdir order is
        # arbitrary and may include hidden or unrelated files).
        self.json_list = sorted(
            name
            for name in os.listdir(os.path.join(self.base_path, self.pattern))
            if name.endswith(".json")
        )
        self.model = settings["model"]
        self.client = OpenAI()

    def create_jsonl(self):
        """Write one chat-completion batch request per OCR file.

        The requests go to ``<base_path>/<pattern>.jsonl``, one JSON object
        per line. Each request sends the configured prompt as the system
        message and the OCR ``"text"`` field as the user message.

        Raises:
            ValueError: If no prompt is configured for the current pattern.
        """
        if self.prompt is None:
            raise ValueError(f"no prompt configured for pattern '{self.pattern}'")

        jsonl_path = f"{self.base_path}/{self.pattern}.jsonl"
        with open(jsonl_path, encoding="utf-8", mode="w") as file:
            for i, json_file in enumerate(self.json_list):
                with open(
                    os.path.join(self.base_path, self.pattern, json_file),
                    "r",
                    encoding="utf-8",
                ) as f:
                    text = json.load(f)["text"]

                task = {
                    "custom_id": f"TableCorrection_{self.pattern}_{i}",
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": self.model,
                        "temperature": 0.1,
                        "response_format": {"type": "json_object"},
                        "messages": [
                            {"role": "system", "content": self.prompt},
                            {"role": "user", "content": text},
                        ],
                    },
                }

                # One serialized JSON object per line (JSON Lines).
                file.write(json.dumps(task) + "\n")
        print("jsonl 파일 생성 완료")

    def upload_batch(self):
        """Upload the request file and start a batch job.

        Returns:
            The batch job object returned by ``client.batches.create``.
        """
        # The context manager closes the handle once the upload has finished.
        with open(f"{self.base_path}/{self.pattern}.jsonl", "rb") as jsonl_file:
            batch_file = self.client.files.create(file=jsonl_file, purpose="batch")

        batch_job = self.client.batches.create(
            input_file_id=batch_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
        )
        print("upload batch complete")
        return batch_job

    def save_result(self, result):
        """Write the raw batch output bytes to ``<pattern>_processed.jsonl``."""
        with open(f"{self.base_path}/{self.pattern}_processed.jsonl", "wb") as file:
            file.write(result)
        print("save result complete")

    def process_batch(self, poll_interval=60):
        """Run the whole batch round trip and return the raw result bytes.

        Builds the request file, submits it, polls until the job finishes,
        then downloads and saves the output.

        Args:
            poll_interval: Seconds to wait between two status checks.

        Returns:
            The content of the batch output file as bytes.

        Raises:
            RuntimeError: If the batch ends as failed, expired or cancelled,
                or completes without an output file.
        """
        # Build the request file.
        self.create_jsonl()
        # Upload it and start the batch job.
        batch_job = self.upload_batch()

        while True:
            batch_job = self.client.batches.retrieve(batch_job.id)
            if batch_job.status == "completed":
                print(batch_job.status)
                break
            # Without this check a dead job would be polled forever.
            if batch_job.status in self._FAILED_STATUSES:
                raise RuntimeError(
                    f"batch {batch_job.id} ended with status '{batch_job.status}'"
                )
            time.sleep(poll_interval)

        result_file_id = batch_job.output_file_id
        if result_file_id is None:
            raise RuntimeError(f"batch {batch_job.id} completed without an output file")

        result = self.client.files.content(result_file_id).content
        self.save_result(result)
        print("process batch complete")
        return result

    def load_result(self):
        """Return the lines of the saved ``<pattern>_processed.jsonl`` file."""
        # Read as UTF-8 explicitly, like every other text file in this class,
        # instead of relying on the platform's default encoding.
        with open(
            f"{self.base_path}/{self.pattern}_processed.jsonl",
            "r",
            encoding="utf-8",
        ) as file:
            return file.readlines()

    def convert_korean_date(self, date_str):
        """Convert a Korean year-month string to an ISO date string.

        Args:
            date_str: Text such as ``"2023년 5월"``.

        Returns:
            The first day of that month, e.g. ``"2023-05-01"``.
        """
        year, month = date_str.split("년")
        year = year.strip()
        month = month.strip().replace("월", "")
        return f"{year}-{month.zfill(2)}-01"

    def convert_to_table(self):
        """Merge all batch answers into one date-sorted CSV.

        Each result line carries a JSON answer whose ``"data"`` entry is
        loaded as table rows. The ``"날짜"`` column is converted to datetimes,
        rows are sorted by it and written to ``<csv_path>/<pattern>.csv``.

        Raises:
            ValueError: If the result file contains no records.
        """
        frames = []
        for res in self.load_result():
            # Tolerate blank lines, e.g. trailing newlines in the result file.
            if not res.strip():
                continue
            content = json.loads(res)["response"]["body"]["choices"][0]["message"][
                "content"
            ]
            frames.append(pd.DataFrame(json.loads(content)["data"]))

        if not frames:
            raise ValueError("no batch results to convert")

        # Concatenate once instead of growing a DataFrame inside the loop.
        all_data = pd.concat(frames, ignore_index=True)

        # Turn the Korean year-month strings into real datetimes.
        all_data["날짜"] = all_data["날짜"].apply(self.convert_korean_date)
        all_data["날짜"] = pd.to_datetime(all_data["날짜"])

        # Sort chronologically and renumber the rows.
        all_data = all_data.sort_values("날짜").reset_index(drop=True)

        all_data.to_csv(f"{self.csv_path}/{self.pattern}.csv", index=False)

In [218]:
# Load the OCR / batch configuration. The context manager closes the file
# handle, which the previous inline open() call left to the garbage collector.
with open("../config/ocr.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Run the batch round trip, then merge the answers into a CSV.
table = ConvertTable(config)
table.process_batch()
table.convert_to_table()

jsonl 파일 생성 완료
upload batch complete
