In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_teddynote import logging
import yaml

# Load key=value pairs from a local .env file into the process environment
# before any API client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()


class SentenceCorrection:
    """Correct the sentences of every text file in a category folder with an LLM.

    Source files are read from ``<base_path>/<category_id>/origin_txt`` and the
    corrected text is written, under the same file name, to
    ``<base_path>/<category_id>/txt``. A source file whose name already exists
    in the output folder is skipped, so ``run`` can be re-executed to pick up
    only the files that are still missing.
    """

    def __init__(self, config):
        """Build the LLM client and the prompt from a configuration mapping.

        Args:
            config: Mapping with a ``"settings"`` entry that provides
                ``"category_id"`` (sub-folder name) and ``"base_path"``
                (parent folder of all categories).

        Raises:
            KeyError: If a required setting or the ``"prompt"`` key of the
                prompt YAML is missing.
            OSError: If the prompt YAML file cannot be opened.
        """
        self.category_id = config["settings"]["category_id"]
        self.base_path = config["settings"]["base_path"]
        self.folder_path = os.path.join(self.base_path, self.category_id)
        self.llm = ChatOpenAI(temperature=0.1, model_name="gpt-4o-mini-2024-07-18")
        # The prompt path is relative to the current working directory, so the
        # notebook must be started from the folder this path was written for.
        self.prompt_template = self.load_yaml(
            "../prompts/summarization/sentence_correction.yaml"
        )["prompt"]
        self.prompt = PromptTemplate.from_template(self.prompt_template)

    def load_yaml(self, file_path):
        """Parse a UTF-8 YAML file and return its content.

        Args:
            file_path: Path of the YAML file to read.

        Returns:
            Whatever ``yaml.safe_load`` produces for the file.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            return yaml.safe_load(file)

    def save_correction_file(self, txt, answer, file_path):
        """Append corrected text to ``<file_path>/<txt>`` as UTF-8.

        The file is opened in append mode, so calling this twice for the same
        name concatenates both answers; ``run`` avoids that by skipping names
        that already exist.

        Args:
            txt: File name of the output file.
            answer: Corrected text to write.
            file_path: Folder that receives the file (must already exist).
        """
        txt_path = os.path.join(file_path, txt)
        with open(txt_path, "a", encoding="utf-8") as file:
            file.write(answer)

    def run(self):
        """Correct every pending ``.txt`` file of the category and save it.

        Each source file is split into chunks of at most 5000 characters on
        sentence boundaries (``". "``), the chunks are sent to the LLM as one
        batch, and the answers are joined with newlines into the output file.

        Raises:
            FileNotFoundError: If the ``origin_txt`` source folder is missing.
        """
        # Optional LangSmith tracing, currently disabled:
        # logging.langsmith(f"TextCorrection{self.category_id.upper()}")

        chain = self.prompt | self.llm | StrOutputParser()
        origin_txt_path = os.path.join(self.folder_path, "origin_txt")
        edited_txt_path = os.path.join(self.folder_path, "txt")

        # On a first run the output folder may not exist yet; listing it
        # would raise before a single file had been processed.
        os.makedirs(edited_txt_path, exist_ok=True)

        # Match on the real ".txt" extension (a bare "txt" suffix would also
        # accept names such as "notatxt"); sort for a reproducible order.
        txts = sorted(x for x in os.listdir(origin_txt_path) if x.endswith(".txt"))
        already_corrected = {
            x for x in os.listdir(edited_txt_path) if x.endswith(".txt")
        }

        # The splitter has no per-file state, so build it once.
        splitter = CharacterTextSplitter(
            separator=". ", chunk_size=5000, chunk_overlap=0, length_function=len
        )

        for txt in txts:
            if txt in already_corrected:
                print(f"이미 {txt} 가 존재 합니다")
                continue

            # NOTE(review): no encoding is passed, so the platform default is
            # used to read the source file — confirm the sources are encoded
            # accordingly (output files are always written as UTF-8).
            loader = TextLoader(os.path.join(origin_txt_path, txt))
            docs = loader.load()
            split_docs = splitter.split_documents(docs)

            # Hand the prompt the chunk text itself. Formatting the Document
            # object would inject its "page_content='...' metadata={...}"
            # string form into the text the model is asked to correct.
            inputs = [{"docs": chunk.page_content} for chunk in split_docs]
            corrected_texts = chain.batch(inputs)

            final_corrected_text = "\n".join(corrected_texts)

            self.save_correction_file(txt, final_corrected_text, edited_txt_path)
            print(f"제목 '{txt}'에 대한 교정된 파일이 추가되었습니다.")

In [None]:
from utils import load_yaml


# Load the run settings; SentenceCorrection reads settings.category_id and
# settings.base_path from this mapping. The path is relative to the notebook's
# working directory.
config = load_yaml("../config/sentence_correction.yaml")
# Build the corrector (LLM client + prompt), then process every source file
# that does not yet have a corrected counterpart.
sc = SentenceCorrection(config)
sc.run()