In [None]:
# !pip install transformers
# !pip install langchain
# !pip install python-dotenv
# !pip install pypdf
# !pip install chromadb
# !pip install sentence-transformers
# !pip install openai
# !pip install -qU langchain-openai
# !pip install pymupdf  # provides the `fitz` module imported below

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFaceEndpoint
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.embeddings import (
    HuggingFaceEmbeddings,
    HuggingFaceBgeEmbeddings,
)
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import TextLoader
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceHub
from dotenv import load_dotenv
import os
import fitz
import re

In [None]:
def _red_strikethrough_rects(page):
    """Return the rectangles of red horizontal strokes drawn on ``page``.

    Only drawings whose stroke colour is exactly ``(1, 0, 0)`` are considered.
    A horizontal line segment is turned into a rectangle padded by 1pt above
    and below; a drawn rectangle is kept when it is wider than it is tall and
    less than 3pt high.
    """
    rects = []
    for path in page.get_drawings():
        # Skip anything that is not stroked in pure red (RGB).
        if path.get("color") != (1, 0, 0):
            continue
        for item in path["items"]:
            kind = item[0]
            if kind == "l":  # line segment
                p1, p2 = item[1:]
                if p1.y == p2.y:  # horizontal
                    # Order the x coordinates so a segment drawn right-to-left
                    # still produces a valid (non-inverted) rectangle.
                    left, right = sorted((p1.x, p2.x))
                    rects.append(fitz.Rect(left, p1.y - 1, right, p2.y + 1))
            elif kind == "re":  # rectangle
                rect = item[1]
                # Thin, wide rectangles are treated as strike-through bars.
                if rect.width > rect.height and rect.height < 3:
                    rects.append(rect)
    return rects


def find_text_by_red_strikethrough_status(pdf_path):
    """Split the words of a PDF by whether a red strike-through crosses them.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        A 3-tuple ``(strikethrough_texts, non_strikethrough_texts, full_texts)``:

        * ``strikethrough_texts`` / ``non_strikethrough_texts`` -- lists of the
          ``word[4:6]`` slices of the struck / non-struck word tuples.
        * ``full_texts`` -- text fragments in reading order; each run of struck
          words is wrapped as ``'말소기록(' + words + ')'`` (the marker the
          notebook's prompt describes as a cancelled record).

    Pages without any words are skipped. A struck-through run that reaches the
    end of a page is flushed into ``full_texts`` as well.
    """
    strikethrough_texts = []
    non_strikethrough_texts = []
    full_texts = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as document:
        for page in document:
            words = page.get_text("words")  # words together with their positions
            if not words:
                # Nothing to classify; also avoids indexing words[0] below.
                continue

            strikethrough_lines = _red_strikethrough_rects(page)

            # NOTE(review): per the PyMuPDF docs index 5 of a word tuple is the
            # block number (index 6 is the line number), so newlines are added
            # on block changes -- confirm this is the intended granularity.
            same_line = words[0][5]
            previous_strike = False
            strike_line = ''
            line = ''
            for word in words:
                word_rect = fitz.Rect(word[:4])  # bounding box of the word
                strikethrough_found = any(
                    word_rect.intersects(line_rect)
                    for line_rect in strikethrough_lines
                )
                if not strikethrough_found:
                    non_strikethrough_texts.append(word[4:6])
                    if same_line != word[5]:
                        same_line = word[5]
                        line += '\n'

                    line = line + ' ' + word[4]

                    # Leaving a struck run: emit it before the plain text that
                    # is now accumulating in `line`.
                    if previous_strike:
                        full_texts.append('말소기록(' + strike_line + ')')
                        strike_line = ''
                    previous_strike = False
                else:
                    strikethrough_texts.append(word[4:6])
                    strike_line = strike_line + ' ' + word[4]
                    # Entering a struck run: emit the plain text collected so far.
                    if not previous_strike:
                        full_texts.append(line + '\n')
                        line = ''
                    previous_strike = True

            # Flush a struck run that lasts until the end of the page; without
            # this its words never reach full_texts.
            if strike_line:
                full_texts.append('말소기록(' + strike_line + ')')
            full_texts.append(line)

    return strikethrough_texts, non_strikethrough_texts, full_texts

def re_text(full_texts):
    """Concatenate text fragments after deleting dot pairs from each one.

    A "dot pair" is a period, any amount of whitespace, and another period.
    Matches are removed left to right without overlap, so an odd run of dots
    leaves a single period behind.

    Args:
        full_texts: Iterable of strings.

    Returns:
        The cleaned fragments joined with no separator.
    """
    dot_pair = re.compile(r'\.\s*\.')
    return ''.join(dot_pair.sub('', fragment) for fragment in full_texts)

# Run the extraction on the sample PDF; the path is relative to the working directory.
pdf_path = 'real_data_ex.pdf'
strikethrough_texts, non_strikethrough_texts, full_texts = find_text_by_red_strikethrough_status(pdf_path)
# Join the fragments (with dot pairs removed by re_text) into one string and show it.
texts = re_text(full_texts)
print(texts)

In [None]:
# chunk_size and chunk_overlap are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
)

# NOTE(review): create_documents() appears to run the splitter on each text it
# receives, so the chunks from split_text() are split a second time here --
# confirm this double pass is intended.
split_texts = text_splitter.split_text(texts)
pages = text_splitter.create_documents(split_texts)
# Bare expression: the cell output shows every chunk document.
pages

In [None]:
# Relative directory passed to Chroma as persist_directory.
directory = 'chroma_store'

# NOTE(review): presumably requires OPENAI_API_KEY in the environment; load_dotenv
# is imported at the top of the notebook but no visible cell calls it -- confirm.
embeddings_open = OpenAIEmbeddings(model="text-embedding-3-small")

vector_index = Chroma.from_documents(
    pages, # Documents
    embedding = embeddings_open , # Text embedding model
    persist_directory=directory # persists the vectors to the file system
    )
# NOTE(review): re-running this cell against the same directory likely adds the
# same chunks again -- verify against the count printed below.
vector_index.persist()
# Reads the private `_collection` attribute to report how many entries are stored.
print('count: ', vector_index._collection.count())

In [None]:
# Manually maintained list of (question, answer) tuples that later cells pass
# to the chain under the "chat_history" key.
chat_history = []

In [None]:
# NOTE(review): these imports sit mid-notebook; ConversationalRetrievalChain is
# already imported in the top import cell.
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import ChatOpenAI

# Conversation memory stored under the "chat_history" key, returned as message objects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Korean prompt for the answer step. In English it instructs the model to:
#   - act as an expert who answers from the document and admits when it does not know,
#   - treat text wrapped in "말소기록()" as cancelled records and use it only
#     when cancelled records are explicitly requested,
#   - always answer in Korean.
# {context} and {question} are the template placeholders.
prompt_korean = ChatPromptTemplate.from_template("""
너는 문서를 보고 대답을 하는 전문가야 모르는 답은 모른다고 답을 해줘
문서를 보면 "말소기록()" 괄호로 묶인 텍스트가 있는데 이거는 말소된 기록이란 뜻이야.
내가 말소기록을 요청할 경우에만 말소기록()을 사용하고 나머지 경우는 사용하지 마.
답변은 무조건 한국말로 해줘
<context>
{context}
</context>

Question: {question}
""")

retriever = vector_index.as_retriever(
    search_type="similarity", # similarity search; distance metric depends on the Chroma collection -- TODO confirm
    search_kwargs={
        "k": 5, # Select top k search results
    } 
)

open_llm = ChatOpenAI(
    temperature=0.5,  # creativity (0.0 ~ 2.0)
    max_tokens=2048,  # maximum number of tokens
    model_name="gpt-3.5-turbo",  # model name
)

# NOTE(review): the chain is given `memory`, and invoke() below is also passed
# `chat_history` explicitly; which of the two the chain actually uses should be
# confirmed against the LangChain documentation.
conv_chain = ConversationalRetrievalChain.from_llm(
    open_llm, 
    retriever=retriever,
    combine_docs_chain_kwargs={"prompt": prompt_korean},
    memory=memory
)
# Earlier variant that asked one question per query (disabled; the query
# strings are Korean runtime text and are kept verbatim):
# query_list = ["이 회사의 상호가 뭐지","회사 상호를 영어로 바꾸면 뭐지?", "본점이 어디지",
#                "발행주식의 총수가 뭐지", "발행할 주식의 총수는 뭐지?", "액면가는 얼마지",
#                "말소된 기록은 뭐가 있지", "회사성립연월인은 언제지", "등기번호가 뭐지", "등록번호는 뭐지"
#                ]
# Active variant: a single combined question asking for the company name, its
# English name, head office, issued and authorised share totals, par value,
# incorporation date, registry number and registration number.
query_list = ["회사의 상호, 상호의 영어명, 본점, 발행한 주식의 총수, 발행할 주식의 총수, 액면가, 회사성립연월일, 등기번호, 등록번호에 대해 알려줘"]

# Ask each question, print the raw result dict, and record the
# (question, answer) pair in the manual history list.
for query in query_list:
    result = conv_chain.invoke({"question": query, "chat_history": chat_history})
    print(result)
    chat_history.append((query, result["answer"]))

In [None]:
# Follow-up question (Korean): "Summarise the contents of the class shares."
# The `conv_chain` built in the previous cell is reused; this cell already
# depended on that cell for `retriever`, `prompt_korean` and `memory`, so
# rebuilding an identical LLM and chain here was redundant.
query = "종류주식의 내용에 대해 요약해줘"
result = conv_chain.invoke({"question": query, "chat_history": chat_history})
print(result)
# Record the exchange the same way the first query loop does, so the history
# displayed in the final cell includes this question too.
chat_history.append((query, result["answer"]))

In [None]:
# Show the accumulated (question, answer) tuples as the cell output.
chat_history