<a href="https://colab.research.google.com/github/kyoujinkim/NH_CustomKnowledgeBase_GPT/blob/master/CustomKnowledgeBaseGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai
!pip install langchain[llms]
!pip install PyMuPDF
!pip install sentence_transformers
!pip install pip install chromadb
!git clone https://github.com/kyoujinkim/NH_CustomKnowledgeBase_GPT.git

import numpy as np
from glob import glob
from tqdm import tqdm
import json
import random
import os
import sys
sys.path.append('/content/NH_CustomKnowledgeBase_GPT')

import openai
from langchain import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
#https://github.com/BM-K/Sentence-Embedding-is-all-you-need
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.llms import OpenAI
from langchain.docstore.document import Document

from readPDF import PDFReader
from promptTemplate import loadTemplate
from quoteChecker import change_quote_num, print_quote

def setting(apiKey: str,
            pdfFolderPath: str,
            embeddingAi: str = 'BM-K/KoSimCSE-bert-multitask',
            docSeparator: str = '. ',
            docSize: int = 2,
            docOverlap: int = 0,
            ):
  '''
  Prepare the runtime and build (or reload) the Chroma document store.

  Sets the OpenAI API key, mounts Google Drive at /content/gdrive and loads
  the HuggingFace embedding model. If the directory ./ChromaDB/ already
  exists, the store saved there is reopened; otherwise every PDF under
  ``pdfFolderPath`` is read, split into chunks, embedded and stored there.

  NOTE: an existing ./ChromaDB/ directory is reused as-is, so changing
  ``pdfFolderPath`` or the split parameters has no effect until that
  directory is removed.

  :param apiKey: API key issued by OpenAI
  :param pdfFolderPath: location of the source PDF folder (as seen from Colab)
  :param embeddingAi: Hugging Face model name/directory used for embeddings
  :param docSeparator: separator passed to PDFReader.split_text
  :param docSize: split size passed to PDFReader.split_text
                  (described by the author as a number of sentences)
  :param docOverlap: overlap passed to PDFReader.split_text
                     (described by the author as a number of sentences)
  :return: langchain Chroma vector store holding the documents and embeddings
  '''
  persist_dir = "./ChromaDB/"

  os.environ["OPENAI_API_KEY"] = apiKey
  openai.api_key = apiKey

  # Imported here rather than at module level: google.colab is only
  # importable inside a Colab runtime.
  from google.colab import drive
  drive.mount('/content/gdrive')

  embeddings = HuggingFaceEmbeddings(model_name=embeddingAi)

  # Reuse the store from a previous run instead of re-embedding every PDF.
  if os.path.isdir(persist_dir):
    return Chroma(embedding_function=embeddings, persist_directory=persist_dir)

  pr = PDFReader()
  pdftexts = pr.getPDF(datapath=pdfFolderPath)

  docs_split = []
  for doc in tqdm(pdftexts, desc='PDF 세부분할'):
    docs_split.extend(pr.split_text(doc,
                                    separator=docSeparator,
                                    size=docSize,
                                    overlap=docOverlap))

  docsearch = Chroma.from_documents(docs_split, embeddings, persist_directory=persist_dir)

  # Flush the new store to disk explicitly: the next run trusts the mere
  # existence of the directory, so it must contain the complete index.
  # Looked up defensively because not every Chroma wrapper version exposes
  # persist() (newer ones write automatically).
  persist = getattr(docsearch, "persist", None)
  if callable(persist):
    persist()

  return docsearch

def run_proc(query:str,
             baseDocument,
             llmAiEngine:str = 'gpt-3.5-turbo',
             numberOfReason:int = 15,
             iterNum:int = 5,
             temperature:float = 0.5,
             frequencyPenalty:float = 1.0,
             ):
  '''
  Generate and print a consensus answer to ``query`` from the document store.

  Steps:
    1. Retrieve the ``numberOfReason`` documents most similar to the query.
    2. Feed them to the LLM in batches of ``iterNum`` documents to collect
       evidence (temperature 0.0, frequency penalty 0.0).
    3. Filter/condense the collected evidence with a second chain that uses
       ``temperature`` and ``frequencyPenalty``, and print it.
    4. Produce the final conclusion from the condensed evidence
       (temperature 1.0, frequency penalty 0.0) and print it.
    5. Print the quoted sources via ``print_quote``.

  :param query: the question, as a string
  :param baseDocument: Chroma store created by ``setting``
  :param llmAiEngine: name of the OpenAI chat model used by every chain
  :param numberOfReason: number of split documents retrieved as evidence
  :param iterNum: number of documents sent to the LLM per evidence batch;
                  must be >= 1
  :param temperature: sampling temperature of the evidence-filtering chain
                      (lower is usually better for this use case)
  :param frequencyPenalty: repetition penalty of the evidence-filtering chain
                           (higher avoids repeated wording)
  :return: None (results are printed)
  :raises ValueError: if ``iterNum`` is smaller than 1
  '''
  if iterNum < 1:
    raise ValueError('iterNum must be a positive integer')

  # Load the prompt templates shipped with the project.
  templates = loadTemplate()

  def build_chain(template_key, chain_temperature, chain_frequency_penalty):
    # All three chains differ only in their prompt and sampling settings.
    # The model is chosen per chain through model_name.
    prompt = PromptTemplate(template=templates[template_key],
                            input_variables=["summaries", "question"])
    llm = ChatOpenAI(model_name=llmAiEngine,
                     temperature=chain_temperature,
                     frequency_penalty=chain_frequency_penalty)
    return load_qa_with_sources_chain(llm, chain_type="stuff", prompt=prompt)

  chain = build_chain('template', 0.0, 0.0)
  # The caller-supplied sampling settings apply to the filtering chain; their
  # defaults (0.5 / 1.0) match the values this chain has always used.
  chain_s = build_chain('template_s', temperature, frequencyPenalty)
  chain_agg = build_chain('template_agg', 1.0, 0.0)

  # Retrieve the documents most similar to the query.
  docs = baseDocument.similarity_search(query, k=numberOfReason)

  # Collect evidence batch by batch. Stepping over the documents actually
  # returned keeps a trailing partial batch and never sends an empty batch
  # when the store yields fewer than numberOfReason hits.
  context_doc = []
  for start in range(0, len(docs), iterNum):
    batch = docs[start:start + iterNum]
    output = chain({"input_documents": batch, "question": query}, return_only_outputs=True)
    # The batch's start index is passed as the offset; presumably this shifts
    # the quote numbers so they refer to positions in `docs` - confirm in
    # quoteChecker.change_quote_num.
    batch_text = change_quote_num(output['output_text'], start)
    context_doc.append(Document(page_content=batch_text, metadata={"source": ''}))

  # Print the filtered evidence.
  output = chain_s({"input_documents": context_doc, "question": query}, return_only_outputs=True)
  print(output['output_text'])
  rearr_context_doc = Document(page_content=output['output_text'], metadata={"source": ''})

  # Print the conclusion derived from the condensed evidence.
  output = chain_agg({"input_documents": [rearr_context_doc], "question": query}, return_only_outputs=True)
  print(output['output_text'])

  # Print the quotes/footnotes for the evidence.
  print_quote(rearr_context_doc, docs)

# Build (or reload) the document store used by the query cell.
# Replace the two placeholder strings with real values before running.
docsearch = setting(
    apiKey='YOUR API KEY',
    pdfFolderPath='YOUR GOOGLE DRIVE PDF FOLDER PATH',
    embeddingAi='BM-K/KoSimCSE-bert-multitask',
    docSeparator='. ',
    docSize=2,
    docOverlap=0,
)

In [None]:
# Query the document store; run_proc prints its results and returns None.
run_proc(
    query='국내 주식시장 전망',
    baseDocument=docsearch,
    llmAiEngine='gpt-3.5-turbo',
    numberOfReason=15,
    iterNum=5,
    temperature=0.5,
    frequencyPenalty=1.0,
)