## Week8 Basic Homework
- ChatGPT의 MapReduce를 모방한 요약 corpus 만들기
- input data size 축소

In [15]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain.text_splitter import CharacterTextSplitter

from langchain.document_loaders import PyPDFLoader

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

import operator
from typing import Annotated, List, TypedDict

from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

## MapReduce class and function
class OverallState(TypedDict):
    # Notice here we use the operator.add
    # This is because we want combine all the summaries we generate
    # from individual nodes back into one list - this is essentially
    # the "reduce" part
    contents: List[str]
    summaries: Annotated[list, operator.add]
    final_summary: str


# This will be the state of the node that we will "map" all
# documents to in order to generate summaries
class SummaryState(TypedDict):
    content: str


# Here we generate a summary, given a document
async def generate_summary(state: SummaryState):
    response = await map_chain.ainvoke(state["content"])
    return {"summaries": [response]}


# Here we define the logic to map out over the documents
# We will use this an edge in the graph
def map_summaries(state: OverallState):
    # We will return a list of `Send` objects
    # Each `Send` object consists of the name of a node in the graph
    # as well as the state to send to that node
    return [
        Send("generate_summary", {"content": content}) for content in state["contents"]
    ]


# Here we will generate the final summary
async def generate_final_summary(state: OverallState):
    response = await reduce_chain.ainvoke(state["summaries"])
    return {"final_summary": response}


import os
from dotenv import load_dotenv
import pprint

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

## [MY CODE] 데이터셋 만들기

In [19]:
file_list = os.listdir('week7_pdf')
file_list = [x for x in file_list if x not in ['.DS_Store']]
print(file_list)

['dl paper.pdf', 'Artical.pdf', 'deep neural network.pdf', 'bayesian deep learning.pdf', 'RAG_paper.pdf', 'An Improved Particle Filter.pdf', 'NIPS-2017-attention.pdf', 'deep learning.pdf']


In [17]:
# Map prompt
map_template = """This is a part of document:
{pages}

Please summarize the main points of the content.
Answer:"""

map_prompt = PromptTemplate.from_template(map_template)

# Reduce prompt
reduce_template = """This is a set of summary:
{doc_summaries}

Please write a comprehensive summary of this..
Answer:"""

# Reduce 프롬프트 완성
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [18]:
llm = ChatOpenAI(temperature=0,
                 model_name='gpt-4o-mini',
                 api_key=api_key)


In [53]:
corpus_set = []
for file_name in file_list:
    try:
        loader = PyPDFLoader(
            file_path = 'week7_pdf/'+file_name,
        )
        docs = loader.load()
    except:
        continue

    new_docs = []
    for doc in docs:
        tmp = doc.page_content
        if('References' in tmp):
            tmp = tmp.split('References\n')[0]
            doc.page_content = tmp
            new_docs.append(doc)
            break
        elif('references' in tmp):
            tmp = tmp.split('references\n')[0]
            doc.page_content = tmp
            new_docs.append(doc)
            break
        else:
            new_docs.append(doc)

    ## 3 page씩 요약
    print(file_name, '| doc 수:', len(docs))
    base_page = 2
    for i in range(int(len(new_docs)/base_page)):
        if(i+1==int(len(new_docs)/base_page)):
            docs = new_docs[i*base_page:]
        else:
            docs = new_docs[i*base_page:(i+1)*base_page]

        page_contents = [doc.page_content for doc in docs]

        page_contents = '\n'.join(page_contents)

        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            separator="\n",  # 분할기준
            chunk_size=500,   # 사이즈
            chunk_overlap=50, # 중첩 사이즈
        )

        # 분할 실행
        split_docs = text_splitter.split_text(page_contents)
        split_docs = [split.replace('-\n','-').replace('\n',' ') for split in split_docs]
        split_docs = split_docs[:3]

        print(file_name, '| split 수:', len(split_docs))


        ### Map Reduce
        map_prompt = ChatPromptTemplate([("human", map_template)])
        map_chain = map_prompt | llm | StrOutputParser()

        reduce_prompt = ChatPromptTemplate([("human", reduce_template)])
        reduce_chain = reduce_prompt | llm | StrOutputParser()

        # Construct the graph: here we put everything together to construct our graph
        graph = StateGraph(OverallState)
        graph.add_node("generate_summary", generate_summary)
        graph.add_node("generate_final_summary", generate_final_summary)
        graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
        graph.add_edge("generate_summary", "generate_final_summary")
        graph.add_edge("generate_final_summary", END)
        app = graph.compile()

        rsts = []
        async for step in app.astream({"contents": split_docs}):
            rsts.append(step)
            #print(list(step.keys()))


        # GPT 결과와 map prompt, reduce prompt를 통해 corpus 데이터셋 만들기
        for i in range(len(rsts)):
            if('generate_summary' in rsts[i].keys()):
                map_content = rsts[i]['generate_summary']['summaries']
                if(not isinstance(map,str)):
                    map_content = map_content[0]
                corpus = {"input": map_prompt.format(pages=split_docs[i]),
                        "output": map_content}


            else:
                maps = [rst['generate_summary']['summaries'][0] for rst in rsts[:-1]]
                maps_content = '\n'.join(maps)
                reduce_content = rsts[i]['generate_final_summary']['final_summary']
                corpus = {"input": reduce_prompt.format(doc_summaries=maps_content),
                        "output": reduce_content}

            corpus_set.append(corpus)

        print('corpus count:', len(corpus_set))

Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 34 65536 (offset 0)
Ignoring wrong pointing object 92 65536 (offset 0)
Ignoring wrong pointing object 145 65536 (offset 0)
Ignoring wrong pointing object 206 65536 (offset 0)
Ignoring wrong pointing object 274 65536 (offset 0)
Ignoring wrong pointing object 330 65536 (offset 0)
Ignoring wrong pointing object 372 65536 (offset 0)


dl paper.pdf | doc 수: 8
dl paper.pdf | split 수: 3
corpus count: 4
dl paper.pdf | split 수: 3
corpus count: 8
dl paper.pdf | split 수: 3
corpus count: 12
dl paper.pdf | split 수: 3
corpus count: 16
Artical.pdf | doc 수: 16
Artical.pdf | split 수: 3
corpus count: 20
Artical.pdf | split 수: 3
corpus count: 24
Artical.pdf | split 수: 3
corpus count: 28
Artical.pdf | split 수: 3
corpus count: 32
Artical.pdf | split 수: 2
corpus count: 35
Artical.pdf | split 수: 3
corpus count: 39
Artical.pdf | split 수: 3
corpus count: 43
deep neural network.pdf | doc 수: 9
deep neural network.pdf | split 수: 3
corpus count: 47
deep neural network.pdf | split 수: 3
corpus count: 51
deep neural network.pdf | split 수: 3
corpus count: 55
deep neural network.pdf | split 수: 2
corpus count: 58
bayesian deep learning.pdf | doc 수: 7
bayesian deep learning.pdf | split 수: 3
corpus count: 62
RAG_paper.pdf | doc 수: 19
RAG_paper.pdf | split 수: 3
corpus count: 66
RAG_paper.pdf | split 수: 3
corpus count: 70
RAG_paper.pdf | split 수: 3
c

In [83]:
# 총 글자수가 8100개 미만으로 유지
tmp_corpus_set = []
for c in corpus_set:
    if(len(c['input'])+len(c['output'])<8100):
        tmp_corpus_set.append(c)

print(len(corpus_set), len(tmp_corpus_set))

140 138


In [84]:
corpus_set = tmp_corpus_set

In [86]:
import json
with open("corpus_small.json", "w", encoding="utf-8") as f:
    json.dump(corpus_set, f, ensure_ascii=False, indent=2)