## Week7 Advanced1 Homework
- ChatGPT의 MapReduce를 모방한 요약 corpus 만들기

In [1]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain.text_splitter import CharacterTextSplitter

from langchain.document_loaders import PyPDFLoader

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

import operator
from typing import Annotated, List, TypedDict

from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

## MapReduce class and function
class OverallState(TypedDict):
    # Notice here we use the operator.add
    # This is because we want combine all the summaries we generate
    # from individual nodes back into one list - this is essentially
    # the "reduce" part
    contents: List[str]
    summaries: Annotated[list, operator.add]
    final_summary: str


# This will be the state of the node that we will "map" all
# documents to in order to generate summaries
class SummaryState(TypedDict):
    content: str


# Here we generate a summary, given a document
async def generate_summary(state: SummaryState):
    response = await map_chain.ainvoke(state["content"])
    return {"summaries": [response]}


# Here we define the logic to map out over the documents
# We will use this an edge in the graph
def map_summaries(state: OverallState):
    # We will return a list of `Send` objects
    # Each `Send` object consists of the name of a node in the graph
    # as well as the state to send to that node
    return [
        Send("generate_summary", {"content": content}) for content in state["contents"]
    ]


# Here we will generate the final summary
async def generate_final_summary(state: OverallState):
    response = await reduce_chain.ainvoke(state["summaries"])
    return {"final_summary": response}


import os
from dotenv import load_dotenv
import pprint

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

## [MY CODE] 데이터셋 만들기
-

In [2]:
file_list = os.listdir('week7_pdf')
print(file_list)

['dl paper.pdf', 'Artical.pdf', 'deep neural network.pdf', '.DS_Store', 'bayesian deep learning.pdf', 'RAG_paper.pdf', 'An Improved Particle Filter.pdf', 'NIPS-2017-attention.pdf', 'deep learning.pdf']


In [3]:
# Map prompt
map_template = """This is a part of document:
{pages}

Please summarize the main points of the content.
Answer:"""

map_prompt = PromptTemplate.from_template(map_template)

# Reduce prompt
reduce_template = """This is a set of summary:
{doc_summaries}

Please write a comprehensive summary of this..
Answer: """

# Reduce 프롬프트 완성
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [4]:
llm = ChatOpenAI(temperature=0,
                 model_name='gpt-4o-mini',
                 api_key=api_key)


In [5]:
corpus_set = []
for file_name in file_list:
    try:
        loader = PyPDFLoader(
            file_path = 'week7_pdf/'+file_name,
        )
        docs = loader.load()
    except:
        continue

    page_contents = []
    for doc in docs:
        tmp = doc.page_content
        if('References' in tmp):
            tmp = tmp.split('References\n')[0]
            page_contents.append(tmp)
            break
        elif('references' in tmp):
            tmp = tmp.split('references\n')[0]
            page_contents.append(tmp)
            break
        else:
            page_contents.append(tmp)

    page_contents = '\n'.join(page_contents)
    page_contents = page_contents

    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        separator="\n",  # 분할기준
        chunk_size=900,   # 사이즈
        chunk_overlap=200, # 중첩 사이즈
    )

    # 분할 실행
    split_docs = text_splitter.split_text(page_contents)
    split_docs = [split.replace('-\n','-').replace('\n',' ') for split in split_docs]

    print(file_name, '| doc 수:', len(docs), '| split 수:', len(split_docs))


    ### Map Reduce
    map_prompt = ChatPromptTemplate([("human", map_template)])
    map_chain = map_prompt | llm | StrOutputParser()

    reduce_prompt = ChatPromptTemplate([("human", reduce_template)])
    reduce_chain = reduce_prompt | llm | StrOutputParser()

    # Construct the graph: here we put everything together to construct our graph
    graph = StateGraph(OverallState)
    graph.add_node("generate_summary", generate_summary)
    graph.add_node("generate_final_summary", generate_final_summary)
    graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
    graph.add_edge("generate_summary", "generate_final_summary")
    graph.add_edge("generate_final_summary", END)
    app = graph.compile()

    rsts = []
    async for step in app.astream({"contents": split_docs}):
        rsts.append(step)
        #print(list(step.keys()))


    # GPT 결과와 map prompt, reduce prompt를 통해 corpus 데이터셋 만들기
    for i in range(len(rsts)):
        if('generate_summary' in rsts[i].keys()):
            map_content = rsts[i]['generate_summary']['summaries']
            if(not isinstance(map,str)):
                map_content = map_content[0]
            corpus = {"input": map_prompt.format(pages=split_docs[i]),
                    "output": map_content}


        else:
            maps = [rst['generate_summary']['summaries'][0] for rst in rsts[:-1]]
            maps_content = '\n'.join(maps)
            reduce_content = rsts[i]['generate_final_summary']['final_summary']
            corpus = {"input": reduce_prompt.format(doc_summaries=maps_content),
                      "output": reduce_content}

        corpus_set.append(corpus)

    print('corpus count:', len(corpus_set))

Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 34 65536 (offset 0)
Ignoring wrong pointing object 92 65536 (offset 0)
Ignoring wrong pointing object 145 65536 (offset 0)
Ignoring wrong pointing object 206 65536 (offset 0)
Ignoring wrong pointing object 274 65536 (offset 0)
Ignoring wrong pointing object 330 65536 (offset 0)
Ignoring wrong pointing object 372 65536 (offset 0)


dl paper.pdf | doc 수: 8 | split 수: 12
corpus count: 13
Artical.pdf | doc 수: 16 | split 수: 13
corpus count: 27
deep neural network.pdf | doc 수: 9 | split 수: 8


invalid pdf header: b'\x00\x00\x00\x01B'
EOF marker not found


corpus count: 36
bayesian deep learning.pdf | doc 수: 7 | split 수: 4
corpus count: 41
RAG_paper.pdf | doc 수: 19 | split 수: 15
corpus count: 57
An Improved Particle Filter.pdf | doc 수: 14 | split 수: 14
corpus count: 72
NIPS-2017-attention.pdf | doc 수: 11 | split 수: 10
corpus count: 83
deep learning.pdf | doc 수: 11 | split 수: 12
corpus count: 96


In [6]:
print('corpus final count:', len(corpus_set))
corpus_set[:3]

corpus final count: 96


[{'input': 'Human: This is a part of document:\n249 Understanding the difﬁculty of training deep feedforward neural networks Xavier Glorot Yoshua Bengio DIRO, Universit´e de Montr ´eal, Montr ´eal, Qu ´ebec, Canada Abstract Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experi-mental results showing the superiority of deeper vs less deep architectures. All these experimen-tal results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We ﬁrst observe the inﬂuence of the non-linear activations func-tions. We ﬁnd that the logistic sigmoid activation is unsuited for deep networks with random ini-tializatio

## [LOG] 8개 문서 중 하나 실패하였고, 총 96개의 학습 데이터 생성

In [7]:
import json
with open("corpus.json", "w", encoding="utf-8") as f:
    json.dump(corpus_set, f, ensure_ascii=False, indent=2)