# PDF Summary

In [1]:
#!pip install --upgrade sagemaker --quiet
!pip install langchain --quiet

#!pip install faiss-cpu --quiet

In [2]:
import time
import sagemaker, boto3, json
from sagemaker.session import Session
from sagemaker.model import Model
from sagemaker import image_uris, model_uris, script_uris, hyperparameters
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base
from typing import Any, Dict, List, Optional
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.llms.sagemaker_endpoint import ContentHandlerBase

# Single SageMaker session shared by the rest of the notebook.
sagemaker_session = Session()
# ARN of the identity this notebook is running as.
aws_role = sagemaker_session.get_caller_identity_arn()

# `sess` is kept as an alias rather than constructing a second, identical
# session: `sagemaker.Session` and the imported `Session` are the same class.
sess = sagemaker_session
# Not referenced anywhere else in the visible notebook; presumably a
# model-version wildcard left over from a deployment step — TODO confirm.
model_version = "*"

In [3]:
import boto3, json
from sagemaker.session import Session

In [4]:
# Create the SageMaker session and look up the caller's ARN. This repeats the
# setup already performed right after the imports.
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()
# Region configured on the default boto3 session; passed to the LLM client below.
aws_region = boto3.Session().region_name

In [5]:
# Name of the SageMaker endpoint that the LangChain LLM wrapper will invoke.
endpoint_name = "jumpstart-dft-hf-llm-falcon-7b-instruct-bf16"

In [6]:
from langchain import SagemakerEndpoint

In [7]:
from langchain.llms.sagemaker_endpoint import LLMContentHandler

In [8]:
# Region of the default boto3 session (same assignment as in the earlier setup cell).
aws_region = boto3.Session().region_name

In [9]:
class ContentHandler(LLMContentHandler):
    """Serialize prompts for the endpoint and extract the generated text.

    Requests are sent as JSON of the form
    ``{"inputs": <prompt>, "parameters": <model_kwargs>}``; the reply is
    parsed as a JSON list whose first element carries ``"generated_text"``.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompt: Text sent under the ``"inputs"`` key.
            model_kwargs: Generation settings sent under the ``"parameters"`` key.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        input_str = json.dumps({'inputs': prompt, 'parameters': model_kwargs})
        # Alternative payload shape kept for reference (kwargs merged at the top level):
        # input_str = json.dumps({'inputs': prompt, **model_kwargs})
        return input_str.encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        """Return the generated text contained in the endpoint response.

        Args:
            output: The response body. The annotation is kept as ``bytes``
                to leave the signature unchanged, but a file-like object
                exposing ``.read()`` is accepted as well, as are raw bytes
                or an already-decoded string.

        Returns:
            The ``"generated_text"`` field of the first element in the
            decoded JSON list.
        """
        # The original code called .read() unconditionally, which fails when
        # real bytes are passed; accept both a stream and plain bytes.
        raw = output.read() if hasattr(output, "read") else output
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        response_json = json.loads(raw)
        return response_json[0]["generated_text"]


# Single handler instance passed to the SagemakerEndpoint wrapper.
content_handler = ContentHandler()

In [10]:
# Generation settings forwarded with every request; the content handler
# places them under the "parameters" key of the JSON payload.
# max_new_tokens presumably caps the number of generated tokens — TODO confirm.
parameters = dict(max_new_tokens=300)

In [11]:
# LangChain LLM wrapper bound to the endpoint named above. Generation
# settings travel via model_kwargs, and the content handler converts
# prompts and responses to and from JSON.
llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    region_name=aws_region,
    model_kwargs=parameters,
    content_handler=content_handler,
)

## PDF Summary

In [12]:
!pip install PyPDF2 --quiet

In [13]:
import PyPDF2
from io import BytesIO

In [14]:
# SageMaker session used to resolve the session's default S3 bucket.
sess = sagemaker.Session()
# Bucket and key prefix from which the source PDF is read.
s3_bucket = sess.default_bucket()
s3_prefix = 'docs'

In [15]:
# !aws s3 cp --recursive contents s3://{s3_bucket}/{s3_prefix}/

In [16]:
# PDF to summarize; enable one of the alternatives to try another document.
# s3_file_name = "sample-blog.pdf"
# s3_file_name = "gen-ai-aws.pdf"
s3_file_name = "2016-3series.pdf"

In [17]:
# Fetch the PDF object from S3 and extract the text of every page.
s3r = boto3.resource("s3")
doc = s3r.Object(s3_bucket, f"{s3_prefix}/{s3_file_name}")

# Keep the raw bytes under their own name so that `contents` only ever
# refers to text (the original reused `contents` for both).
pdf_bytes = doc.get()['Body'].read()
reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))

# `or ""` keeps the join below safe if a page yields no text.
raw_text = [page.extract_text() or "" for page in reader.pages]
# One string for the whole document, pages separated by newlines.
contents = '\n'.join(raw_text)

In [18]:
# Flatten the document to a single line. `contents` is already a str (it is
# the result of a str.join), so the former str() wrapper was redundant.
new_contents = contents.replace("\n", " ")
# Variant that only keeps the first 8000 characters:
# new_contents = contents[:8000].replace("\n", " ")

### RecursiveCharacterTextSplitter를 이용해 chunk로 텍스트를 분리합니다.

In [19]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the flattened text into chunks: chunk_size=1000 with no overlap
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_text(new_contents)

In [20]:
# Number of chunks produced by the splitter.
len(texts)

444

In [21]:
# Show the first chunk as a quick check of the extraction and splitting.
print(texts[0])

Owner's Manual for Vehicle The Ultimate Driving Machine® THE BMW 3 SERIES SEDAN. OWNER'S MANUAL. Contents A-Z Online Edition for Part no. 01 40 2 960 440 - II/15  3 Series Owner's Manual for Vehicle Thank you for choosing a BMW. The more familiar you are with your vehicle, the better control you will have on the road. We therefore strongly suggest: Read this Owner's Manual before starting off in your new BMW. Also use the Integrated Owner's Manual in your vehicle. It con‐ tains important information on vehicle operation that will help you make full use of the technical features available in your BMW. The manual also contains information designed to en‐ hance operating reliability and road safety, and to contribute to maintaining the value of your BMW. Any updates made after the editorial deadline for the printed or Integrated Owner's Manual are found in the appendix of the printed Quick Reference for the vehicle. Supplementary information can be found in the additional bro‐ chures in


(주의) page가 여러개일 경우에 refine이나 map_reduce를 쓰는데, pages수가 너무 많으면 1) 입력 Token수가 1024개를 넘을 수 있고, 2) 브라우저 시간제한(30초)내에 처리가 어려울 수 있습니다. 편의상 여기서는 3000자로 요약을 수행합니다.

In [22]:
from langchain.docstore.document import Document

# Wrap only the first three chunks as Documents so the summarization input
# stays small.
docs = [Document(page_content=chunk) for chunk in texts[:3]]

In [23]:
# Number of Document objects that will be summarized.
len(docs)

3

### Summarize type: stuff, refine, map_reduce

- stuff puts all the chunks into a single prompt, so it can exceed the model's maximum token limit.
- map_reduce summarizes each chunk, combines the summaries, and then summarizes the combined summary. If the combined summary is too large, it raises an error. (채팅에서 쓰기에는 너무 많은 시간이 소요되며, map_reduce를 쓰려면 transformers를 설치하여야 하는데, 노트북에서는 잘 동작하지만 container에서는 write가 안되어서 사용할 수 없습니다.)
- refine summarizes the first chunk, and then summarizes the second chunk with the first summary. The same process repeats until all chunks are summarized.

In [24]:
# !pip install transformers --quiet

In [25]:
from langchain.chains.summarize import load_summarize_chain
# map_reduce: summarize each chunk on its own, then summarize the combined
# partial summaries.
chain = load_summarize_chain(llm, chain_type="map_reduce")
# Run the chain over the selected chunks and show the resulting summary.
summary = chain.run(docs)
print(summary)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


 The Owner's Manual for the 3 Series Sedan provides essential information on vehicle operation, maintenance, and safety features. It is important to familiarize yourself with the manual to ensure optimal control and value of your BMW. The manual is available in many countries as an app and online.


In [26]:
# Word count of the summary (whitespace-separated tokens).
len(summary.split())

48

In [27]:
from langchain.chains.summarize import load_summarize_chain
# refine: summarize the first chunk, then fold each following chunk into the
# running summary.
chain = load_summarize_chain(llm, chain_type="refine")
summary = chain.run(docs)
# NOTE(review): the recorded output of this cell is blank — confirm whether
# the refine chain returned an empty summary with this model.
print(summary)




### Prompt로 load_summarize_chain을 하는 방법
- Prompt에 docs가 한꺼번에 들어가므로 chain_type은 stuff가 되어야 하고, docs의 단어 수가 LLM의 token 제한보다 많은 경우에는 실패합니다. 따라서 아래처럼 docs의 일부만 요약(Summary)할 수 있습니다.

In [28]:
from langchain.docstore.document import Document

# Rebuild the Document list from the first three chunks; the "stuff" chain
# below places all of them into a single prompt.
docs = [Document(page_content=chunk) for chunk in texts[:3]]

In [29]:
from langchain.prompts import PromptTemplate

# Prompt for the "stuff" chain: the text to summarize is substituted into the
# single {text} placeholder.
# NOTE(review): the template ends with "CONCISE SUMMARY " (trailing space, no
# colon) — confirm this is intentional.
prompt_template = """Write a concise summary of the following:


{text}


CONCISE SUMMARY """


In [30]:
# Wrap the template, declaring {text} as its only input variable.
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
# "stuff" places every document into one prompt, so the combined input has to
# fit within the model's token limit.
chain = load_summarize_chain(llm, chain_type="stuff", prompt=PROMPT)
summary = chain.run(docs)
print(summary)


The Owner's Manual for the 3 Series is a comprehensive guide to the vehicle's features and operation. It contains important information on vehicle operation that will help you make full use of the technical features available in your BMW. The manual also contains information designed to enhance operating reliability and road safety, and to contribute to maintaining the value of your BMW. Any updates made after the editorial deadline for the printed or Integrated Owner's Manual are found in the appendix of the printed Quick Reference for the vehicle. Supplementary information can be found in the additional brochures in the onboard literature. We wish you a safe and enjoyable ride.

BMW AG
The Owner's Manual is available in many countries as an app. Additional information on the Internet: www.bmw.com/bmw_drivers_guide Online Edition for Part no. 01 40 2 960 440 - II/15 © 2015 Bayerische Motoren Werke Aktiengesellschaft Munich, Germany Reprinting, including excerpts, only with the writte

In [31]:
# Word count of the summary (whitespace-separated tokens).
len(summary.split())

207