In [1]:
# NOTE(review): langchain and pypdf are not referenced by these bare names in
# the visible cells; presumably they are imported here to fail fast when the
# packages are missing -- confirm.
import langchain
import pypdf
import warnings
# Install a blanket "ignore" filter: every warning issued through the
# warnings module after this line is suppressed.
warnings.filterwarnings("ignore")

In [2]:
# Open the text file and read its whole content into memory.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which differs between machines, and the text
# contains non-ASCII punctuation (curly quotes, em dashes, non-breaking spaces).
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open('data/generative ai.txt', 'r', encoding='utf-8') as file:
    text_gen_ai = file.read()

In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Character-based splitter configured to break on blank lines.
# The recorded output of this cell shows that chunks longer than chunk_size
# can still be produced (the splitter reports them as it runs).
text_splitter = CharacterTextSplitter(
    length_function=len,  # chunk length is measured in characters
    chunk_overlap=100,    # neighbouring chunks share up to 100 characters
    chunk_size=1000,      # target chunk length: 1000 characters
    separator='\n\n',     # split on paragraph boundaries
)

# Split the loaded text, then show the first chunk as the cell's result.
texts = text_splitter.split_text(text_gen_ai)
texts[0]

Created a chunk of size 1003, which is longer than the specified 1000
Created a chunk of size 2239, which is longer than the specified 1000
Created a chunk of size 5023, which is longer than the specified 1000
Created a chunk of size 3461, which is longer than the specified 1000
Created a chunk of size 1266, which is longer than the specified 1000
Created a chunk of size 1143, which is longer than the specified 1000
Created a chunk of size 2523, which is longer than the specified 1000


'What is generative AI?\nGenerative AI can learn from existing artifacts to generate new, realistic artifacts (at scale) that reflect the characteristics of the training data but don’t repeat it. It can produce a variety of novel content, such as images, video, music, speech, text, software code and product designs.\xa0\xa0\nGenerative AI uses a number of techniques that continue to evolve. Foremost are AI foundation models, which are trained on a broad set of unlabeled data that can be used for different tasks, with additional fine-tuning.\xa0Complex math and enormous computing power are required to create these trained models, but they are, in essence, prediction algorithms.\xa0\nToday, generative AI most commonly creates content in response to natural language requests — it doesn’t require knowledge of or entering code — but the\xa0enterprise use cases\xa0are numerous and include innovations in drug and chip design and material science development. (Also see “What are some practical

In [4]:
# Print the first three chunks, with a 100-dash rule between consecutive ones.
for position in range(3):
    if position > 0:
        print('-' * 100)
    print(texts[position])


What is generative AI?
Generative AI can learn from existing artifacts to generate new, realistic artifacts (at scale) that reflect the characteristics of the training data but don’t repeat it. It can produce a variety of novel content, such as images, video, music, speech, text, software code and product designs.  
Generative AI uses a number of techniques that continue to evolve. Foremost are AI foundation models, which are trained on a broad set of unlabeled data that can be used for different tasks, with additional fine-tuning. Complex math and enormous computing power are required to create these trained models, but they are, in essence, prediction algorithms. 
Today, generative AI most commonly creates content in response to natural language requests — it doesn’t require knowledge of or entering code — but the enterprise use cases are numerous and include innovations in drug and chip design and material science development. (Also see “What are some practical uses of generative AI

In [5]:

# Character length of every chunk in `texts`.
char_list = [len(chunk) for chunk in texts]
char_list

[1003, 2238, 5022, 3460, 1265, 995, 1142, 2523, 3101]

In [6]:
# Build Document objects from the chunks in `texts` and display the
# resulting list as the cell's output.
docs = text_splitter.create_documents(texts=texts)
docs

[Document(page_content='What is generative AI?\nGenerative AI can learn from existing artifacts to generate new, realistic artifacts (at scale) that reflect the characteristics of the training data but don’t repeat it. It can produce a variety of novel content, such as images, video, music, speech, text, software code and product designs.\xa0\xa0\nGenerative AI uses a number of techniques that continue to evolve. Foremost are AI foundation models, which are trained on a broad set of unlabeled data that can be used for different tasks, with additional fine-tuning.\xa0Complex math and enormous computing power are required to create these trained models, but they are, in essence, prediction algorithms.\xa0\nToday, generative AI most commonly creates content in response to natural language requests — it doesn’t require knowledge of or entering code — but the\xa0enterprise use cases\xa0are numerous and include innovations in drug and chip design and material science development. (Also see “

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Rebind `text_splitter` to a recursive splitter with the same size settings
# as before; no separator is passed here.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # chunk length is measured in characters
    chunk_overlap=100,    # neighbouring chunks share up to 100 characters
    chunk_size=1000,      # target chunk length: 1000 characters
)

# Split the raw text directly into Document objects (this replaces the
# earlier `docs`).
docs = text_splitter.create_documents([text_gen_ai])

# Report the number of chunks, then the character length of the first one.
print(len(docs))
print(len(docs[0].page_content))



27
673


In [9]:
# Character length of each document's page_content.
char_list = [len(doc.page_content) for doc in docs]
print(char_list)

[673, 328, 659, 672, 904, 989, 925, 915, 927, 936, 370, 754, 915, 984, 858, 907, 356, 995, 374, 767, 851, 878, 791, 868, 934, 777, 669]


## 토큰 단위 텍스트 분할기

- ChatGPT 토큰 단위

In [10]:
!pip install tiktoken --quiet

In [15]:
import tiktoken

# Load the cl100k_base encoding once; tiktoken_len reuses this object.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text: str) -> int:
    """Return the number of tokens `text` encodes to with `tokenizer`.

    Defined with `def` instead of a lambda bound to a name so the callable
    has a proper __name__ and can carry this docstring; it is called the
    same way as before (one argument named `text`).
    """
    return len(tokenizer.encode(text))


In [21]:
# Token count of every document, measured with tiktoken_len.
print([tiktoken_len(doc.page_content) for doc in docs])

[137, 63, 148, 125, 164, 190, 177, 174, 181, 197, 72, 134, 166, 172, 169, 178, 70, 222, 76, 133, 161, 172, 144, 218, 178, 171, 121]


In [24]:
# Split a PDF file into chunks whose size is measured in tokens

from langchain.document_loaders import PyPDFLoader

# Load the PDF through the loader's load_and_split() and report how many
# documents it returned.
loader = PyPDFLoader('data/[정책브리프 2021-04] 탄소중립 대응을 위한 정부 정책과 동향.pdf')
pages = loader.load_and_split()
print(len(pages))

# Rebind `text_splitter`: because length_function is tiktoken_len, both
# chunk_size and chunk_overlap are counted in tokens, not characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=tiktoken_len,  # measure length in tokens
    chunk_overlap=50,              # overlap of 50 between neighbouring chunks
    chunk_size=500,                # target chunk length of 500
)

# Re-split the loaded documents (this replaces the earlier `docs`) and report
# the resulting chunk count.
docs = text_splitter.split_documents(pages)

print(len(docs))

19
43


In [25]:
# Character length (not token count) of each chunk from the token-based split.
print([len(doc.page_content) for doc in docs])

[175, 239, 458, 448, 304, 495, 576, 286, 452, 452, 325, 475, 444, 324, 437, 447, 379, 425, 394, 272, 465, 431, 151, 471, 375, 458, 453, 105, 476, 345, 408, 287, 425, 465, 2, 465, 380, 478, 388, 510, 263, 366, 78]
