# langchain of python (data_connection, chain, memory, tool)

In [1]:
# python 에서 pdf 파일을 다루기 위한 라이브러리 (읽기, 수정)
# !pip install pypdf

Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.3.1-py3-none-any.whl (295 kB)
Installing collected packages: pypdf
Successfully installed pypdf-4.3.1


In [2]:
# OpenAI에서 제공하는 임베딩을 위한 라이브러리 (OpenAIEmbeddings 를 사용하기 위해 필요)
# !pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.9.11-cp38-cp38-macosx_11_0_arm64.whl.metadata (40 kB)
Downloading tiktoken-0.7.0-cp38-cp38-macosx_11_0_arm64.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.8/906.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.9.11-cp38-cp38-macosx_11_0_arm64.whl (284 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.9.11 tiktoken-0.7.0


In [3]:
# facebook ai 에서 개발한 라이브러리로, 벡터의 유사도 검색을 위해 사용 (gpu 사용시 faiss-gpu)
# !pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp38-cp38-macosx_11_0_arm64.whl.metadata (3.7 kB)
Downloading faiss_cpu-1.8.0.post1-cp38-cp38-macosx_11_0_arm64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [5]:
# 자연어 처리에서 문장 또는 단락을 벡터로 변환하기 위해 사용되는 라이브러리
# !pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Collecting transformers<5.0.0,>=4.38.0 (from sentence-transformers)
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.4.1-cp38-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl.metadata (53 kB)
Collecting sympy (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.13.2-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Downloading networkx-3.1-py3-none-any.whl.metadata (5.3 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.38.0->sentence-transformers)
  Downloading safetensors-0.4.5-

## 실습1.
### PDF 파일 불러오기

In [10]:
# Load the Tom Sawyer PDF with PyPDFLoader and preview part of one loaded entry.
from langchain.document_loaders import PyPDFLoader

file_path = './data/The_Adventures_of_Tom_Sawyer.pdf'
loader = PyPDFLoader(file_path)
# `document` is reused by the FAISS cell further down to build the vector index.
document = loader.load()
# Last expression of the cell, so the notebook displays it: at most the first
# 5000 characters of the entry at index 5 (the recorded output begins with "Chapter 1").
document[5].page_content[:5000]

'Chapter 1    The Fence \n \nTom Sawyer lived with his aunt because his mother and \nfather were dead. Tom didn’t like going to school, and he didn’t like working. He liked playing and having adventures. One Friday, he didn’t go to school—he went to the river. \nAunt Polly was angry. “You’re a bad boy!” she said. \n“Tomorrow you can’t play with your friends because you didn’t go to school today. Tomorrow you’re going to work for me. You can paint the fence.” \nSaturday morning, Tom was not happy, but he started to \npaint the fence. His friend Jim was in the street. \nTom asked him, “Do you want to paint?” \nJim said, “No, I can’t. I’m going to get water.” \nThen Ben came to Tom’s house. He watched Tom and \nsaid, “I’m going to swim today. You can’t swim because you’re working.” \nTom said, “This isn’t work. I like painting.” \n“Can I paint, too?” Ben asked. \n“No, you can’t,” Tom answered. “Aunt Polly asked me \nbecause I’m a very good painter.” \nBen said, “I’m a good painter, too. P

### 임베딩 처리 (data_connection)
- 오픈AI 에서 제공하는 임베딩 모델을 사용하여, 벡터 데이터베이스로 **파이스(FAISS)** 를 사용

In [24]:
import os

In [17]:
# Embed the loaded PDF entries with OpenAIEmbeddings and index them in a FAISS vector store.
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): no credentials are passed explicitly; presumably the OpenAI API key
# is read from the environment — confirm before running.
embeddings = OpenAIEmbeddings()
# `db` is the vector store that the retriever cell later wraps with db.as_retriever().
db = FAISS.from_documents(document, embeddings)

  embeddings = OpenAIEmbeddings()


#### 임베딩 과정 중간 예제
1. OpenAIEmbeddings
2. HuggingFaceEmbeddings

In [21]:
# Example: embed a single query string with the OpenAIEmbeddings model.
from langchain.embeddings import OpenAIEmbeddings
# Rebinds the notebook-level name `embeddings`.
embeddings = OpenAIEmbeddings()

text = "진희는 강아지를 키우고 있습니다. 진희가 키우고 있는 동물은?"
# embed_query turns `text` into a vector; the recorded output shows a list of floats.
text_embedding = embeddings.embed_query(text)
print(text_embedding)

[-0.0029022407787281613, -0.020200480148001805, -0.012732614997768232, -0.016551768712307287, -0.02254878584879169, 0.028684681810162562, -0.027901912622350873, 0.006445215668117995, -0.012025598658249466, 0.009885610263466692, -0.015491243737367842, 0.014443343771482585, 0.006634595460543822, -0.03325504125855524, 0.0006221905465367085, 0.02795241452121282, 0.022233153171856177, -0.00495227388852399, 0.02411432422441507, -0.027068643398322417, -0.00037915353430797733, 0.003894905165848091, 0.014152961112655455, -0.020970624326759304, -0.005078526773033677, 0.018306685949033903, 0.024328953103626687, -0.020452985917021178, -0.004134785925813274, -0.01867281866218618, 0.001387993947109237, -0.0011780982821397016, -0.03550234502426641, -0.005062745511715938, 0.001271998998871404, -0.01772592063137964, 0.01112920252630487, 0.007221671885741293, 0.009607853545016344, 0.003224185960190771, -0.00072437660008321, 0.0032636400448077104, 0.005482536375993713, -0.007625681488701329, 0.0155038687

In [26]:
# Example: embed a single query string with a sentence-transformers model
# loaded through HuggingFaceEmbeddings.
from langchain.embeddings import HuggingFaceEmbeddings

# Rebinds the notebook-level name `embeddings` to the Hugging Face model.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

text = "진희는 강아지를 키우고 있습니다. 진희가 키우고 있는 동물은?"
text_embedding = embeddings.embed_query(text)
print(text_embedding)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[0.013278815895318985, 0.07225915044546127, 0.09263098239898682, -0.003979538567364216, 0.001561783137731254, -0.1030636802315712, 0.10929874330759048, 0.055662013590335846, -0.03116733767092228, -0.05020313337445259, 0.08312954008579254, -0.008924362249672413, 0.09506319463253021, -0.06980787217617035, 0.039559025317430496, -0.10899190604686737, 0.049438707530498505, 0.037364814430475235, -0.1240922138094902, -0.0033154527191072702, 0.04840945824980736, -0.031085141003131866, 0.008207034319639206, 0.06326041370630264, -0.06804247945547104, -0.010208231396973133, 0.004926998633891344, -0.014940319582819939, -0.0014765297528356314, -0.006598863285034895, -0.04015946388244629, 0.08289807289838791, 0.014144647866487503, -0.01179353054612875, -0.09415140748023987, 0.0021563915070146322, -0.01905306987464428, -0.03773898631334305, -0.0032711380627006292, 0.046856094151735306, -0.18111614882946014, -0.11718789488077164, 0.0350484773516655, -0.06848125159740448, 0.06553437560796738, 0.0352285

## 검색기(RetrievalQA) 활용: (원하는 질문에 답변할 수 있도록 사용)

In [27]:
# Chat model used by the RetrievalQA cell that follows; temperature is fixed at 0.
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo')

  llm = ChatOpenAI(


In [29]:
# Answer a question about the indexed PDF with a retrieval-backed QA chain.
from langchain.chains import RetrievalQA

# Expose the FAISS vector store as a retriever the chain can query.
retriever = db.as_retriever()

# chain_type="stuff": the retrieved documents are combined and sent to the LLM
# in a single call.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

query = "마을 무덤에 있던 남자를 죽인 사람은 누구니?"
# Use invoke() instead of calling the chain object directly: Chain.__call__ is
# deprecated in favour of invoke(). The returned mapping is the same, with the
# answer text stored under "result".
result = qa.invoke({"query": query})
print(result['result'])

마을 무덤에 있던 남자를 죽인 사람은 Injun Joe입니다.


#### Tip.

chain_type = "stuff"의 의미:  
Stuff 체인은 가장 간단한 형태의 체인입니다.  
이 방식에서는 모든 문서(또는 텍스트)가 하나로 모아진 후, 그 전체 문서에 대해 LLM(대형 언어 모델)이 한번에 응답을 생성합니다.  
즉, 관련된 문서나 정보들이 많더라도, 모든 내용을 한번에 LLM에게 전달하고 결과를 얻습니다.  
장점: 구조가 단순하고, 빠르게 결과를 얻을 수 있습니다.  
단점: 문서가 너무 크거나 길면, 모델의 토큰 한계를 초과할 수 있습니다.  
다른 체인 타입으로는 map_reduce, refine, map_rerank 등이 있습니다. 이들은 보다 복잡한 방식으로 문서를 처리하고, 각각 장단점이 있습니다.  

## 실습2. (chain)
체인을 파이썬에서 실행해보기

In [31]:
# Build a one-step chain (prompt template -> chat model) and ask for a capital city.
from langchain.chains import LLMChain
# Import PromptTemplate from langchain.prompts; importing it from the package
# root (`from langchain import PromptTemplate`) is a deprecated path.
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

# Rebinds the notebook-level `llm` to gpt-4 for this and the following cells.
llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-4',
)

prompt = PromptTemplate(
    # Variables the caller must supply; "country" fills the placeholder below.
    input_variables=["country"],
    # Prompt text containing the {country} placeholder.
    template="{country} 의 수도는 어디야?",
)

# Connect the prompt and the model into a chain.
chain = LLMChain(llm=llm, prompt=prompt)
# invoke() replaces the deprecated chain.run(); indexing with chain.output_key
# yields the same plain answer string that run() returned, so the displayed
# cell value is unchanged.
chain.invoke({"country": "대한민국"})[chain.output_key]

  chain.run("대한민국")


'대한민국의 수도는 서울입니다.'

## 실습3
**SequentialChain**을 사용해 체인 두개를 연결하고 output_key를 사용하여 각각의 결과를 확인하는 모델  
(영어 문장을 한글로 번역한 후 그 문장을 다시 한 문장으로 요약하는 예제)

In [36]:
# Translate an English passage into Korean, then summarise the translation in
# one sentence, by wiring two LLMChains together with SequentialChain.
# Relies on `llm`, `LLMChain` and `PromptTemplate` from the earlier chain cell.
from langchain.chains import SequentialChain

# Prompt 1: translation request with the {sentence} placeholder.
prompt1 = PromptTemplate(
    input_variables=['sentence'],
    template="다음 문장을 한글로 번역하세요. \n\n{sentence}",
)

# Chain 1 (translation): its output is stored under the key "translation".
chain1 = LLMChain(llm=llm, prompt=prompt1, output_key="translation")

# Prompt 2: summary request; {translation} matches chain1's output_key so the
# translated text feeds straight into this prompt.
prompt2 = PromptTemplate.from_template(
    "다음 문장을 한 문장으로 요약하세요.\n\n{translation}"
)

# Chain 2 (summary): its output is stored under the key "summary".
chain2 = LLMChain(llm=llm, prompt=prompt2, output_key="summary")

# Run chain1 then chain2, and return both intermediate and final outputs.
all_chain = SequentialChain(
    chains=[chain1, chain2],
    input_variables=['sentence'],
    output_variables=['translation', 'summary'],
)

# English passage to translate and summarise.
sentence = """
“I’m late because I talked to Huck Finn,” Tom said. Then the teacher was very angry. “Sit with the girls,” he said to Tom.
Tom sat near the beautiful new girl. He was happy. He looked at her.
“What’s your name?” he asked.
“Becky,” she answered.
Tom smiled and said, “My name’s Tom.”
The teacher was angry again. “Tom Sawyer, stop
talking! Go to your place now,” he said. Tom went to his place.
At twelve o’clock Tom and Becky didn’t go home. They stayed in the school yard and talked. Tom said, “I love you. Do you love me?”
“Yes,” Becky answered.
“Good,” Tom said. “Then you’re going to walk to school with me every day. Amy always walked with me.”
“Amy!” Becky said angrily. “Do you love her?”
“No,” Tom answered. “I love you now. Do you want to walk with me?”
But Becky was angry with Tom. She walked away and didn’t answer. Tom was unhappy. He didn’t go to school in the afternoon.
That night Tom went to bed at nine o’clock, but he didn’t sleep.
At eleven o’clock he went out his bedroom window to the yard. Huck was there. 
They walked to the graveyard. They stopped behind some big trees and talked quietly.
Suddenly, there was a noise. Three men came into the graveyard—the doctor, Muff Potter, and Injun Joe. Injun Joe 
and the doctor talked angrily. Then Injun Joe"""

# invoke() replaces the deprecated direct call `all_chain(sentence)`; the
# displayed mapping still holds "sentence", "translation" and "summary".
all_chain.invoke({"sentence": sentence})

{'sentence': '\n“I’m late because I talked to Huck Finn,” Tom said. Then the teacher was very angry. “Sit with the girls,” he said to Tom.\nTom sat near the beautiful new girl. He was happy. He looked at her.\n“What’s your name?” he asked.\n“Becky,” she answered.\nTom smiled and said, “My name’s Tom.”\nThe teacher was angry again. “Tom Sawyer, stop\ntalking! Go to your place now,” he said. Tom went to his place.\nAt twelve o’clock Tom and Becky didn’t go home. They stayed in the school yard and talked. Tom said, “I love you. Do you love me?”\n“Yes,” Becky answered.\n“Good,” Tom said. “Then you’re going to walk to school with me every day. Amy always walked with me.”\n“Amy!” Becky said angrily. “Do you love her?”\n“No,” Tom answered. “I love you now. Do you want to walk with me?”\nBut Becky was angry with Tom. She walked away and didn’t answer. Tom was unhappy. He didn’t go to school in the afternoon.\nThat night Tom went to bed at nine o’clock, but he didn’t sleep.\nAt eleven o’clock h

## 실습4 (memory) : 대화 기록 저장
1. 모든 대화 유지
2. 최근 k개의 대화 유지
3. 대화를 요약해서 유지
- 앞에서의 대화 내용을 기억해서 답변을 제공

In [37]:
# Hold a three-turn conversation in which the last question depends on facts
# stated in the first two turns.
from langchain.chat_models import ChatOpenAI
# Import ConversationChain from langchain.chains; importing it from the
# package root (`from langchain import ConversationChain`) is a deprecated path.
from langchain.chains import ConversationChain

llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-4',
)

# No memory object is passed, so the chain's default memory is used; with
# verbose=True the formatted prompt is printed, and the recorded output shows
# earlier turns being replayed under "Current conversation:".
conversation = ConversationChain(llm=llm, verbose=True)

conversation.predict(input="진희는 강아지를 한마리 키우고 있습니다.")
conversation.predict(input="영수는 고양이를 두마리 키우고 있습니다.")
# Only this last expression's return value is displayed by the notebook.
conversation.predict(input="진희와 영수가 키우는 동물은 총 몇마리?")

  conversation = ConversationChain(llm=llm, verbose = True)




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: 진희는 강아지를 한마리 키우고 있습니다.
AI:[0m

[1m> Finished chain.[0m


[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:
Human: 진희는 강아지를 한마리 키우고 있습니다.
AI: 그렇군요, 진희님이 강아지를 한 마리 키우고 계시다니 정말 멋진 일이네요. 강아지는 사람들에게 많은 행복과 사랑을 주는 동물이니까요. 혹시 진희님의 강아지는 어떤 종류인지, 그리고 이름은 무엇인지 알 수 있을까요?
Human: 영수는 고양이를 두마리 키우고 있습니다.
AI:[0m

[1m> Finished chain.[0m


[1m> Entering new Conver

'진희님이 한 마리의 강아지를 키우고 있고, 영수님이 두 마리의 고양이를 키우고 있으니, 진희와 영수가 키우는 동물은 총 세 마리입니다.'

## 실습5 (agent, tool)
- 한계점: 학습을 마친 그 시점 이후의 사건이나 사실에 대해서는 정보가 없으며,
일반적인 데이터로 학습했기 때문에 특정한 산업(예: 의료)에 대해 특화되어 있지 않은 현상.
- 극복: 툴을 사용해서 특정 작업을 수행할 수 있는 에이전트를 구현한다.
---

- 에이전트가 위키피디아에서 에드 시런의 생년월일을 조회한 후 계산을 통해 2024년 나이를 계산하는 문제

In [38]:
# 위키피디아 라이브러리
# !pip install wikipedia

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=fc242123f2f2005c08cb3a57928b14545b916a3240a3bc09b906d45a51fff3fd
  Stored in directory: /Users/kimkyusan/Library/Caches/pip/wheels/07/93/05/72c05349177dca2e0ba31a33ba4f7907606f7ddef303517c6a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [39]:
# 연산을 위해 사용되는 라이브러리
# !pip install numexpr

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting numexpr
  Downloading numexpr-2.8.6-cp38-cp38-macosx_11_0_arm64.whl.metadata (8.0 kB)
Downloading numexpr-2.8.6-cp38-cp38-macosx_11_0_arm64.whl (91 kB)
Installing collected packages: numexpr
Successfully installed numexpr-2.8.6


In [40]:
# Chat model that drives the agent in the next cell; temperature is fixed at 0.
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0, model_name='gpt-4')

In [44]:
# Build a tool-using agent and ask it for Ed Sheeran's birth year and his age in 2024.
from langchain.agents import load_tools, initialize_agent, AgentType

# Tools the agent may call: "wikipedia" for the lookup and "llm-math" for the
# age calculation (llm-math is the tool that needs the `llm` argument).
tools = load_tools(["wikipedia", "llm-math"], llm=llm)

agent = initialize_agent(
    # Tools the agent is allowed to use.
    tools,
    # Language model that acts as the agent.
    llm,
    # ZERO_SHOT_REACT_DESCRIPTION: the agent decides which tool to use, and
    # when, from the tools' descriptions.
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    # The former `description="계산이 필요할 때 사용"` argument was dropped:
    # it is not an initialize_agent parameter and had no visible effect; tool
    # descriptions come from the tools themselves.
    verbose=True,
)

# invoke() replaces the deprecated agent.run(); the final answer string is
# stored under "output", so the displayed cell value is unchanged.
agent.invoke(
    {"input": "에드 시런이 태어난 해는? 2024년도 현재 에드 시런은 몇 살? 두가지 모두 답해줘야지"}
)["output"]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe question is asking for the year Ed Sheeran was born and how old he would be in 2024. I need to find out when Ed Sheeran was born first.
Action: wikipedia
Action Input: Ed Sheeran[0m
Observation: [36;1m[1;3mPage: Ed Sheeran
Summary: Edward Christopher Sheeran  ( SHEER-ən; born 17 February 1991) is an English singer-songwriter. Born in Halifax, West Yorkshire, and raised in Framlingham, Suffolk, he began writing songs around the age of eleven. In early 2011, Sheeran independently released the extended play No. 5 Collaborations Project. He signed with Asylum Records the same year.
Sheeran's debut album, + ("Plus"), was released in September 2011 and topped the UK Albums Chart. It contained his first hit single "The A Team". In 2012, Sheeran won the Brit Awards for Best British Male Solo Artist and British Breakthrough Act. Sheeran's second studio album, × ("Multiply"), topped charts around the world upon its release in Ju

'에드 시런은 1991년에 태어났고, 2024년에는 33살입니다.'