# 벡터의 유사도 개념 파악하기

## 텍스트 임베딩
pip install openai  
pip install pandas  

In [1]:
import openai
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd

# NOTE(review): 'API_key' looks like a placeholder literal — supply a real key
# at run time and avoid committing it to the notebook; confirm how credentials
# are meant to be provided.
openai.api_key = 'API_key'


In [2]:
# Embed a single sentence and keep only the vector stored under
# ['data'][0]['embedding'] in the response.
embedding_result = openai.Embedding.create(
    model="text-embedding-ada-002",
    input='저는 배가 고파요',
)['data'][0]['embedding']
print(embedding_result)

[-0.016397610306739807, -0.021951215341687202, 0.015186872333288193, -0.02716265618801117, -0.03674328327178955, 0.011752059683203697, -0.034611329436302185, -0.006728152744472027, -0.02396472729742527, -0.016897698864340782, -0.008139584213495255, 0.010850585997104645, -0.01056764181703329, -0.022490784525871277, 0.011298032477498055, -0.005092998035252094, 0.01224556751549244, -0.002967625390738249, 0.008172485046088696, -0.016226528212428093, 0.001420479267835617, -0.014620983973145485, 0.01835848018527031, -0.012646953575313091, 0.0032719550654292107, 0.006300446577370167, 0.005336461588740349, -0.019042810425162315, -0.009198980405926704, -0.0017190512735396624, 0.034348126500844955, -0.016542373225092888, -4.070794602739625e-06, 0.0031781885772943497, 0.007244690787047148, -0.005639146082103252, -0.006543910130858421, 0.0033542062155902386, -0.0038361987099051476, -0.002403381746262312, -0.022411823272705078, -0.010251796804368496, 0.011271712370216846, -0.024833299219608307, 0.0

In [3]:
len(embedding_result)

1536

In [4]:
# Sample sentences that make up the toy corpus.
data = [
    '저는 배가 고파요',
    '저기 배가 지나가네요',
    '굶어서 허기가 지네요',
    '허기 워기라는 게임이 있는데 즐거워',
    '스팀에서 재밌는 거 해야지',
    '스팀에어프라이어로 연어구이 해먹을거야',
]

# One-column frame holding the sentences under the 'text' column.
df = pd.DataFrame({'text': data})
df


Unnamed: 0,text
0,저는 배가 고파요
1,저기 배가 지나가네요
2,굶어서 허기가 지네요
3,허기 워기라는 게임이 있는데 즐거워
4,스팀에서 재밌는 거 해야지
5,스팀에어프라이어로 연어구이 해먹을거야


In [5]:
def embedding_func(text):
    """Return the embedding vector for ``text``.

    Calls ``openai.Embedding.create`` with the ``text-embedding-ada-002``
    model and returns the ``'embedding'`` field of the first entry in the
    response's ``'data'`` list.
    """
    response = openai.Embedding.create(model="text-embedding-ada-002", input=text)
    first_item = response['data'][0]
    return first_item['embedding']

# Embed every sentence in the 'text' column. Applying the function to the
# column itself avoids building a row object (and an extra lambda call) for
# each record, and yields one embedding list per row.
df['embedding'] = df['text'].apply(embedding_func)
df

Unnamed: 0,text,embedding
0,저는 배가 고파요,"[-0.01643836498260498, -0.021926594898104668, ..."
1,저기 배가 지나가네요,"[-0.002701738616451621, -0.028862077742815018,..."
2,굶어서 허기가 지네요,"[-0.005840584635734558, -0.007400696165859699,..."
3,허기 워기라는 게임이 있는데 즐거워,"[-0.011341209523379803, -0.011656243354082108,..."
4,스팀에서 재밌는 거 해야지,"[-0.015407744795084, -0.014045882038772106, 0...."
5,스팀에어프라이어로 연어구이 해먹을거야,"[-0.0021207586396485567, -0.03023245744407177,..."


## 코사인 유사도

In [6]:
def cos_sim(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Args:
        A: First vector (any input accepted by ``numpy.dot`` and
            ``numpy.linalg.norm``).
        B: Second vector, with the same length as ``A``.

    Returns:
        The cosine similarity of the two vectors, or ``0.0`` when either
        vector has zero norm. The ratio is undefined in that case, and the
        plain division would produce ``nan`` together with a runtime warning.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator

# Three toy vectors; vec3 is vec2 with every component doubled.
vec1, vec2, vec3 = (
    np.array(values)
    for values in ([0, 1, 1, 1], [1, 0, 1, 1], [2, 0, 2, 2])
)

# Print the pairwise cosine similarities.
print('벡터1과 벡터2의 유사도 :', cos_sim(vec1, vec2))
print('벡터1과 벡터3의 유사도 :', cos_sim(vec1, vec3))
print('벡터2와 벡터3의 유사도 :', cos_sim(vec2, vec3))


벡터1과 벡터2의 유사도 : 0.6666666666666667
벡터1과 벡터3의 유사도 : 0.6666666666666667
벡터2와 벡터3의 유사도 : 1.0000000000000002


In [7]:
def return_answer_candidate(df, query, top_k=3):
    """Return the rows of ``df`` most similar to ``query``.

    The query is embedded with ``embedding_func`` and compared against every
    vector in ``df.embedding`` using ``cos_sim``.

    Args:
        df: DataFrame with an ``embedding`` column of vectors. A
            ``similarity`` column is written to this frame in place.
        query: Text to embed and compare against the stored embeddings.
        top_k: Number of best-matching rows to return (default 3).

    Returns:
        The ``top_k`` rows of ``df`` with the highest ``similarity`` values,
        sorted in descending order of similarity.
    """
    # Convert the query embedding to an array once rather than once per row.
    query_vector = np.array(embedding_func(query))
    df["similarity"] = df.embedding.apply(
        lambda x: cos_sim(np.array(x), query_vector)
    )
    return df.sort_values("similarity", ascending=False).head(top_k)


In [8]:
sim_result = return_answer_candidate(df, '아무 것도 안 먹었더니 꼬르륵 소리가나네')
sim_result


Unnamed: 0,text,embedding,similarity
2,굶어서 허기가 지네요,"[-0.005840584635734558, -0.007400696165859699,...",0.838963
5,스팀에어프라이어로 연어구이 해먹을거야,"[-0.0021207586396485567, -0.03023245744407177,...",0.821658
0,저는 배가 고파요,"[-0.01643836498260498, -0.021926594898104668, ...",0.814633


# Langchain 사용하여 PDF 내용에 질문하기

# PDF 추출하기

pip install PyPDF2

In [9]:
# 패키지 불러오기
from PyPDF2 import PdfReader

In [10]:
# Load the PDF by passing its file path to PdfReader.
pdf_reader = PdfReader('Summary of ChatGPTGPT-4 Research.pdf')

In [11]:
# Extract the text of every page and concatenate it into a single string.
# A single join avoids re-building the string on each `+=` inside a loop.
total_text = "".join(page.extract_text() for page in pdf_reader.pages)

In [12]:
print(total_text)

Summary of ChatGPT/GPT-4 Research
and Perspective Towards the Future of Large
Language Models
Yiheng Liu1, Tianle Han∗1, Siyuan Ma1, Jiayue Zhang1,
Yuanyuan Yang1, Jiaming Tian1, Hao He1, Antong Li2, Mengshen
He1, Zhengliang Liu3, Zihao Wu3, Dajiang Zhu4, Xiang Li5, Ning
Qiang1, Dingang Shen6,7,8, Tianming Liu3, and Bao Ge†1
1School of Physics and Information Technology, Shaanxi Normal University, Xi'an
710119 China
2School of Life and Technology Biomedical-Engineering, Xi'an Jiaotong University,
Xi'an 710049, China
3School of Computing, The University of Georgia, Athens 30602, USA
4Department of Computer Science and Engineering, The University of Texas at
Arlington, Arlington 76019, USA
5Department of Radiology, Massachusetts General Hospital and Harvard Medical
School, Boston 02115, USA
6School of Biomedical Engineering, ShanghaiTech University, Shanghai 201210,
China
7Shanghai United Imaging Intelligence Co., Ltd., Shanghai 200230, China
8Shanghai Clinical Research and Trial Center

# 텍스트 청크 사이즈로 자르기
pip install langchain

In [13]:
from langchain.text_splitter import CharacterTextSplitter

In [14]:
# Configure the splitter that cuts the extracted PDF text into chunks.
# NOTE(review): parameter meanings below follow the LangChain splitter
# documentation as understood by the reviewer — confirm against the installed
# version.
text_splitter = CharacterTextSplitter(
            separator="\n",  # split the text at newlines
            chunk_size=1000,  # target upper bound on chunk length
            chunk_overlap=200,  # text shared between neighbouring chunks
            length_function=len  # measure length with len(), i.e. in characters
        )

In [15]:
chunks = text_splitter.split_text(total_text)

In [16]:
len(chunks)

113

In [17]:
chunks[0]

"Summary of ChatGPT/GPT-4 Research\nand Perspective Towards the Future of Large\nLanguage Models\nYiheng Liu\x031, Tianle Han∗1, Siyuan Ma1, Jiayue Zhang1,\nYuanyuan Yang1, Jiaming Tian1, Hao He1, Antong Li2, Mengshen\nHe1, Zhengliang Liu3, Zihao Wu3, Dajiang Zhu4, Xiang Li5, Ning\nQiang1, Dingang Shen6,7,8, Tianming Liu3, and Bao Ge†1\n1School of Physics and Information Technology, Shaanxi Normal University, Xi'an\n710119 China\n2School of Life and Technology Biomedical-Engineering, Xi'an Jiaotong University,\nXi'an 710049, China\n3School of Computing, The University of Georgia, Athens 30602, USA\n4Department of Computer Science and Engineering, The University of Texas at\nArlington, Arlington 76019, USA\n5Department of Radiology, Massachusetts General Hospital and Harvard Medical\nSchool, Boston 02115, USA\n6School of Biomedical Engineering, ShanghaiTech University, Shanghai 201210,\nChina\n7Shanghai United Imaging Intelligence Co., Ltd., Shanghai 200230, China"

In [18]:
chunks[1]

"School, Boston 02115, USA\n6School of Biomedical Engineering, ShanghaiTech University, Shanghai 201210,\nChina\n7Shanghai United Imaging Intelligence Co., Ltd., Shanghai 200230, China\n8Shanghai Clinical Research and Trial Center, Shanghai 201210, China\nAbstract\nThis paper presents a comprehensive survey of ChatGPT and GPT-4,\nstate-of-the-art large language models (LLM) from the GPT series, and\ntheir prospective applications across diverse domains. Indeed, key innova-\ntions such as large-scale pre-training that captures knowledge across the\nentire world wide web, instruction \x0cne-tuning and Reinforcement Learn-\ning from Human Feedback (RLHF) have played signi\x0ccant roles in en-\nhancing LLMs' adaptability and performance. We performed an in-depth\nanalysis of 194 relevant papers on arXiv, encompassing trend analysis,\nword cloud representation, and distribution analysis across various appli-\ncation domains. The \x0cndings reveal a signi\x0ccant and increasing interest"

## 텍스트 임베딩 / 시멘틱 인덱싱하기
pip install tiktoken  
pip install faiss-cpu

In [19]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

In [20]:
embeddings = OpenAIEmbeddings(openai_api_key="API_key")

In [21]:
knowledge_base = FAISS.from_texts(chunks, embeddings)

## 사용자 질문 임베딩하여 시멘틱 Search 진행하기

In [40]:
docs = knowledge_base.similarity_search("where can i use chatGPT")
docs

[Document(page_content='users to interact with systems, reducing the need for specialized knowledge or\ntraining. Some studies in the literature we collected have already demonstrated\nthis.\nTreude et al. [39] integrated ChatGPT into the prototype of "GPTCOM-\nCARE" to address programming query problems. This integration allowed for\nthe generation of multiple source code solutions for the same query, which in-\ncreased the e\x0eciency of software development. The results of their study demon-\nstrated the e\x0bectiveness of using ChatGPT to improve the quality and diversity\nof code solutions, ultimately reducing the amount of time and e\x0bort required for\nsoftware development.Wang et al. [83] proposed the chatCAD method, which\nutilizes large language models (LLMs) such as ChatGPT to enhance the out-\nput of multiple CAD networks for medical images, including diagnosis, lesion\nsegmentation, and report generation networks. The method generates sugges-', metadata={}),
 Document(pag

In [38]:
# Keep the scored results under their own name: this call returns
# (Document, score) pairs, and rebinding `docs` here would replace the plain
# Document list that is later passed to the question-answering chain as
# `input_documents`.
docs_with_scores = knowledge_base.similarity_search_with_score("where can i use chatGPT")
docs_with_scores

[(Document(page_content='users to interact with systems, reducing the need for specialized knowledge or\ntraining. Some studies in the literature we collected have already demonstrated\nthis.\nTreude et al. [39] integrated ChatGPT into the prototype of "GPTCOM-\nCARE" to address programming query problems. This integration allowed for\nthe generation of multiple source code solutions for the same query, which in-\ncreased the e\x0eciency of software development. The results of their study demon-\nstrated the e\x0bectiveness of using ChatGPT to improve the quality and diversity\nof code solutions, ultimately reducing the amount of time and e\x0bort required for\nsoftware development.Wang et al. [83] proposed the chatCAD method, which\nutilizes large language models (LLMs) such as ChatGPT to enhance the out-\nput of multiple CAD networks for medical images, including diagnosis, lesion\nsegmentation, and report generation networks. The method generates sugges-', metadata={}),
  0.33123994

## ChatGPT에게 최종 질문하기(load_qa_chain)



In [51]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [36]:
# Chat model used by the question-answering chains below.
# NOTE(review): max_tokens=3000 is requested for the completion; confirm that
# the prompt (retrieved documents plus the question) still fits within the
# context limit of 'gpt-3.5-turbo'.
# NOTE(review): "API_key" looks like a placeholder — supply a real key at run
# time rather than committing it.
llm = ChatOpenAI(temperature=0,
                    openai_api_key="API_key",
                    max_tokens=3000,
                    model_name='gpt-3.5-turbo',
                    request_timeout=120
                    )

In [37]:
chain = load_qa_chain(llm, chain_type="stuff")

In [41]:
response = chain.run(input_documents=docs, question="where can i use chatGPT")
response

'ChatGPT can be used in various applications, including but not limited to:\n\n1. Programming: ChatGPT can assist programmers in generating code snippets, suggesting alternative problem-solving methods, translating code between programming languages, and explaining code.\n\n2. Medical Imaging: ChatGPT can enhance the output of CAD networks for medical images, including diagnosis, lesion segmentation, and report generation.\n\n3. Writing Aid: ChatGPT can be used as a writing aid to improve writing performance, generate complex text output, and provide assistance in article writing.\n\n4. Question and Answering: ChatGPT is commonly used in the education sector for question and answering tasks, allowing users to learn, compare, and verify answers in various academic subjects.\n\n5. Chatbots: ChatGPT can be used to create more convincing chatbots that can interact with humans and simulate human-like conversations.\n\nThese are just a few examples of the applications of ChatGPT. Its potenti

In [42]:
print(response)

ChatGPT can be used in various applications, including but not limited to:

1. Programming: ChatGPT can assist programmers in generating code snippets, suggesting alternative problem-solving methods, translating code between programming languages, and explaining code.

2. Medical Imaging: ChatGPT can enhance the output of CAD networks for medical images, including diagnosis, lesion segmentation, and report generation.

3. Writing Aid: ChatGPT can be used as a writing aid to improve writing performance, generate complex text output, and provide assistance in article writing.

4. Question and Answering: ChatGPT is commonly used in the education sector for question and answering tasks, allowing users to learn, compare, and verify answers in various academic subjects.

5. Chatbots: ChatGPT can be used to create more convincing chatbots that can interact with humans and simulate human-like conversations.

These are just a few examples of the applications of ChatGPT. Its potential is vast, a

## Langchain을 활용한 또다른 질문방법 RetrievalQA

pip install chromadb

In [52]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [54]:
db = Chroma.from_texts(chunks, embeddings)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":2})

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\inflearn_chatgpt\ch10\ch10_env\lib\site-packages\langchain\vectorstores\chroma.py", line 80, in __init__
    import chromadb
  File "c:\inflearn_chatgpt\ch10\ch10_env\lib\site-packages\chromadb\__init__.py", line 4, in <module>
    import chromadb.config
  File "c:\inflearn_chatgpt\ch10\ch10_env\lib\site-packages\chromadb\config.py", line 12, in <module>
    from pydantic import BaseSettings, validator
  File "c:\inflearn_chatgpt\ch10\ch10_env\lib\site-packages\pydantic\__init__.py", line 210, in __getattr__
    return _getattr_migration(attr_name)
  File "c:\inflearn_chatgpt\ch10\ch10_env\lib\site-packages\pydantic\_migration.py", line 289, in wrapper
    raise PydanticImportError(
pydantic.errors.PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.3/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydant

In [57]:
# Build a RetrievalQA chain of type "stuff" on top of `retriever`.
# return_source_documents=True is passed so the result also carries the
# retrieved documents (read back later via result["source_documents"]).
qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
query = "where can i use chatGPT"
# The chain is called with a dict keyed by "query"; the answer is read from
# result["result"].
result = qa({"query": query})

In [59]:
print(result["result"])

ChatGPT can be used in various domains and applications. Some examples include:

1. Programming: ChatGPT can assist programmers by generating code snippets, suggesting alternative methods for problem-solving, and translating code between programming languages.

2. Software Development: ChatGPT can be integrated into software development tools to improve the efficiency and quality of code solutions, reducing the time and effort required for software development.

3. Medical Imaging: ChatGPT can enhance the output of CAD networks for medical images, including diagnosis, lesion segmentation, and report generation networks.

4. Natural Language Processing: ChatGPT can be used for tasks such as language translation, text summarization, and question-answering systems.

5. Customer Support: ChatGPT can be employed in customer support systems to provide automated responses and assist users with their queries.

These are just a few examples, and the potential applications of ChatGPT are vast. I

In [61]:
print(result["source_documents"])

[Document(page_content='users to interact with systems, reducing the need for specialized knowledge or\ntraining. Some studies in the literature we collected have already demonstrated\nthis.\nTreude et al. [39] integrated ChatGPT into the prototype of "GPTCOM-\nCARE" to address programming query problems. This integration allowed for\nthe generation of multiple source code solutions for the same query, which in-\ncreased the e\x0eciency of software development. The results of their study demon-\nstrated the e\x0bectiveness of using ChatGPT to improve the quality and diversity\nof code solutions, ultimately reducing the amount of time and e\x0bort required for\nsoftware development.Wang et al. [83] proposed the chatCAD method, which\nutilizes large language models (LLMs) such as ChatGPT to enhance the out-\nput of multiple CAD networks for medical images, including diagnosis, lesion\nsegmentation, and report generation networks. The method generates sugges-', metadata={}), Document(page

## 이전 질문 기록을 포함하여 질문하는 방법 ConversationalRetrievalChain

We can now create a memory object, which is necessary to track the inputs/outputs and hold a conversation.

In [47]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

We now initialize the ConversationalRetrievalChain

In [48]:
from langchain.chains import ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

NameError: name 'retriever' is not defined

In [49]:
chat_history = []
query = "where can i use chatGPT"
result = qa({"question": query, "chat_history": chat_history})
print(result["answer"])

NameError: name 'qa' is not defined

In [50]:
# Store the previous question together with its answer.
chat_history = [(query, result["answer"])]
# Ask a follow-up question, passing the stored history along.
query = "which field is the most used?"
result = qa({"question": query, "chat_history": chat_history})
print(result["answer"])

NameError: name 'result' is not defined