In [13]:

import openai
import faiss
import numpy as np

class Chatbot:
    def __init__(self, index, embeddings, documents, system_message):
        self.index = index
        self.embeddings = embeddings
        self.documents = documents
        self.system_message = system_message
        self.chat_history = []

    def get_embedding(self, text, model="text-embedding-ada-002"):
        text = text.replace("\n", " ")
        embedding = openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]
        print("Generated embedding:", embedding)  # 디버깅을 위한 출력
        return embedding
        
    def find_similar_document(self, user_embedding):
        _, top_indices = self.index.search(np.array([user_embedding]), 1)
        top_index = top_indices[0][0]
        return self.documents[top_index]

    def chat(self, user_input):
        user_embedding = self.get_embedding(user_input)
        similar_document = self.find_similar_document(user_embedding)
        similar_document = similar_document[:500]
        system_message = self.system_message + " " + similar_document
        messages = [{"role": "system", "content": system_message}]
        for message in self.chat_history:
            messages.append(message)
        messages.append({"role": "user", "content": user_input})
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
        assistant_message = response.choices[0].message.content
        self.chat_history.append({"role": "user", "content": user_input})
        self.chat_history.append({"role": "assistant", "content": assistant_message})
        return assistant_message

In [14]:
openai.api_key = "Your API Key"


In [15]:
!export OPENAI_API_KEY=your_openai_api_key_here

In [16]:
import PyPDF2

with open("Your_Data_PDF", "rb") as f:
    reader = PyPDF2.PdfFileReader(f)
    text = ""
    for i in range(reader.numPages):
        page = reader.getPage(i)
        text += page.extractText()


In [17]:
documents = text.split("\n\n")  # 두 개의 개행 문자를 기준으로 문서를 분할


In [18]:
chatbot_instance = Chatbot(None, None, None, "System Message")

In [19]:
from numpy import mean, array

# 가정: documents는 긴 텍스트의 리스트입니다.
max_tokens = 8000  # 예시로 8000 토큰을 최대로 설정
embeddings = []

print("Loop starting...")
for doc in documents:
    token_count = len(doc.split())
    if token_count > max_tokens:
        print("Document exceeds max tokens.")
        # 텍스트를 나누고 각 부분에 대해 임베딩을 생성
        num_parts = -(-token_count // max_tokens)  # 올림을 사용하여 부분의 수 계산
        parts = [doc[i:i+max_tokens] for i in range(0, token_count, max_tokens)]
        part_embeddings = [chatbot_instance.get_embedding(part) for part in parts]
        # 부분 임베딩을 어떻게 다룰지는 여러 방법이 있을 수 있습니다.
        # 예를 들어, 평균을 내거나 첫 번째 부분만 사용할 수 있습니다.
        # 부분 임베딩을 평균내어 하나의 임베딩으로 만듭니다.
        avg_embedding = mean(array(part_embeddings), axis=0).tolist()
        embeddings.append(avg_embedding)
    else:
        print("Document within max tokens.")
        embedding = chatbot_instance.get_embedding(doc)
        embeddings.append(embedding)
        print("Current embeddings list:", embeddings)


Loop starting...
Document exceeds max tokens.
Generated embedding: [0.008372033014893532, 0.006289201322942972, -0.007035492919385433, -0.0030801482498645782, -0.01544144842773676, 0.015210776589810848, -0.0159299299120903, -0.010305606760084629, -0.019200043752789497, -0.04681282490491867, 0.004511671140789986, 0.026310166344046593, -0.013541797176003456, -0.0015502508031204343, 0.007150828838348389, 0.007150828838348389, 0.0004064320237375796, -0.0022422666661441326, 0.004270822275429964, -0.012266317382454872, 0.018643716350197792, -0.0036059445701539516, -0.015427879057824612, -0.016377704218029976, -0.010522709228098392, -0.0010694017400965095, 0.0154957240447402, -0.013060100376605988, -0.014450916089117527, -0.0038332242984324694, 0.02096400409936905, -0.011282569728791714, -0.01502081099897623, -0.0076528796926140785, 0.010020658373832703, -0.006458813324570656, -0.0026578151155263186, -0.035740576684474945, 0.016689790412783623, -0.0006267151911742985, 0.014857984147965908, 0.

In [20]:
import faiss

# 임베딩을 NumPy 배열로 변환
embeddings_np = np.array(embeddings).astype('float32')

# FAISS 인덱스 생성
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)

In [21]:
system_message = "안녕하세요, w5500 봇 입니다어떻게 도와드릴까요?"
chatbot_instance = Chatbot(index, embeddings, documents, system_message)

In [22]:
user_input = "안녕하세요, w5500에 대해 알려주세요."
response = chatbot_instance.chat(user_input)
print("챗봇: ", response)

Generated embedding: [0.01689409464597702, -0.014876889064908028, -0.012992395088076591, -0.033283889293670654, -0.03646894916892052, 0.023675620555877686, -0.02275991626083851, 0.013191461563110352, -0.023649077862501144, -0.005729792173951864, 0.0062739066779613495, 0.014996329322457314, -0.04198972135782242, -0.018207931891083717, -0.001357798115350306, -0.00017687871877569705, 0.004697964992374182, 0.0038121200632303953, 0.019999530166387558, -0.0010326565243303776, 0.02042420394718647, -0.023914501070976257, -0.008082092739641666, -0.0008601323934271932, -0.002320781582966447, -0.009090695530176163, 0.0002865725546143949, -0.009608267806470394, 0.00047278249985538423, -0.03163827210664749, 0.02003934234380722, 0.004800816066563129, -0.025201795622706413, 0.00578951183706522, 0.017398396506905556, -0.006526057608425617, 0.030204996466636658, -0.04007868468761444, 0.014757449738681316, -0.012448280118405819, 0.013184825889766216, 0.003336019581183791, 0.029621068388223648, -0.011698

In [23]:
user_input = "w5500의 기능은 무엇인가요?"
response = chatbot_instance.chat(user_input)
print("챗봇: ", response)

Generated embedding: [0.004902536980807781, -0.01073762122541666, 0.004436262883245945, -0.01384167280048132, -0.054727230221033096, 0.011949933134019375, -0.028136298060417175, 0.0014454490737989545, -0.022074738517403603, -0.006417926866561174, 0.0025495190639048815, 0.035303592681884766, -0.033971384167671204, -0.0030690813437104225, -0.0020133040379732847, 0.0048425872810184956, 0.006471215281635523, -0.012522784061729908, 0.005158987361937761, -0.021248767152428627, 0.007786773610860109, -0.018517734482884407, -0.01669260486960411, -0.008626067079603672, 0.0021831609774380922, 0.012842514552175999, 0.0025761632714420557, -0.00732049997895956, -0.005688541103154421, -0.0018700913060456514, 0.03306547924876213, 0.007840062491595745, -0.027110494673252106, -0.0012764247367158532, 0.012322952970862389, 0.014241336844861507, 0.02314050681889057, -0.042151160538196564, 0.012329613789916039, -0.018997330218553543, 0.008785932324826717, 0.010031549260020256, 0.02356681413948536, -0.006461

In [34]:
while True:
  
    user_input = input("당신: ")

    if user_input.lower() == "quit":
        print("챗봇: 안녕히 가세요!")
        break

    response = chatbot_instance.chat(user_input)
    
    print("챗봇: ", response)