In [1]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from langchain_community.vectorstores import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

import os
import numpy as np
import openai
import tiktoken
import json

In [4]:
# Load service credentials from the local JSON config and expose them as
# environment variables so later cells can read them with os.getenv().
with open('/home/prompt_eng/langchain/langchain_proto/web_main/config/qdrant.json', 'r', encoding='utf8') as config_file:
    config = json.load(config_file)

os.environ.update({
    'QDRANT_HOST': config['QDRANT_HOST'],
    'QDRANT_API_KEY': config['QDRANT_API_KEY'],
    'OPENAI_API_KEY': config['OPENAI_API_KEY'],
})

In [6]:
# Shared Qdrant client for the notebook; host and API key come from the
# environment variables populated from the config file.
client = QdrantClient(
    os.environ.get("QDRANT_HOST"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)

In [7]:
# Show the client object's repr as a quick check that it was constructed.
client

<qdrant_client.qdrant_client.QdrantClient at 0x7fee98fd8110>

##### **1. Qdrant Create**

In [94]:
# create or replace collection
# NOTE: this replaces any existing collection with the same name.

os.environ['QDRANT_COLLECTION_NAME'] = "sample_server"

# Use the `models` alias imported at the top of the notebook; the bare
# name `qdrant_client` is never imported, so `qdrant_client.http.models...`
# raises NameError on a fresh kernel.
collection_config = models.VectorParams(
    size=768,  # 1536 for OpenAI; 768 for Gemini/HuggingFace/instructor-xl
    distance=models.Distance.COSINE,
)

client.recreate_collection(
    collection_name=os.getenv('QDRANT_COLLECTION_NAME'),
    vectors_config=collection_config,
)

True

In [39]:
# token-based length function for the text splitter

def tiktoken_len(text):
    """Return the number of `cl100k_base` tokens that `text` encodes to."""
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

In [40]:
# split a text into overlapping, token-sized chunks

def get_chunks(text):
    """Split `text` into chunks of at most 900 tokens with a 200-token overlap.

    Lengths are measured with `tiktoken_len`, so the limits are in tokens,
    not characters. Returns whatever `split_text` produces for `text`.
    """
    splitter = RecursiveCharacterTextSplitter(
        length_function=tiktoken_len,
        chunk_size=900,
        chunk_overlap=200,
    )
    return splitter.split_text(text)

In [101]:
# embedding model definition
# (its output dimension must match the `size` used for the Qdrant collection)

def get_embedding():
    """Build the HuggingFace embedding model used for both indexing and querying.

    Runs on CPU and asks the encoder to normalize the embeddings.
    A new model object is constructed on every call.
    """
    model_options = {'device': 'cpu'}
    encode_options = {'normalize_embeddings': True}
    return HuggingFaceEmbeddings(
        model_name="jhgan/ko-sroberta-multitask",
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )

In [136]:
# transform chunks into vectors and add them to the vector store

def add_vectorstore(text_chunks, collection_name, ids, metadatas):
    """Embed `text_chunks` and add them to the given Qdrant collection.

    Args:
        text_chunks: texts to embed and store.
        collection_name: name of the target Qdrant collection.
        ids: point ids, passed through to `add_texts`.
        metadatas: per-chunk metadata, passed through to `add_texts`.

    Returns:
        The LangChain `Qdrant` vector store wrapping the notebook-level `client`.
    """
    store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=get_embedding(),
    )
    store.add_texts(text_chunks, ids=ids, metadatas=metadatas)
    return store

In [137]:
# Read the three source documents that will be chunked and indexed.
with open("/opt/qdrant/context.txt", 'rt', encoding='UTF8') as context_file:
    context = context_file.read()

with open("/opt/qdrant/code_rule.txt", 'rt', encoding='UTF8') as code_rule_file:
    code_rule = code_rule_file.read()

with open("/opt/qdrant/task.txt", 'rt', encoding='UTF8') as task_file:
    task = task_file.read()

In [138]:
# Chunk each document separately so every chunk can be tagged with its source.
chunks_1, chunks_2, chunks_3 = (
    get_chunks(document) for document in (context, code_rule, task)
)

In [139]:
# metadata for the LangChain method (passed to `add_texts(metadatas=...)`)
#
# Build one independent dict per chunk. `np.repeat` on a dict produced N
# references to a single shared dict, so mutating one entry would have
# changed every entry of that group.
meta_1 = [{"filter": "context"} for _ in chunks_1]
meta_2 = [{"filter": "code_rule"} for _ in chunks_2]
meta_3 = [{"filter": "task"} for _ in chunks_3]

In [149]:
# Flatten the per-document chunks and metadata in the same order
# (context, code_rule, task) and give every chunk a sequential integer id.
chunck_list = [*chunks_1, *chunks_2, *chunks_3]
metadatas = [*meta_1, *meta_2, *meta_3]
collection_name = "sample_server"
ids = list(range(len(chunck_list)))

In [150]:
# Embed all chunks and upload them; `vec` is the resulting vector store.
vec = add_vectorstore(
    text_chunks=chunck_list,
    collection_name=collection_name,
    ids=ids,
    metadatas=metadatas,
)

In [151]:
# metadata for Qdrant search method
#
# Sets a top-level `filter` key on every point so the plain Qdrant client
# can filter on it directly.
#
# Point ids are derived from the chunk counts instead of being hard-coded:
# ids were assigned sequentially over chunks_1 + chunks_2 + chunks_3, so each
# group owns the next `len(group)` ids. Hard-coded id lists silently mislabel
# points as soon as the input files produce a different number of chunks.
payload_result = None
next_id = 0
for label, chunk_group in (
    ("context", chunks_1),
    ("code_rule", chunks_2),
    ("task", chunks_3),
):
    group_ids = list(range(next_id, next_id + len(chunk_group)))
    next_id += len(chunk_group)
    if not group_ids:
        # Nothing was indexed for this document; skip the request.
        continue
    payload_result = client.set_payload(
        collection_name=collection_name,
        payload={'filter': label},
        points=group_ids,
    )

# Display the result of the last update, as the original cell did.
payload_result

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

##### **2. Qdrant Search**

In [152]:
# count the stored points (exact count) and list their ids
# (assumes ids are the sequential integers 0..count-1, as assigned at upload)

cnt = client.count(collection_name=collection_name, exact=True)
index = list(range(cnt.count))

In [153]:
# SELECT all points by id, including their embedding vectors
# (`with_vectors` defaults to False, which would return payloads only)

search_result = client.retrieve(collection_name=collection_name, ids=index, with_vectors=True)

search_result

[Record(id=0, payload={'filter': 'context', 'metadata': {'filter': 'context'}, 'page_content': 'This data is credit card data.\nThe information about the data is as follows:\n\n #   Column            Dtype  \n\n---  ------            -----'}, vector=[0.02601632, -0.04547918, 0.023858193, -0.07823035, -0.0119933775, -0.038667053, -0.0352143, 0.051805772, 0.040345397, -0.026753692, 0.083545744, 0.053498164, 0.0018694845, 0.04118108, -0.055670585, -0.003803508, 0.045027137, -0.040559188, 0.035644986, -0.062638305, -0.06650583, 0.06313125, -0.009619449, 0.035505746, 0.0028497837, -0.048543416, -0.015972482, -0.05202071, 0.007982194, -0.0056235585, 0.016014088, -0.0028079085, -0.05504412, -0.008358555, 0.027628211, -0.013164936, -0.04301963, -0.05519156, -0.01610246, -0.022001663, -0.040385086, 0.015920632, -0.09379461, 0.032807883, 0.024451468, -0.049829457, 0.0033194146, -0.015102072, -0.010416056, -0.009708907, 0.011674811, -0.03472952, 0.018320154, 0.024505889, -0.0060274294, -0.0004016

In [154]:
# build a Qdrant query filter

def get_filter(key, value):
    """Return a Qdrant `Filter` that requires payload field `key` to match `value`."""
    condition = models.FieldCondition(
        key=key,
        match=models.MatchValue(value=value),
    )
    return models.Filter(must=[condition])

In [155]:
# One filter per document type, all matching on the `filter` payload key.
code_rule_filter, context_filter, task_filter = (
    get_filter(key="filter", value=label)
    for label in ("code_rule", "context", "task")
)

In [156]:
# create dummy vector for test

def new_rule(size=768, seed=None):
    """Return a random query vector with components drawn uniformly from [-1, 1).

    Args:
        size: number of components; defaults to 768 to match the collection's
            configured vector size.
        seed: optional seed for a reproducible vector. When None, NumPy's
            global random state is used (the original behaviour).

    Returns:
        A plain Python list of `size` floats.
    """
    if seed is None:
        values = np.random.uniform(low=-1.0, high=1.0, size=size)
    else:
        values = np.random.default_rng(seed).uniform(low=-1.0, high=1.0, size=size)
    return values.tolist()

In [157]:
# search with a random query vector, restricted to `code_rule` points

client.search(
    collection_name=collection_name,
    query_filter=code_rule_filter,
    query_vector=new_rule(),
)

[ScoredPoint(id=9, version=2, score=0.023089273, payload={'filter': 'code_rule', 'metadata': {'filter': 'code_rule'}, 'page_content': '1.Text Start에서 TextEnd까지의 내용만 번역한다.\n\n2.""으로 감싸진 부분은 원본값을 유지한다.\n\n3."부산 동래구" 같은 데이터의 값들은 번역을 하지 않고 원본값을 유지한다.\n\n4.데이터의 설명중 원본 데이터 관한 내용은 원본값 그대로 유지한다.\n\n5.번역의 내용을 절대 축소하지 않고 텍스트 전체의 내용을 번역한다.\n\n1.모든 코드는 Python으로 작성이 된다.\n\n2.그래프 코드에 그래프를 저장하는 코드를 작성한다. 파일 위치는 지금 위치의 \'test_graph\'폴더에 \'png\'파일로 저장한다.\n\n3.그래프에 대한 간단한 설명을 작성하고 설명을 저장하는 코드를 작성한다. 파일위치는 지금 위치의 \'test_text\'폴더에 \'txt\'파일로 저장한다.\n\n4.데이터프레임을 생성할 필요는 없다.\n\n5.데이터가 로드되어 있는 상태고 그 데이터프레임 명은 df 이다.\n\n1. All code is written in Python.\n\n2. Write code to save the graph in the \'png\' file in the \'test_graph\' folder at the current location.\n\n3. Write code to save the description of the graph with a brief explanation. Save the file in the \'txt\' format in the \'test_text\' folder at the current location.\n\n4. There is no need to create a dataframe.\n\n5. The data is loaded and the datafr

In [71]:
# combine a payload filter, a minimum score, and a top-N limit;
# the stored vectors are returned along with each hit

client.search(
    collection_name=collection_name,
    query_filter=context_filter,
    query_vector=new_rule(),
    limit=5,
    score_threshold=0.01,
    with_vectors=True,
)

[ScoredPoint(id=8, version=17, score=0.044103608, payload={'filter': 'context', 'metadata': {'filter': 'context'}, 'page_content': "This bar graph illustrates the top industries by sales in Dongnae-gu, Busan, based on the provided credit card data.\nEach bar represents a different type of business, and the height of the bar indicates the total sales amount (in Korean Won, KRW) generated by that industry.\nThe x-axis denotes the type of business, while the y-axis represents the total sales amount.\n\nyou are a korean translator who lived in Korea for many years and knows well about korea's region and district name.\nPlease translate the following Korean question to English, but do not translate any proper nouns such as names of places, people, or specific terms. Leave those in Korean and include them in single quotes in the English translation.\n* example1\nkorean question:서초동에서 가장 비싼 동네는 어디야?\nenglish translation: which city is the most expensive in 서초동?\n* example2\nkorean question:서울

##### **3. Qdrant Retriever**

In [158]:
# llm definition

def load_llm():
    """Return the OpenAI LLM, authenticated with the key from `config`.

    `OpenAI` is resolved when the function is called, so it must have been
    imported by then.
    """
    # Local alternative kept for reference:
    #   CTransformers(model="TheBloke/Llama-2-7B-Chat-GGML",
    #                 model_type="llama", temperature=0.2)
    return OpenAI(openai_api_key=config['OPENAI_API_KEY'])

In [167]:
# plug the vector store to retrieval chain

from langchain.chains import RetrievalQA
from langchain_openai import OpenAI

def qdrant_qa_response(task, collection_name):
    """Answer `task` with a RetrievalQA chain backed by a Qdrant collection.

    Args:
        task: the question to ask.
        collection_name: Qdrant collection to retrieve context from.

    Returns:
        The chain's response; in this notebook it is a dict with the keys
        'query', 'result' and 'source_documents'.
    """
    # Use the `QdrantClient` class imported at the top of the notebook; the
    # bare module name `qdrant_client` is never imported, so
    # `qdrant_client.QdrantClient` raises NameError on a fresh kernel.
    qdrant = QdrantClient(
        os.getenv("QDRANT_HOST"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    vectordb = Qdrant(
        client=qdrant,
        collection_name=collection_name,
        embeddings=get_embedding(),
    )
    # Build the LLM once (it was previously constructed twice, with the first
    # instance discarded).
    llm = load_llm()
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type='stuff',
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        # a custom prompt can be supplied via chain_type_kwargs={'prompt': ...}
    )

    return qa.invoke({'query': task})

In [160]:
# Similarity search with score by filter

def qdrant_similarity_search(task, collection_name, filter, k=4, score_threshold=None):
    """Run a filtered similarity search and return the hits with their scores.

    Args:
        task: query text to embed and search for.
        collection_name: Qdrant collection to search.
        filter: filter passed through to the vector store (the notebook
            passes filters built by `get_filter`).
        k: maximum number of results; 4 is assumed to equal the vector
            store's own default, so existing callers are unaffected.
        score_threshold: optional minimum score; None applies no threshold.

    Returns:
        The result of `similarity_search_with_score`; in this notebook it is
        a list of (Document, score) pairs.
    """
    # Use the `QdrantClient` class imported at the top of the notebook; the
    # bare module name `qdrant_client` is never imported, so
    # `qdrant_client.QdrantClient` raises NameError on a fresh kernel.
    qdrant = QdrantClient(
        os.getenv("QDRANT_HOST"),
        api_key=os.getenv("QDRANT_API_KEY"),
    )
    vectordb = Qdrant(
        client=qdrant,
        collection_name=collection_name,
        embeddings=get_embedding(),
    )
    return vectordb.similarity_search_with_score(
        task,
        k=k,
        filter=filter,
        score_threshold=score_threshold,
    )

In [161]:
# Sample questions (Korean).
# query_1 roughly: "Which values are not translated?"
# query_2 roughly: "Explain the commercial-district analysis data in Korean."
query_1, query_2 = (
    "번역하지 않는 값은 무엇인가?",
    "상권분석 데이터를 한국어로 설명해줘",
)

In [162]:
# Scored similarity search for query_2, restricted to `context` points.
qdrant_similarity_search(
    task=query_2,
    collection_name=collection_name,
    filter=context_filter,
)

[(Document(page_content="This bar graph illustrates the top industries by sales in Dongnae-gu, Busan, based on the provided credit card data.\nEach bar represents a different type of business, and the height of the bar indicates the total sales amount (in Korean Won, KRW) generated by that industry.\nThe x-axis denotes the type of business, while the y-axis represents the total sales amount.\n\nyou are a korean translator who lived in Korea for many years and knows well about korea's region and district name.\nPlease translate the following Korean question to English, but do not translate any proper nouns such as names of places, people, or specific terms. Leave those in Korean and include them in single quotes in the English translation.\n* example1\nkorean question:서초동에서 가장 비싼 동네는 어디야?\nenglish translation: which city is the most expensive in 서초동?\n* example2\nkorean question:서울에 왜 가고싶어 ?\nenglish translation: why do you want to go to 서울?\n* example3\nkorean question:맥도날드 언제 갈꺼야?\nen

In [163]:
# Ask query_1 through the retrieval QA chain (no payload filter applied).
qdrant_qa_response(task=query_1, collection_name=collection_name)

{'query': '번역하지 않는 값은 무엇인가?',
 'result': ' "부산 동래구" 같은 데이터의 값들은 번역을 하지 않고 원본값을 유지한다.',
 'source_documents': [Document(page_content='1.Text Start에서 TextEnd까지의 내용만 번역한다.\n\n2.""으로 감싸진 부분은 원본값을 유지한다.\n\n3."부산 동래구" 같은 데이터의 값들은 번역을 하지 않고 원본값을 유지한다.\n\n4.데이터의 설명중 원본 데이터 관한 내용은 원본값 그대로 유지한다.\n\n5.번역의 내용을 절대 축소하지 않고 텍스트 전체의 내용을 번역한다.\n\n1.모든 코드는 Python으로 작성이 된다.\n\n2.그래프 코드에 그래프를 저장하는 코드를 작성한다. 파일 위치는 지금 위치의 \'test_graph\'폴더에 \'png\'파일로 저장한다.\n\n3.그래프에 대한 간단한 설명을 작성하고 설명을 저장하는 코드를 작성한다. 파일위치는 지금 위치의 \'test_text\'폴더에 \'txt\'파일로 저장한다.\n\n4.데이터프레임을 생성할 필요는 없다.\n\n5.데이터가 로드되어 있는 상태고 그 데이터프레임 명은 df 이다.\n\n1. All code is written in Python.\n\n2. Write code to save the graph in the \'png\' file in the \'test_graph\' folder at the current location.\n\n3. Write code to save the description of the graph with a brief explanation. Save the file in the \'txt\' format in the \'test_text\' folder at the current location.\n\n4. There is no need to create a dataframe.\n\n5. The data is loaded and the dataframe