In [133]:
#from qdrant_client import models, QdrantClient
import qdrant_client
#from langchain.vectorstores import Qdrant
from langchain_community.vectorstores import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

import os
import numpy as np
import pandas as pd
import openai
import tiktoken
import uuid
import json

In [21]:
# Load service credentials from the local JSON config file.
with open('c:/Program Files/Git/config.json', mode='r', encoding='utf8') as f:
    config = json.load(f)

# Export each credential as an environment variable for the cells below.
for env_key in ('QDRANT_HOST', 'QDRANT_API_KEY', 'OPENAI_API_KEY'):
    os.environ[env_key] = config[env_key]

In [22]:
# Connect to the Qdrant instance whose host and API key were exported above.

client = qdrant_client.QdrantClient(
    os.environ.get("QDRANT_HOST"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)

##### **1. Qdrant Create**

In [203]:
# Create the collection, replacing any existing one with the same name.

os.environ['QDRANT_COLLECTION_NAME'] = "sample"

# The vector size must equal the embedding model's output dimension:
# 1536 for OpenAI, 768 for Gemini/HuggingFace/instructor-xl.
collection_config = qdrant_client.http.models.VectorParams(
    size=768,
    distance=qdrant_client.http.models.Distance.COSINE,
)

client.recreate_collection(
    collection_name=os.environ.get('QDRANT_COLLECTION_NAME'),
    vectors_config=collection_config,
)

True

In [74]:
def tiktoken_len(text):
    """Return the number of ``cl100k_base`` tokens that ``text`` encodes to."""
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

In [75]:
# split raw text into token-sized chunks (these are added to the vector database later)

from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_chunks(text):
    """Split ``text`` into overlapping chunks.

    The splitter is configured with ``chunk_size=900`` and
    ``chunk_overlap=200``; both are measured with ``tiktoken_len`` (tokens),
    not characters.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        length_function=tiktoken_len,
        chunk_overlap=200,
        chunk_size=900,
    )
    return splitter.split_text(text)

In [76]:
# embedding model definition(be aware of dimension size!!!)

def get_embedding():
    """Return a ``HuggingFaceEmbeddings`` for ``jhgan/ko-sroberta-multitask``.

    The model is configured to run on CPU and to normalize its embeddings.
    """
    return HuggingFaceEmbeddings(
        model_name="jhgan/ko-sroberta-multitask",
        model_kwargs=dict(device='cpu'),
        encode_kwargs=dict(normalize_embeddings=True),
    )

In [242]:
# embed the chunks and add them to the vector store

def get_vectorstore(text_chunks, collection_name, ids, metadatas):
    """Wrap the collection in a LangChain ``Qdrant`` store and add the chunks.

    Args:
        text_chunks: Texts to embed and store.
        collection_name: Name of the Qdrant collection to write to.
        ids: Point ids, passed through to ``add_texts``.
        metadatas: Per-chunk metadata, passed through to ``add_texts``.

    Returns:
        The ``Qdrant`` vector store after the texts have been added.
    """
    # Uses the notebook-level `client`; a fresh embedding model is built per call.
    store = Qdrant(
        client=client,
        collection_name=collection_name,
        embeddings=get_embedding(),
    )
    store.add_texts(text_chunks, ids=ids, metadatas=metadatas)
    return store

In [120]:
# # upsert vectors into Qdrant 
# # id=indexing method 
# # metadata= payload {key:value}

# def upsert_vectorstore(text_chunks, vectorstore, ids, metadatas):
#     vectorstore.add_texts(text_chunks
#                         ,ids=ids
#                         , metadatas=metadatas
#                         )
#     return vectorstore

In [110]:
# Read the three prompt source files (UTF-8 text) into memory.
with open(
    "C:/Users/astri/Downloads/비씨카드 리뉴얼/sample_prompt/qdrant/context.txt",
    mode='rt',
    encoding='UTF8',
) as f:
    context = f.read()

with open(
    "C:/Users/astri/Downloads/비씨카드 리뉴얼/sample_prompt/qdrant/code_rule.txt",
    mode='rt',
    encoding='UTF8',
) as f:
    code_rule = f.read()

with open(
    "C:/Users/astri/Downloads/비씨카드 리뉴얼/sample_prompt/qdrant/task.txt",
    mode='rt',
    encoding='UTF8',
) as f:
    task = f.read()

In [124]:
# Chunk each source document separately so its chunks can be labelled per file.
chunks_1, chunks_2, chunks_3 = [
    get_chunks(document) for document in (context, code_rule, task)
]

In [299]:
# for Qdrant search method: one plain label string per chunk
# (list repetition yields built-in `str` items; np.repeat would produce
# numpy string scalars instead)

meta_1 = ["context"] * len(chunks_1)
meta_2 = ["code_rule"] * len(chunks_2)
meta_3 = ["task"] * len(chunks_3)

In [306]:
# for LangChain invoke method: one metadata dict per chunk
# (the comprehension builds an independent dict for every chunk; np.repeat
# would repeat a reference to one shared dict, so editing one entry would
# silently edit them all)

meta_1 = [{"filter": "context"} for _ in chunks_1]
meta_2 = [{"filter": "code_rule"} for _ in chunks_2]
meta_3 = [{"filter": "task"} for _ in chunks_3]

In [307]:
# Flatten chunks and metadata in the same file order so that
# chunck_list[i] is described by metadatas[i].
# `chunck_list` must be defined here: the next cell reads it.
chunck_list = chunks_1 + chunks_2 + chunks_3
metadatas = meta_1 + meta_2 + meta_3

In [308]:
# Store every chunk under a sequential integer id (0 .. N-1).
collection_name = "sample"
ids = [position for position, _ in enumerate(chunck_list)]
vec = get_vectorstore(
    text_chunks=chunck_list,
    collection_name=collection_name,
    ids=ids,
    metadatas=metadatas,
)

In [318]:
# Add a top-level `filter` payload key to the "task" points; this is the key
# the Qdrant filters used in the search section match on.
# Assumes ids were assigned 0..N-1 in the order chunks_1, chunks_2, chunks_3
# (the order used for `metadatas`), so the task chunks take the last
# len(chunks_3) ids. Deriving the ids instead of hard-coding them keeps the
# labels correct when the source files (and chunk counts) change.
task_first_id = len(chunks_1) + len(chunks_2)
client.set_payload(
    collection_name="sample",
    payload={'filter': 'task'},
    points=list(range(task_first_id, task_first_id + len(chunks_3)))
)

UpdateResult(operation_id=16, status=<UpdateStatus.COMPLETED: 'completed'>)

In [319]:
# Add a top-level `filter` payload key to the "code_rule" points.
# Assumes ids were assigned 0..N-1 in the order chunks_1, chunks_2, chunks_3
# (the order used for `metadatas`), so the code_rule chunks start right after
# the len(chunks_1) context chunks. Deriving the ids instead of hard-coding
# them keeps the labels correct when chunk counts change.
code_rule_first_id = len(chunks_1)
client.set_payload(
    collection_name="sample",
    payload={'filter': 'code_rule'},
    points=list(range(code_rule_first_id, code_rule_first_id + len(chunks_2)))
)

UpdateResult(operation_id=17, status=<UpdateStatus.COMPLETED: 'completed'>)

In [320]:
# Add a top-level `filter` payload key to the "context" points.
# Assumes ids were assigned 0..N-1 in the order chunks_1, chunks_2, chunks_3
# (the order used for `metadatas`), so the context chunks take the first
# len(chunks_1) ids. Deriving the ids instead of hard-coding them keeps the
# labels correct when chunk counts change.
client.set_payload(
    collection_name="sample",
    payload={'filter': 'context'},
    points=list(range(len(chunks_1)))
)

UpdateResult(operation_id=18, status=<UpdateStatus.COMPLETED: 'completed'>)

##### **2. Qdrant Search**

In [309]:
# Count the stored points exactly, then list the ids 0 .. count-1.

cnt = client.count(collection_name="sample", exact=True)
index = list(range(cnt.count))

In [321]:
# Fetch every point by id (SQL analogy: SELECT *), payload only.

client.retrieve(
    collection_name="sample",
    with_vectors=False,  # the default is False
    ids=index,
)

[Record(id=0, payload={'filter': 'context', 'metadata': {'filter': 'context'}, 'page_content': 'This data is credit card data.\nThe information about the data is as follows:\n\n #   Column            Dtype  \n\n---  ------            -----'}, vector=None, shard_key=None),
 Record(id=1, payload={'filter': 'context', 'metadata': {'filter': 'context'}, 'page_content': "0   store_id          int64  \n 1   card_id           int64  \n 2   card_company      object \n 3   transacted_date   object \n 4   transacted_time   object \n 5   installment_term  int64  \n 6   region            object \n 7   type_of_business  object \n 8   amount            float64\nBelow are additional details about each column.\nDate columns are in the format YYYY-MM-DD.\nTime columns are in the format HH:MM.\nThe values \u200b\u200bin the region column are as follows.\narray(['부산 동래구', '서울 종로구', '대구 수성구', '경기 용인시', '경기 안양시', '경기 수원시',\n       '서울 마포구', '부산 부산진구', '서울 중랑구', '서울 용산구', '전남 목포시', '서울 동작구',\n       '경북 경주시

In [194]:
def get_filter(key, value):
    """Build a Qdrant filter with a single ``must`` condition.

    Args:
        key: Payload field name to test.
        value: Value the field is matched against (via ``MatchValue``).

    Returns:
        A ``Filter`` whose only ``must`` clause matches ``key`` to ``value``.
    """
    # Reach the model classes through `qdrant_client.http.models`, the path
    # this notebook already uses for VectorParams; a bare `models` name is
    # never imported. The local name also no longer shadows builtin `filter`.
    condition = qdrant_client.http.models.FieldCondition(
        key=key,
        match=qdrant_client.http.models.MatchValue(value=value),
    )
    return qdrant_client.http.models.Filter(must=[condition])

In [322]:

# One filter per document category, each matching the top-level `filter`
# payload key. Built in a single comprehension instead of three copies, and
# via `qdrant_client.http.models` because a bare `models` name is never
# imported in this notebook.
code_rule_filter, context_filter, task_filter = [
    qdrant_client.http.models.Filter(
        must=[
            qdrant_client.http.models.FieldCondition(
                key="filter",
                match=qdrant_client.http.models.MatchValue(value=category),
            )
        ]
    )
    for category in ("code_rule", "context", "task")
]

In [173]:
# create dummy vector

def new_rule(size=768):
    """Return a random dummy query vector.

    Args:
        size: Number of components. Defaults to 768, the vector size this
            notebook's collection is created with.

    Returns:
        A plain ``list`` of ``size`` floats drawn uniformly between -1.0 and 1.0.
    """
    return np.random.uniform(low=-1.0, high=1.0, size=size).tolist()

In [323]:
# Filtered search (SQL analogy: WHERE filter = "code_rule") with a dummy query vector.

client.search(
    collection_name="sample",
    query_vector=new_rule(),
    query_filter=code_rule_filter,
)

[ScoredPoint(id=9, version=17, score=0.009048067, payload={'filter': 'code_rule', 'metadata': {'filter': 'code_rule'}, 'page_content': '1.Text Start에서 TextEnd까지의 내용만 번역한다.\n\n2.""으로 감싸진 부분은 원본값을 유지한다.\n\n3."부산 동래구" 같은 데이터의 값들은 번역을 하지 않고 원본값을 유지한다.\n\n4.데이터의 설명중 원본 데이터 관한 내용은 원본값 그대로 유지한다.\n\n5.번역의 내용을 절대 축소하지 않고 텍스트 전체의 내용을 번역한다.\n\n1.모든 코드는 Python으로 작성이 된다.\n\n2.그래프 코드에 그래프를 저장하는 코드를 작성한다. 파일 위치는 지금 위치의 \'test_graph\'폴더에 \'png\'파일로 저장한다.\n\n3.그래프에 대한 간단한 설명을 작성하고 설명을 저장하는 코드를 작성한다. 파일위치는 지금 위치의 \'test_text\'폴더에 \'txt\'파일로 저장한다.\n\n4.데이터프레임을 생성할 필요는 없다.\n\n5.데이터가 로드되어 있는 상태고 그 데이터프레임 명은 df 이다.\n\n1. All code is written in Python.\n\n2. Write code to save the graph in the \'png\' file in the \'test_graph\' folder at the current location.\n\n3. Write code to save the description of the graph with a brief explanation. Save the file in the \'txt\' format in the \'test_text\' folder at the current location.\n\n4. There is no need to create a dataframe.\n\n5. The data is loaded and the dataf

In [324]:
# Filtered search over the "context" points, with score_threshold=0.01.

client.search(
    collection_name="sample",
    query_vector=new_rule(),
    query_filter=context_filter,
    score_threshold=0.01,
)

[ScoredPoint(id=8, version=18, score=0.041703954, payload={'filter': 'context', 'metadata': {'filter': 'context'}, 'page_content': "This bar graph illustrates the top industries by sales in Dongnae-gu, Busan, based on the provided credit card data.\nEach bar represents a different type of business, and the height of the bar indicates the total sales amount (in Korean Won, KRW) generated by that industry.\nThe x-axis denotes the type of business, while the y-axis represents the total sales amount.\n\nyou are a korean translator who lived in Korea for many years and knows well about korea's region and district name.\nPlease translate the following Korean question to English, but do not translate any proper nouns such as names of places, people, or specific terms. Leave those in Korean and include them in single quotes in the English translation.\n* example1\nkorean question:서초동에서 가장 비싼 동네는 어디야?\nenglish translation: which city is the most expensive in 서초동?\n* example2\nkorean question:서울

In [325]:
# Keep the filtered "code_rule" hits in a variable instead of displaying them.
result = client.search(
    collection_name="sample",
    query_filter=code_rule_filter,
    query_vector=new_rule(),
)

In [326]:
# Inspect retrieval quality: similarity scores for a real text query, no filter.

query = "상권분석"
found_docs = vec.similarity_search_with_score(query=query)
found_docs

[(Document(page_content="This bar graph illustrates the top industries by sales in Dongnae-gu, Busan, based on the provided credit card data.\nEach bar represents a different type of business, and the height of the bar indicates the total sales amount (in Korean Won, KRW) generated by that industry.\nThe x-axis denotes the type of business, while the y-axis represents the total sales amount.\n\nyou are a korean translator who lived in Korea for many years and knows well about korea's region and district name.\nPlease translate the following Korean question to English, but do not translate any proper nouns such as names of places, people, or specific terms. Leave those in Korean and include them in single quotes in the English translation.\n* example1\nkorean question:서초동에서 가장 비싼 동네는 어디야?\nenglish translation: which city is the most expensive in 서초동?\n* example2\nkorean question:서울에 왜 가고싶어 ?\nenglish translation: why do you want to go to 서울?\n* example3\nkorean question:맥도날드 언제 갈꺼야?\nen

In [327]:
# Same query as above, restricted to the "context" points via the Qdrant filter.

query = "상권분석"
found_docs_2 = vec.similarity_search_with_score(query=query, filter=context_filter)
found_docs_2

[(Document(page_content="This bar graph illustrates the top industries by sales in Dongnae-gu, Busan, based on the provided credit card data.\nEach bar represents a different type of business, and the height of the bar indicates the total sales amount (in Korean Won, KRW) generated by that industry.\nThe x-axis denotes the type of business, while the y-axis represents the total sales amount.\n\nyou are a korean translator who lived in Korea for many years and knows well about korea's region and district name.\nPlease translate the following Korean question to English, but do not translate any proper nouns such as names of places, people, or specific terms. Leave those in Korean and include them in single quotes in the English translation.\n* example1\nkorean question:서초동에서 가장 비싼 동네는 어디야?\nenglish translation: which city is the most expensive in 서초동?\n* example2\nkorean question:서울에 왜 가고싶어 ?\nenglish translation: why do you want to go to 서울?\n* example3\nkorean question:맥도날드 언제 갈꺼야?\nen

##### **3. Qdrant retriever**

In [216]:
# model definition TBC

def load_llm():
    """Return an ``OpenAI`` LLM built with the API key from ``config``."""
    # Alternative left disabled by the author: CTransformers with
    # model="TheBloke/Llama-2-7B-Chat-GGML", model_type="llama", temperature=0.2.
    api_key = config['OPENAI_API_KEY']
    return OpenAI(openai_api_key=api_key)

In [292]:
# plug the vector store to your retrieval chain

from langchain.chains import RetrievalQA, VectorDBQA
from langchain_openai import OpenAI

# Build a "stuff"-type RetrievalQA chain over the Qdrant-backed retriever.
# A custom prompt (chain_type_kwargs={'prompt': prompt}) is not wired in yet.
qa = RetrievalQA.from_chain_type(
    llm=load_llm(),
    retriever=vec.as_retriever(),
    chain_type="stuff",
    return_source_documents=False,
)

In [328]:
# Sample question (Korean): "Which values are left untranslated?"
query_1 = "번역하지 않는 값은 무엇인가?"
# Run the QA chain on the question and print the returned response.
response = qa.invoke(query_1)
print(response)

{'query': '번역하지 않는 값은 무엇인가?', 'result': ' "부산 동래구" 같은 데이터의 값들은 번역을 하지 않고 원본값을 유지한다.'}


In [289]:
# Sample question (Korean): "Explain, in Korean, the data needed for
# commercial-district analysis."
query_2 = "상권분석에 필요한 데이터를 한국어로 설명해줘"
# Run the QA chain on the question and print the returned response.
response = qa.invoke(query_2)
print(response)

{'query': '상권분석에 필요한 데이터를 한국어로 설명해줘', 'result': '\n이 프로젝트는 한국의 지역별 데이터를 활용하여 상권 분석을 하는 것을 목표로 합니다. 데이터는 신용카드 데이터로, 각 지역의 업종별 매출을 나타내는 바 그래프와 카드 회사별 2017년 매출 추이를 나타내는 선 그래프가 포함됩니다. 또한, 데이터를 토대로 시간대별 업종별 매출 비교를 할 수 있습니다. 프로젝트 결과물로는 각 지역의 매출 상위 10개 업종을 비교하는 바 그래프와, 각 카드 회사의 매출 추이를 비교하는 선 그래프가 포함됩니다. 모든 코드는 파이썬으로 작성되며, 결과물로는 그래프와 설명이'}
