In [None]:
# install required packages
!pip install dashvector dashscope
!pip install transformers_stream_generator python-dotenv

In [None]:
import dashscope
import os
from dotenv import load_dotenv
from dashscope import TextEmbedding
from dashvector import Client, Doc

In [None]:
# get env variables from .env
# please make sure DASHSCOPE_KEY and DASHVECTOR_KEY are defined in .env
load_dotenv()

# Credentials must never be hardcoded in the notebook: read them from the
# environment and fail fast with a clear message when one is missing.
dashscope_key = os.getenv('DASHSCOPE_KEY')
dashvector_key = os.getenv('DASHVECTOR_KEY')
if not dashscope_key or not dashvector_key:
    raise RuntimeError(
        'DASHSCOPE_KEY and DASHVECTOR_KEY must be set in the environment or in .env'
    )

dashscope.api_key = dashscope_key
# initialize DashVector for embedding's indexing and searching
dashvector_client = Client(api_key=dashvector_key)
# define collection name
collection_name = 'news_embeddings'

# delete if already exist (WARNING: this wipes any previously indexed data)
dashvector_client.delete(collection_name)
# create a collection with embedding size of 1536
rsp = dashvector_client.create(collection_name, 1536)
if not rsp:
    # a failed response is falsy; stop here instead of failing later on a missing collection
    raise RuntimeError(f'failed to create collection {collection_name!r}: {rsp}')
collection = dashvector_client.get(collection_name)

In [None]:
def prepare_data_from_dir(path, size):
    """Yield the text of every file in ``path`` in batches of ``size`` documents.

    Files are visited in sorted name order so that the ids assigned by the
    caller are reproducible between runs (``os.listdir`` order is arbitrary).
    Sub-directories, including ``.ipynb_checkpoints``, are skipped.

    Args:
        path: directory containing one UTF-8 text document per file.
        size: number of documents per yielded batch; the final batch may be
            smaller.

    Yields:
        list[str]: the raw contents of up to ``size`` files.
    """
    batch_docs = []
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # skip notebook checkpoints and anything that is not a regular file
        if file == '.ipynb_checkpoints' or not os.path.isfile(file_path):
            continue
        # read and close the file before yielding, so no handle stays open
        # while the generator is suspended
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        batch_docs.append(text)
        if len(batch_docs) == size:
            # yield a copy because the working list is cleared right after
            yield batch_docs[:]
            batch_docs.clear()

    if batch_docs:
        yield batch_docs

In [None]:
def prepare_data_from_file(path, size, chunk_size=12):
    """Split one text file into documents and yield them in batches.

    Non-blank lines are grouped into documents of ``chunk_size`` lines each;
    blank lines are ignored. A trailing group with fewer than ``chunk_size``
    lines is kept as a final, shorter document instead of being dropped.

    Args:
        path: path of the UTF-8 text file to read.
        size: number of documents per yielded batch; the final batch may be
            smaller.
        chunk_size: number of non-blank lines per document (default 12).

    Yields:
        list[str]: up to ``size`` documents, each the concatenation of its
        lines (line endings preserved).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1')

    batch_docs = []
    chunk_lines = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            chunk_lines.append(line)
            if len(chunk_lines) == chunk_size:
                batch_docs.append(''.join(chunk_lines))
                chunk_lines = []
                if len(batch_docs) == size:
                    # yield a copy because the working list is cleared right after
                    yield batch_docs[:]
                    batch_docs.clear()

    # flush the last, partially filled document so the file's tail is not lost
    if chunk_lines:
        batch_docs.append(''.join(chunk_lines))

    if batch_docs:
        yield batch_docs

In [None]:
def generate_embeddings(docs):
    """Create embeddings via DashScope's TextEmbedding model API.

    Args:
        docs: a single text, or a list of texts, to embed.

    Returns:
        A list of embedding vectors when ``docs`` is a list, otherwise the
        single embedding vector for the given text.

    Raises:
        RuntimeError: if the embedding request did not succeed.
    """
    from http import HTTPStatus

    rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1,
                             input=docs)
    # On failure the response carries no output; report the service error
    # instead of failing later with an opaque TypeError.
    if rsp.status_code != HTTPStatus.OK or not rsp.output:
        raise RuntimeError(
            f'embedding request failed: status={rsp.status_code}, '
            f'code={rsp.code}, message={rsp.message}'
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(docs, list) else embeddings[0]

In [None]:
#  !git clone https://github.com/jsonzhuwei/gasgootest.git
dir_name = 'gasgootest/companytext'

# indexing the raw docs with index to DashVector
collection = dashvector_client.get(collection_name)

# number of docs sent to the embedding API per request
# (must not exceed the embedding API's maximum batch size)
batch_size = 4

# running document id; deliberately not called `id` so the builtin id() is
# not shadowed for the rest of the notebook
next_id = 0

# consume the generator lazily: only one batch of raw docs is held in memory
for news in prepare_data_from_dir(dir_name, batch_size):
    ids = range(next_id, next_id + len(news))
    next_id += len(news)
    # generate embedding from raw docs
    vectors = generate_embeddings(news)
    # upsert and index
    ret = collection.upsert(
        [
            Doc(id=str(doc_id), vector=vector, fields={"raw": doc})
            for doc_id, doc, vector in zip(ids, news, vectors)
        ]
    )
    print(ret)
    if not ret:
        # a failed response is falsy; stop rather than leave a partial index unnoticed
        raise RuntimeError(f'upsert failed for doc ids {ids[0]}..{ids[-1]}: {ret}')

# check the collection status
collection = dashvector_client.get(collection_name)
rsp = collection.stats()
print(rsp)

In [None]:
def search_relevant_context(question, topk=3, client=dashvector_client):
    """Recall the stored texts most similar to ``question``.

    The question is embedded with ``generate_embeddings`` and used to query
    the collection named ``collection_name`` through ``client``.

    Args:
        question: the query text.
        topk: number of nearest documents to recall.
        client: DashVector client used to look up the collection.

    Returns:
        str: the ``raw`` field of every recalled document, concatenated
        without a separator.
    """
    target_collection = client.get(collection_name)

    # embed the question, then ask DashVector for its nearest neighbours
    question_vector = generate_embeddings(question)
    result = target_collection.query(
        question_vector,
        output_fields=['raw'],
        topk=topk,
    )

    raw_texts = (hit.fields['raw'] for hit in result.output)
    return "".join(raw_texts)

In [None]:
# query the top 3 results: recall the 3 most similar docs for the question
# and print their concatenated raw text
question = '注册地址在上海的有哪几家公司？'
context = search_relevant_context(question, topk=3)
print(context)

In [None]:
# initialize qwen 7B model
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import GenerationConfig

# single source of truth for the checkpoint, so tokenizer, weights and
# generation config are always loaded from the same model id and revision
MODEL_ID = "qwen/Qwen-7B-Chat"
MODEL_REVISION = 'v1.0.5'

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, revision=MODEL_REVISION, device_map="auto", trust_remote_code=True, fp16=True).eval()
# different generation lengths, top_p and other hyperparameters can be specified here
model.generation_config = GenerationConfig.from_pretrained(MODEL_ID, revision=MODEL_REVISION, trust_remote_code=True)

In [None]:
# define a prompt template for the vectorDB-enhanced LLM generation
def answer_question(question, context):
    """Ask the chat model ``question``, optionally grounded on ``context``.

    When ``context`` is non-empty the prompt instructs the model to answer
    from the text fenced by triple backticks. When it is empty or blank the
    question is sent on its own, so the model is not told to rely on an empty
    block. The prompt is printed before the model is called.

    Args:
        question: the user question; trailing question marks are normalised
            so the prompt ends with exactly one.
        context: reference text recalled from the vector store, or ''.

    Returns:
        The response text produced by ``model.chat``.
    """
    # strip surrounding whitespace and any trailing '?'/'？' so that the
    # template adds exactly one question mark
    question_text = question.strip().rstrip('?？')

    if context and context.strip():
        prompt = (
            '请基于```内的内容回答问题。\n'
            '```\n'
            f'{context}\n'
            '```\n'
            f'我的问题是：{question_text}？'
        )
    else:
        prompt = f'{question_text}？'

    print(prompt)
    # every call is single-turn: no chat history is passed in, and the
    # returned history is discarded
    response, _ = model.chat(tokenizer, prompt, history=None)
    return response

In [None]:
# baseline run: ask the LLM with an empty context, i.e. without vectorDB enhancement
question = '注册地址在上海的有哪几家公司'
answer = answer_question(question, context='')
print(f'question: {question}\nanswer: {answer}')

In [None]:
# knowledge-enhanced run: recall the 3 most relevant docs and pass them as context
context = search_relevant_context(question, topk=3)
answer = answer_question(question=question, context=context)
print(f'question: {question}\nanswer: {answer}')