# install & import

## install


In [None]:
!pip install rank_bm25

In [None]:
!pip install cohere

In [None]:
!pip install langchain_community
!pip install chromadb
!pip install sentence_transformers
!pip install langchain
!pip install langchain langchainhub httpx_sse
!pip install langchain_openai
!pip install azure-ai-documentintelligence
!pip install pypdf
!pip install flashrank
!pip install torch
!pip install cohere
!pip install llama-index FlagEmbedding
!pip install llama-index-postprocessor-flag-embedding-reranker

In [None]:
pip install zai-sdk

## import


In [None]:
pip install "langchain==0.1.20" "langchain-community==0.0.38" --upgrade


In [None]:
pip uninstall -y langchain-openai


In [None]:
import zai
print(zai.__version__)

In [None]:
import os
import csv
import concurrent.futures

from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    AzureAIDocumentIntelligenceLoader,
)

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from langchain_community.vectorstores import Chroma

from langchain.chat_models import ChatOpenAI

from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from langchain.chains import LLMChain

from langchain.memory import ConversationBufferMemory

from langchain.schema import Document

from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode

from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever


In [None]:
from zai import ZhipuAiClient

In [None]:
#pip show langchain

# RAG

## DATA FOLDER/ AZURE KEY


In [None]:
# Folder scanned by text_split(); the same string is later used as the stem of
# the CSV file that stores the generated questions.
folder_path = "700data"
# Azure AI Document Intelligence endpoint (overridable through the environment).
endpoint = os.environ.get(
    "AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT",
    "https://1-297.cognitiveservices.azure.com/",
)
# The credential is read from the environment so no secret lives in the
# notebook. NOTE(review): the key that used to be hard-coded here has been
# exposed in source and should be rotated.
key = os.environ.get("AZURE_DOCUMENT_INTELLIGENCE_KEY")

## Split documents with a recursive text splitter, attach metadata to each chunk, and vectorize with a Hugging Face embedding model
## Supports .pdf, .docx and .txt files


In [None]:
def text_split(folder_path):
    """Load every supported file in ``folder_path`` and split it into chunks.

    ``.docx`` files are loaded with ``AzureAIDocumentIntelligenceLoader``
    (using the module-level ``endpoint`` and ``key``), ``.pdf`` files with
    ``PyPDFLoader`` and ``.txt`` files with ``TextLoader``. Files with any
    other extension are skipped. Only direct entries of the folder are read.

    Args:
        folder_path: Directory containing the source documents.

    Returns:
        A list of ``Document`` objects, one per chunk. Each carries the
        metadata ``"source"`` (file name without its final extension) and
        ``"index"`` (running chunk number across all files, as a string).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=[
            "\n",
            "\u3002",  # CJK full stop
            "",
        ],
        chunk_size=960,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
    )

    all_docs = []
    n = 0

    # os.listdir() returns entries in arbitrary order; sorting keeps the chunk
    # indices reproducible between runs, so a saved question CSV and a
    # persisted vector store built in different sessions still agree.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        lowered = filename.lower()  # accept ".PDF", ".Docx", ...

        if lowered.endswith(".docx"):
            documents = AzureAIDocumentIntelligenceLoader(
                api_endpoint=endpoint,
                api_key=key,
                file_path=file_path,
                api_model="prebuilt-layout",
            ).load()
        elif lowered.endswith(".pdf"):
            documents = PyPDFLoader(file_path).load()
        elif lowered.endswith(".txt"):
            documents = TextLoader(file_path).load()
        else:
            continue

        # Strip only the final extension: "a.b.pdf" -> "a.b". Splitting on the
        # first dot would let "a.b.pdf" and "a.c.pdf" share the source "a".
        source = os.path.splitext(filename)[0]

        for doc in documents:
            for chunk in text_splitter.split_text(doc.page_content):
                all_docs.append(
                    Document(
                        page_content=chunk,
                        metadata={"source": source, "index": str(n)},
                    )
                )
                n += 1

    return all_docs

In [None]:
# Embedding model used for every vector-store operation in this notebook.
model_name = "bge-large-zh-v1.5"
# Run on CPU and L2-normalize the produced vectors.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
# Chunk every supported file under folder_path; the resulting list feeds the
# Chroma store and the BM25 retriever further down.
all_docs = text_split(folder_path)
# all_docs[1]

In [None]:
# Spot-check the third chunk (raises IndexError when fewer than 3 chunks exist).
print(all_docs[2])

In [None]:
# Show the "source" metadata attached to the fourth chunk.
all_docs[3].metadata['source']

In [None]:
# Total number of chunks produced by text_split().
len(all_docs)


### Save to a local Chroma persist directory

In [None]:
# Directory passed to Chroma as its on-disk persistence location.
persist_directory = "./chroma_db"

# NOTE(review): `pa` is not referenced anywhere else in the visible notebook —
# confirm it can be removed.
pa = ":/luanlai"

def w_v_db(doc, embed, pd):
    """Build a Chroma vector store from documents and persist it.

    Args:
        doc: Documents to embed and store.
        embed: Embedding function used to vectorize ``doc``.
        pd: Directory in which Chroma persists the collection.

    Returns:
        The ``Chroma`` store returned by ``Chroma.from_documents``.
    """
    # Use the embedding passed by the caller; the previous body ignored
    # ``embed`` and always read the module-level ``hf``.
    return Chroma.from_documents(
        documents=doc,
        embedding=embed,
        persist_directory=pd,
    )


In [None]:
def r_v_db(pd, hf):
    """Open the Chroma collection persisted under ``pd``.

    Args:
        pd: Persist directory of an existing collection.
        hf: Embedding function handed to Chroma.

    Returns:
        The ``Chroma`` instance bound to that directory.
    """
    return Chroma(embedding_function=hf, persist_directory=pd)


In [None]:
# Embed all chunks and write them to persist_directory. Swap to the
# commented r_v_db line to reopen an existing store instead of rebuilding it.
vectorstore = w_v_db(all_docs, hf, persist_directory)
# vectorstore = r_v_db(persist_directory, hf)


In [None]:
# Number of entries in the underlying collection.
# NOTE(review): reads the private `_collection` attribute of the store.
print(vectorstore._collection.count()) 

## Use ChatGLM: configure the API and set up the prompt, conversation chain and memory

In [None]:
# GLM model reached through the OpenAI-compatible endpoint of open.bigmodel.cn.
# The API key is read from the environment (KeyError if it is not set) so that
# no secret lives in the notebook. NOTE(review): the key that used to be
# hard-coded here has been exposed in source and should be rotated.
llm = ChatOpenAI(
    model="glm-4.5",
    temperature=0.6,
    openai_api_key=os.environ["ZHIPUAI_API_KEY"],
    openai_api_base="https://open.bigmodel.cn/api/paas/v4/",
)

# Chat prompt for the QA chain: a fixed system instruction, then the running
# chat history, then the user's question.
prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            "You are a medical expert assistant. Always answer in English. "
        ),
        # Placeholder named "chat_history", the same key the memory object uses.
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)


In [None]:
 
# Conversation memory shared by every chain in this notebook; it stores the
# history under "chat_history" as message objects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# QA chain: llm + prompt + the shared memory, with verbose output enabled.
conversation = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory
)

## clear memory


In [None]:
def memoryclear(memory):
    """Empty ``memory`` in place by calling its ``clear()`` method."""
    reset = memory.clear
    reset()

## use chroma similarity search and feed it back to model

In [None]:
query = "how to prevent breast cancer?"
# Fetch the 3 nearest chunks from Chroma for the query.
ans = vectorstore.similarity_search(query, 3)
memoryclear(memory)
# NOTE(review): only the first hit is used, and it is appended as the string
# form of a one-element list of Document objects — confirm this is intended
# rather than passing the page_content of the retrieved chunks.
conversation.invoke(query + str(ans[:1]))

# test

In [None]:
# Prompt for the chain that produces the "generated question" of each chunk.
# NOTE(review): the system message is the same as in the QA prompt and does not
# ask the model to write a question — confirm the wording is what was intended.
prompt_test = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            "You are a medical expert assistant. Always answer in English. "
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)

# Chain invoked once per chunk by call_api(); it shares `memory` with the
# other chains.
question_generate = LLMChain(
    llm=llm,
    prompt=prompt_test,
    verbose=True,
    memory=memory
)

In [None]:
# Prompt asking the model to rephrase the user's question in several ways for
# retrieval.
multiquery = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            "Your task is to generate several different versions of the user’s question to help retrieve relevant documents from the vector database. By rephrasing the question from different perspectives, your goal is to overcome certain limitations of distance-based similarity search."
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template("{question}")
    ]
)

# Multi-query chain used by correct_mq(); it shares `memory` with the other
# chains.
mq_chain = LLMChain(
    llm=llm,
    prompt=multiquery,
    verbose=True,
    memory=memory
)

## test QA set

In [None]:
import concurrent.futures

def call_api(doc):
    """Run the question-generation chain on one chunk.

    The shared memory is cleared first, then ``doc.page_content`` is sent
    through ``question_generate``.

    Returns:
        A ``(index, [text, question, source])`` pair, where ``index`` and
        ``source`` come from the chunk metadata and ``text`` / ``question``
        from the chain result.
    """
    memoryclear(memory)
    response = question_generate.invoke(doc.page_content)
    chunk_index = doc.metadata['index']
    row = [response['text'], response['question'], doc.metadata['source']]
    return chunk_index, row

# Maps chunk index -> [chain 'text', chain 'question', source].
chunk_questions = {}
# NOTE(review): call_api() clears and reuses the single shared `memory`
# object, so raising the worker count looks unsafe — confirm before changing.
max_workers = 1

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # executor.map yields results in the order of all_docs.
    results = executor.map(call_api, all_docs)
    
    for index, data in results:
        chunk_questions[index] = data


In [None]:
# Presumably a marker to confirm the preceding cell finished — TODO confirm.
print(1)

## CSV

### write CSV

In [None]:
import re

def clean_text(text):
    """Return ``text`` with every NUL character (``\\x00``) removed."""
    nul_pattern = '\x00'
    cleaned = re.sub(nul_pattern, '', text)
    return cleaned

# Persist the [generated question, chunk, source] triples, one CSV row per
# chunk, to "<folder_path>.csv".
with open(folder_path+'.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file, escapechar = '\\')
    writer.writerow(['Gnerated Q', 'Chunk', 'Source'])
    # Iterate over the values only. The earlier `for key, value in ...` form
    # rebound the module-level name `key`, overwriting the Azure API key that
    # text_split() reads, so a later text_split() call used a chunk index as
    # its credential.
    for row in chunk_questions.values():
        # NUL characters are stripped from string fields before writing.
        writer.writerow(
            [clean_text(field) if isinstance(field, str) else field for field in row]
        )
print("Data has been written to something csv")


In [None]:
# Number of chunks for which a question was generated.
print("len(chunk_questions) =", len(chunk_questions))

### csv into dictionary

In [None]:
def extract_csv(file_path):
    chunk_questions = {}

    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        headers = next(reader)  

        key_counter = 0  
        for row in reader:
            chunk_questions[str(key_counter)] = [row[0],row[1],row[2]]
            key_counter += 1
    
    print("Data has been read from csv into dictionary.")
    return chunk_questions
    

In [None]:
# Reload the saved questions; this replaces the in-memory chunk_questions
# with the content of "<folder_path>.csv".
chunk_questions = extract_csv(folder_path+'.csv')
print(len(chunk_questions))

## recall test

### similarity_search and metadata

In [None]:
def search_with_metadata(query, db, top_k, meta):
    """Similarity search restricted to chunks from one source.

    Args:
        query: Query passed unchanged to the vector store.
        db: Object exposing ``similarity_search(query=..., k=..., filter=...)``.
        top_k: Maximum number of results requested.
        meta: Value the ``"source"`` metadata field must equal.

    Returns:
        Whatever ``db.similarity_search`` returns for the filtered query.
    """
    # The unused lower-cased copy of the query was dropped: it never reached
    # the search call.
    return db.similarity_search(
        query=query,
        k=top_k,
        filter={'source': meta},
    )

In [None]:
def correct_original(dic, mem, k, db):
    """Count questions whose own chunk is among the top-``k`` unfiltered hits.

    Args:
        dic: Mapping of chunk index -> row; ``row[0]`` is the query text.
        mem: Memory object, cleared before each query.
        k: Number of results requested from ``db.similarity_search``.
        db: Vector store to query.

    Returns:
        Number of entries for which a returned document has an ``"index"``
        metadata value equal to the entry's key.
    """
    matched = 0
    for chunk_id, row in dic.items():
        memoryclear(mem)
        retrieved = db.similarity_search(query=row[0], k=k)
        # any() stops at the first matching document.
        if any(d.metadata['index'] == chunk_id for d in retrieved):
            matched += 1
    return matched

# Hit counts for unfiltered similarity search at k = 1, 3 and 5.
print(correct_original(chunk_questions, memory, 1, vectorstore))
print(correct_original(chunk_questions, memory, 3, vectorstore))
print(correct_original(chunk_questions, memory, 5, vectorstore))


In [None]:
def correct_filtered(dic, mem, k, db):
    """Count questions whose own chunk is among the top-``k`` source-filtered hits.

    Works like an unfiltered recall count, but every search goes through
    ``search_with_metadata`` with the row's source (``row[2]``) as filter.

    Args:
        dic: Mapping of chunk index -> row (``row[0]`` query, ``row[2]`` source).
        mem: Memory object, cleared before each query.
        k: Number of results requested.
        db: Vector store to query.

    Returns:
        Number of entries whose key appears as ``"index"`` metadata in the
        results.
    """
    total = 0
    for chunk_id, row in dic.items():
        memoryclear(mem)
        found_ids = (
            d.metadata['index']
            for d in search_with_metadata(row[0], db, k, row[2])
        )
        # Lazy membership test: stops consuming at the first match.
        if chunk_id in found_ids:
            total = total + 1
    return total

# Hit counts for source-filtered similarity search at k = 1, 3 and 5.
print(correct_filtered(chunk_questions, memory, 1, vectorstore))
print(correct_filtered(chunk_questions, memory, 3, vectorstore))
print(correct_filtered(chunk_questions, memory, 5, vectorstore))

### Ensemble retriever

In [None]:
# Sparse (BM25) retriever built over the same chunks as the vector store.
bm25_retriever = BM25Retriever.from_documents(all_docs)
#chroma_vectorstore = Chroma.from_documents(all_docs, hf)
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import (
     ElasticSearchBM25Retriever,
 )


# BM25 returns up to 5 documents per query.
bm25_retriever.k = 5


# Dense retriever backed by the Chroma store (default search settings).
chroma_retriever = vectorstore.as_retriever()

# Equal-weight combination of the sparse and dense retrievers.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5],
)

# Smoke test. The query previously read "breat cancer"; the typo skewed both
# the BM25 term match and the embedding lookup.
docs = ensemble_retriever.get_relevant_documents(query = "breast cancer")
len(docs)

In [None]:
 def correct_ensemble(dic, mem, k, db):
     """Count questions whose own chunk is among the first ``k`` ensemble hits.

     Retrieval goes through the module-level ``ensemble_retriever``; the ``db``
     argument is accepted but not used.

     Args:
         dic: Mapping of chunk index -> row; ``row[0]`` is the query text.
         mem: Memory object, cleared before each query.
         k: Number of leading results inspected.
         db: Unused.

     Returns:
         Number of entries whose key appears as ``"index"`` metadata within
         the first ``k`` results.
     """
     matched = 0
     for chunk_id, row in dic.items():
         memoryclear(mem)
         ranked = ensemble_retriever.get_relevant_documents(query = row[0])
         # Only the first k merged results count towards a hit.
         if any(d.metadata['index'] == chunk_id for d in ranked[:k]):
             matched += 1
     return matched

 # Hit counts for the ensemble retriever at k = 1 and 3.
 print(correct_ensemble(chunk_questions, memory, 1, vectorstore))
 print(correct_ensemble(chunk_questions, memory, 3, vectorstore))

### chroma similarity + mmr search

In [None]:
# From here on BM25 returns a single document per query. This mutates the
# same object that ensemble_retriever was built with.
bm25_retriever.k = 1

In [None]:
from collections import Counter

def ensemble_retrieval(query, top_k, db,meta):
    """Concatenate BM25 hits with source-filtered similarity hits.

    Args:
        query: Query text.
        top_k: Number of results requested from the filtered similarity search.
        db: Vector store used for the similarity search.
        meta: Source value used as metadata filter.

    Returns:
        A list holding the ``bm25_retriever`` results first, followed by the
        filtered similarity results. Duplicates are not removed.
    """
    dense_hits = search_with_metadata(query, db, top_k, meta)
    sparse_hits = bm25_retriever.invoke(query)
    return [*sparse_hits, *dense_hits]
    
# retrievers = [search_with_metadata, bm25_retriever]

In [None]:
# NOTE(review): `ensemble_results` is not assigned anywhere in the visible
# notebook; this cell raises NameError unless it is defined elsewhere —
# confirm, or assign it from ensemble_retrieval(...) first.
print(ensemble_results)

In [None]:
def correct_ensem(dic, mem, k, db):
    """Count questions whose own chunk is returned by ``ensemble_retrieval``.

    For each entry, ``ensemble_retrieval`` is called with ``top_k=1``, and the
    ``"index"`` / ``"source"`` metadata of every inspected document is printed
    until a match is found.

    Args:
        dic: Mapping of chunk index -> row (``row[0]`` query, ``row[2]`` source).
        mem: Memory object, cleared before each query.
        k: Not used by the body; the similarity part always requests one
            result. NOTE(review): confirm whether ``k`` should drive ``top_k``.
        db: Vector store handed to ``ensemble_retrieval``.

    Returns:
        Number of entries whose key appears as ``"index"`` metadata in the
        combined results.
    """
    hit = 0
    for key, value in dic.items():
        memoryclear(mem)
        # Query the store passed by the caller; the previous body ignored
        # ``db`` and always used the module-level ``vectorstore``.
        result = ensemble_retrieval(value[0], 1, db, value[2])
        for doc in result:
            print(doc.metadata['index'])
            print(doc.metadata['source'])
            if doc.metadata['index'] == key:
                hit = hit + 1
                break

    return hit

# Hit count for the manual BM25 + filtered-similarity combination.
print(correct_ensem(chunk_questions, memory, 2, vectorstore))

### Multiquery

In [None]:
def correct_mq(dic, mem, k, db):
    """Count questions whose own chunk is found with multi-query retrieval.

    For each entry two queries are searched, each with the row's source as
    filter: the full text returned by ``mq_chain`` and the original question.
    The result lists are scanned in that order. Documents already seen for the
    entry are skipped, and at most ``k`` new documents are inspected per list.

    Args:
        dic: Mapping of chunk index -> row (``row[0]`` query, ``row[2]`` source).
        mem: Memory object, cleared before each entry.
        k: Results requested per query and inspection budget per result list.
        db: Vector store to query.

    Returns:
        Number of entries for which a document with matching ``"index"``
        metadata was encountered.
    """
    hit = 0

    for key, value in dic.items():
        memoryclear(mem)
        rewritten = mq_chain.invoke(value[0])['text']
        queries = (rewritten, value[0])
        batches = [search_with_metadata(q, db, k, value[2]) for q in queries]

        seen = set()
        found = False
        for batch in batches:
            fresh = 0  # new (unseen, non-matching) documents in this batch
            for doc in batch:
                if fresh == k:
                    break
                chunk_id = doc.metadata['index']
                if chunk_id in seen:
                    continue
                if chunk_id == key:
                    found = True
                    break
                seen.add(chunk_id)
                fresh += 1
            if found:
                break

        if found:
            hit += 1

    return hit



In [None]:
# Multi-query hit count at k = 1 (one LLM call per question).
print(correct_mq(chunk_questions, memory, 1, vectorstore))


In [None]:
# Multi-query hit count at k = 3 (one LLM call per question).
print(correct_mq(chunk_questions, memory, 3, vectorstore))

In [None]:
# def correct_mq(dic, mem, k, db):
#     hit = 0;
#     for key,value in dic.items():
#         x = hit
#         memoryclear(mem)
#         querys = mq_chain.invoke(value[0])
#         query = querys['text']
#         qs = query.split('.')
#         qs.append(value[0])
#         result = [search_with_metadata(q, db, k, value[2]) for q in qs]
#         exit_flag = False
#         for docs in result:
#              if exit_flag:  
#                 break
#              for doc in docs:
#                 print(doc.metadata['index'])
#                 print(doc.metadata['source'])
#                 if doc.metadata['index'] == key :
#                     hit = hit + 1
#                     exit_flag = True
#                     break
#         # if x == hit:
#         #     print(key)

#     return hit

# print(correct_mq(chunk_questions, memory, 1, vectorstore))

## Reranker

### BGE RERANKER 

#### BGE RERANKER M3

In [None]:
# reranker = FlagEmbeddingReranker(
#     top_n= 1,
#     model="BAAI/bge-reranker-v2-m3",
#     use_fp16=True
# )

#### BGE RERANKER LARGE

In [None]:
# reranker = FlagEmbeddingReranker(
#     top_n= 5,
#     model="BAAI/bge-reranker-large",
#     use_fp16=True
# )

#### BGE RERANKER MULTILAYER

In [None]:
from FlagEmbedding import FlagReranker

In [None]:
# reranker = FlagEmbeddingReranker(
#     top_n= 1,
#     model = "BAAIbge-reranker-v2-minicpm-layerwise/bge-reranker-v2-minicpm-layerwise",
#     use_fp16=True
# )

from FlagEmbedding import LayerWiseFlagLLMReranker

# Layer-wise LLM reranker loaded from the
# "BAAI/bge-reranker-v2-minicpm-layerwise" checkpoint in half precision.
reranker = LayerWiseFlagLLMReranker(
    model_name_or_path="BAAI/bge-reranker-v2-minicpm-layerwise",
    use_fp16=True,
    # NOTE(review): presumably lets code shipped with the model repository
    # run locally — confirm the source is trusted.
    trust_remote_code=True 
)

In [None]:
def bge_rerank_multilayer(dic, mem,k):
    """Count questions whose own chunk survives reranking into the top ``k``.

    For each entry, the 5 nearest source-filtered chunks are fetched from the
    module-level ``vectorstore`` and scored against the query with the
    module-level ``reranker`` (cutoff layer 28). The ``k`` best-scoring chunks
    are compared by text with the stored chunk (``row[1]``). The running hit
    count is printed after every hit.

    Args:
        dic: Mapping of chunk index -> row (``row[0]`` query, ``row[1]`` chunk
            text, ``row[2]`` source).
        mem: Memory object, cleared before each entry.
        k: Number of top-scored candidates that count as a hit.

    Returns:
        Number of entries whose chunk text is among the top ``k`` reranked
        candidates.
    """
    hit = 0
    for key, value in dic.items():
        memoryclear(mem)
        query = value[0]
        result = search_with_metadata(query, vectorstore, 5, value[2])
        documents = [doc.page_content for doc in result]
        if not documents:
            # Nothing retrieved for this source: nothing to score, no hit.
            continue

        pairs = [[query, text] for text in documents]
        scores = reranker.compute_score(pairs, cutoff_layers=[28])
        # Some FlagEmbedding versions return a bare number instead of a list
        # when a single pair is scored; normalize so len()/indexing work.
        try:
            len(scores)
        except TypeError:
            scores = [scores]

        top_scores_indices = sorted(
            range(len(scores)),
            key=lambda i: scores[i],
            reverse=True,
        )[:k]

        if any(documents[index] == value[1] for index in top_scores_indices):
            hit += 1
            print(hit)

    return hit


In [None]:
# Reranked hit count when only the best-scored candidate counts.
print(bge_rerank_multilayer(chunk_questions, memory, 1))

In [None]:
# Reranked hit count when the 3 best-scored candidates count.
print(bge_rerank_multilayer(chunk_questions, memory, 3))

#### RERANK function

In [None]:
# def bge_rerank(dic, mem, vectorstore):
#     hit = 0
#     count = 0
#     for key, value in dic.items():
#         print(count)
#         count = count + 1
#         memoryclear(mem)
#         query = value[0]
#         result = search_with_metadata(query, vectorstore, 5, value[2])
#         documents = [re.page_content for re in result]
#         nodes = [NodeWithScore(node=TextNode(text=doc)) for doc in documents]
#         query_bundle = QueryBundle(query_str=query)
#         ranked_nodes = reranker._postprocess_nodes(nodes, query_bundle)
#         for node in ranked_nodes:
#             # node.node.get_content(),
#             print("-> Score:", node.score)
#             if node.node.get_content() == value[1]:
#                 hit += 1
#                 break
#     return hit

In [None]:
# print(bge_rerank(chunk_questions, memory, vectorstore))

In [None]:
# query_bundle = QueryBundle(query_str=query)
# ranked_nodes = reranker._postprocess_nodes(nodes, query_bundle)

In [None]:
# for node in ranked_nodes:
#     print(node.node.get_content(), "-> Score:", node.score)

### Cohere Reranker


In [None]:
import cohere

# Cohere client. The API key is read from the environment (KeyError if it is
# not set) so that no secret lives in the notebook. NOTE(review): the key
# that used to be hard-coded here has been exposed in source and should be
# rotated.
co = cohere.Client(os.environ["CO_API_KEY"])

def rerank_with_cohere(query, results, k): 
    """Rerank retrieved documents against ``query`` with Cohere.

    Args:
        query: Query text.
        results: Iterable of objects exposing ``page_content``.
        k: Number of top results requested (``top_n``).

    Returns:
        The response of ``co.rerank`` for model ``rerank-multilingual-v3.0``,
        requested with ``return_documents=True``.
    """
    passages = []
    for item in results:
        passages.append(item.page_content)

    return co.rerank(
        model='rerank-multilingual-v3.0',
        query=query,
        documents=passages,
        top_n=k,
        return_documents=True,
    )


In [None]:
# query = value[0] + value[2].split('.')[0]
# qe ="GPU ？"
# re = rerank_with_cohere(qe, search_with_metadata(qe, vectorstore, 5), 5)
# print(re.results[2].document.text)

# print(re.results)

In [None]:
#### COHERE reranker

In [None]:
import time

class RateLimiter:
    """Blocking fixed-window limiter: at most N calls per window.

    Once ``calls_per_minute`` calls have been counted in the current window,
    the next ``wait()`` sleeps until ``period`` seconds have elapsed since the
    window started, then starts a new window.

    Args:
        calls_per_minute: Number of calls allowed per window.
        period: Window length in seconds. Defaults to 80, the value that used
            to be hard-coded in ``wait()``.
    """

    def __init__(self, calls_per_minute, period=80):
        self.calls_per_minute = calls_per_minute
        self.period = period
        self.calls_made = 0
        # Monotonic clock: unaffected by wall-clock adjustments.
        self.start_time = time.monotonic()

    def wait(self):
        """Register one call, sleeping first if the window's quota is used up."""
        if self.calls_made >= self.calls_per_minute:
            time_to_wait = self.period - (time.monotonic() - self.start_time)
            if time_to_wait > 0:
                time.sleep(time_to_wait)
            # Open a new window, whether or not a sleep was needed.
            self.calls_made = 0
            self.start_time = time.monotonic()
        self.calls_made += 1

# RateLimiter
def correct_rerank(dic, mem, k):
    """Count questions whose own chunk is among the top ``k`` Cohere results.

    For each entry, the 5 nearest source-filtered chunks are fetched from the
    module-level ``vectorstore`` and reranked with ``rerank_with_cohere``.
    Calls to Cohere are throttled by a ``RateLimiter(10)``.

    Args:
        dic: Mapping of chunk index -> row (``row[0]`` query, ``row[1]`` chunk
            text, ``row[2]`` source).
        mem: Memory object, cleared before each entry.
        k: Number of reranked results requested.

    Returns:
        Number of entries for which a reranked document's text equals
        ``row[1]``.
    """
    limiter = RateLimiter(10)
    matches = 0
    for row in dic.values():
        memoryclear(mem)
        question = row[0]
        candidates = search_with_metadata(question, vectorstore, 5, row[2])
        limiter.wait()
        reranked = rerank_with_cohere(question, candidates, k)
        if any(item.document.text == row[1] for item in reranked.results):
            matches += 1
    return matches



In [None]:
# Cohere-reranked hit count at k = 3 (rate-limited, so this cell is slow).
print(correct_rerank(chunk_questions, memory, 3))