# RAG langchain

In [54]:
from typing import List
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.retrievers import BaseRetriever

from FlagEmbedding import BGEM3FlagModel
from pinecone import Pinecone, Index

from langchain_core.prompts import PromptTemplate

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextStreamer
from langchain_huggingface import HuggingFacePipeline

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [2]:
from dotenv import load_dotenv
import os

load_dotenv("../")

True

In [3]:
class HybridSearchRetriever(BaseRetriever):

    pinecone_index:Index
    embedding_model:BGEM3FlagModel
    alpha:float
    top_k: int
    min_score:float

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[str]:
        """Sync implementations for retriever."""
        user_query_emb = self.embedding_model.encode(query, return_dense=True, return_sparse=True, return_colbert_vecs=False) #dense, sparse 둘 다 반환함
        
        query_dense_vector = user_query_emb['dense_vecs'].tolist()
        user_query_sparse = user_query_emb['lexical_weights']
        query_sparse_vector = {
            'indices': [int(k) for k in user_query_sparse.keys() if k.isdigit()], #isdigit() 안하면 에러뜨더라
            'values': [float(v) for k, v in user_query_sparse.items() if k.isdigit()]
        }

        hdense, hsparse = self._hybrid_score_norm(query_dense_vector, query_sparse_vector, alpha=self.alpha)

        hybrid_query_response = self.pinecone_index.query(
            top_k=self.top_k,
            vector=hdense,
            sparse_vector=hsparse,
            include_metadata=True,
        )
        
        documents = [
            f"{match['metadata']['answer_intro']}\n"
            f"{match['metadata']['answer_body']}\n"
            f"{match['metadata']['answer_conclusion']}"
            for match in hybrid_query_response['matches']
            if match['score'] >= self.min_score
        ]
        return documents
    
    def _hybrid_score_norm(self, dense, sparse, alpha: float):
        """Hybrid score using a convex combination

        alpha * dense + (1 - alpha) * sparse

        Args:
            dense: Array of floats representing
            sparse: a dict of `indices` and `values`
            alpha: scale between 0 and 1
        """
        if alpha < 0 or alpha > 1:
            raise ValueError("Alpha must be between 0 and 1")
        hs = {
            'indices': sparse['indices'],
            'values':  [v * (1 - alpha) for v in sparse['values']]
        }
        return [v * alpha for v in dense], hs

In [4]:
# Connect to the existing Pinecone index
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pinecone_index = pc.Index("health-care")

embedding_model = BGEM3FlagModel('BAAI/bge-m3',use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [5]:
retriever = HybridSearchRetriever(
        pinecone_index=pinecone_index,
        embedding_model=embedding_model,
        alpha=0.95,
        top_k=3,
        min_score=0.55,
    )

In [6]:
def hybrid_score_norm(dense, sparse, alpha: float):
    """Hybrid score using a convex combination

    alpha * dense + (1 - alpha) * sparse

    Args:
        dense: Array of floats representing
        sparse: a dict of `indices` and `values`
        alpha: scale between 0 and 1
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")
    hs = {
        'indices': sparse['indices'],
        'values':  [v * (1 - alpha) for v in sparse['values']]
    }
    return [v * alpha for v in dense], hs

def test(query):
    user_query_emb = embedding_model.encode(query, return_dense=True, return_sparse=True, return_colbert_vecs=False) #dense, sparse 둘 다 반환함
            
    query_dense_vector = user_query_emb['dense_vecs'].tolist()
    user_query_sparse = user_query_emb['lexical_weights']
    query_sparse_vector = {
        'indices': [int(k) for k in user_query_sparse.keys() if k.isdigit()], #isdigit() 안하면 에러뜨더라
        'values': [float(v) for k, v in user_query_sparse.items() if k.isdigit()]
    }

    hdense, hsparse = hybrid_score_norm(query_dense_vector, query_sparse_vector, alpha=0.95)

    hybrid_query_response = pinecone_index.query(
        top_k=10,
        vector=hdense,
        sparse_vector=hsparse,
        include_metadata=True,
    )
    print(hybrid_query_response)

In [7]:
test("밥먹다가 체한거 같아 너무 급하게 먹었나?")

{'matches': [{'id': 'HC-A-03401038',
              'metadata': {'answer_body': '에너지 섭취가 식사 속도보다 빠르게 증가하면 지방을 태우기 위해 '
                                          '사용되는 연료인 포도당을 충분히 섭취하지 못하게 되어 지방 '
                                          '축적이 증가합니다. 또한 식사 속도가 높으면 식사량을 유지하기 '
                                          '위해 근육과 지방 조직 내에 있는 여분의 에너지도 더 많이 '
                                          '저장됩니다. 이로 인해 에너지 필요량이 증가하고 에너지 소비량이 '
                                          '감소하게 됩니다.',
                           'answer_conclusion': '식사 속도와 식사량에 대한 관리는 비만 예방에 매우 '
                                                '중요합니다. 식사 속도를 늦추고 천천히 음식을 '
                                                '섭취하여 과도한 음식 섭취를 피하는 것이 좋습니다.',
                           'answer_intro': '비만의 주요 원인 중 하나는 식사 속도와 식사량에 따라 '
                                           '발생하는 과다한 음식 섭취입니다.',
                           'department': '내과',
                           'disease_category': '기타',
                           'dis

In [8]:
model_id="google/gemma-2-2b-it"

gemma_2_model = AutoModelForCausalLM.from_pretrained(model_id)
gemma_2_tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [105]:
streamer = TextStreamer(gemma_2_tokenizer, skip_prompt=True, skip_special_tokens=True)

gen = pipeline(
    task='text-generation',
    model=gemma_2_model,
    tokenizer=gemma_2_tokenizer,
    max_new_tokens=1024,
    streamer=streamer,
    device=0 if torch.cuda.is_available() else -1,
    temperature=.5,
    top_p=0.7,
    repetition_penalty=1.1,
    do_sample=True,
    )

llm = HuggingFacePipeline(pipeline=gen)

In [106]:
template = """
Instructions:
- If the question involves a health-related issue, suggest possible causes and basic steps the user can take for relief, if applicable.
- You should explain in as much detail as possible what you know from the bottom of your heart to the user's questions.
- You can refer to the contents of the documents to create a response.
- Only use information that is directly related to the question.
- If no information is found in the documents, provide an answer based on general knowledge without fabricating details.
- You MUST answer in Korean.


Documents: {documents}

Question: {question}
"""

prev_chat = []

chat = [
    *prev_chat,
     { "role": "user", "content": template}
]

prompt_template = gemma_2_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

prompt = PromptTemplate(input_variables=["question", "documents"], template=prompt_template)
prompt

PromptTemplate(input_variables=['documents', 'question'], input_types={}, partial_variables={}, template="<bos><start_of_turn>user\nInstructions:\n- If the question involves a health-related issue, suggest possible causes and basic steps the user can take for relief, if applicable.\n- You should explain in as much detail as possible what you know from the bottom of your heart to the user's questions.\n- You can refer to the contents of the documents to create a response.\n- Only use information that is directly related to the question.\n- If no information is found in the documents, provide an answer based on general knowledge without fabricating details.\n- You MUST answer in Korean.\n\n\nDocuments: {documents}\n\nQuestion: {question}<end_of_turn>\n<start_of_turn>model\n")

In [107]:
rag_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [108]:
answer = rag_chain.invoke("머리 아프다")

머리 아프니면 꼭 병원에 방문해야 합니다.  하지만, 몇 가지 기본적인 방법들을 시도해 볼 수 있습니다. 

**1. 편안하게 쉬세요.**  편안한 자세로 누워서 휴식을 취하세요. 
**2. 충분한 수분 섭취:** 물이나 음료를 많이 마셔 체내 수분량을 보충하세요. 
**3. 차분하고 잠깐 휴식:** 스트레스를 줄이고 잠깐 휴식을 취하면 더욱 빠른 회복을 도울 수 있습니다. 



**주의 사항:** 이 정보는 일반적인 건강 지침이며, 정확한 진단과 치료를 위해서는 의료 전문가의 도움을 받으십시오. 뇌출혈은 매우 심각한 질환이므로 즉시 의료진에게 문의하십시오. 



In [101]:
def stream_response(response, return_output=False):
    answer = ""
    for token in response:
        answer += token
        print(token, end="", flush=True)
    if return_output:
        return answer

END.