# RAG langchain

In [5]:
from typing import List
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.retrievers import BaseRetriever

from FlagEmbedding import BGEM3FlagModel
from pinecone import Pinecone, Index

from langchain_core.prompts import PromptTemplate

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [6]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file one directory up
# (presumably where PINECONE_API_KEY, read later via os.getenv, is defined).
load_dotenv("../.env")

True

In [7]:
class HybridSearchRetriever(BaseRetriever):
    """Retriever that issues a hybrid (dense + sparse) query against a Pinecone index.

    The query text is embedded with ``embedding_model``, both vectors are
    weighted by ``alpha`` / ``1 - alpha``, and the matches whose score reaches
    ``min_score`` are returned as plain strings built from their metadata.
    """

    pinecone_index: Index
    embedding_model: BGEM3FlagModel
    alpha: float
    top_k: int
    min_score: float

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[str]:
        """Embed ``query``, run the hybrid search and return the matching answer texts.

        Args:
            query: The user question to search for.
            run_manager: Callback manager supplied by the caller; not used here.

        Returns:
            One string per match with ``score >= min_score``, formed as
            ``answer_intro``, ``answer_body`` and ``answer_conclusion`` joined
            by newlines.
        """
        # The encoder is asked for both the dense vector and the sparse lexical weights.
        encoded = self.embedding_model.encode(
            query,
            return_dense=True,
            return_sparse=True,
            return_colbert_vecs=False,
        )
        dense_vector = encoded['dense_vecs'].tolist()
        lexical_weights = encoded['lexical_weights']

        # Only keys made of digits are kept; the original author noted that
        # skipping the isdigit() filter produced an error.
        numeric_items = [
            (token, weight)
            for token, weight in lexical_weights.items()
            if token.isdigit()
        ]
        sparse_vector = {
            'indices': [int(token) for token, _ in numeric_items],
            'values': [float(weight) for _, weight in numeric_items],
        }

        scaled_dense, scaled_sparse = self._hybrid_score_norm(
            dense_vector, sparse_vector, alpha=self.alpha
        )

        response = self.pinecone_index.query(
            top_k=self.top_k,
            vector=scaled_dense,
            sparse_vector=scaled_sparse,
            include_metadata=True,
        )

        # Drop matches below the score threshold and flatten the rest to text.
        documents = []
        for match in response['matches']:
            if match['score'] >= self.min_score:
                meta = match['metadata']
                documents.append(
                    f"{meta['answer_intro']}\n"
                    f"{meta['answer_body']}\n"
                    f"{meta['answer_conclusion']}"
                )
        return documents

    def _hybrid_score_norm(self, dense, sparse, alpha: float):
        """Weight a dense/sparse vector pair as a convex combination.

        The dense components are multiplied by ``alpha`` and the sparse values
        by ``1 - alpha``.

        Args:
            dense: Sequence of floats (the dense query vector).
            sparse: Dict with ``indices`` and ``values`` entries.
            alpha: Weight between 0 and 1 inclusive.

        Returns:
            ``(scaled_dense, scaled_sparse)``; ``scaled_sparse`` reuses the
            ``indices`` object from ``sparse``.

        Raises:
            ValueError: If ``alpha`` is below 0 or above 1.
        """
        if alpha > 1 or alpha < 0:
            raise ValueError("Alpha must be between 0 and 1")

        sparse_weight = 1 - alpha
        scaled_sparse = {'indices': sparse['indices']}
        scaled_sparse['values'] = [value * sparse_weight for value in sparse['values']]
        scaled_dense = [component * alpha for component in dense]
        return scaled_dense, scaled_sparse

In [8]:
# Connect to the existing Pinecone index.
# The API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pinecone_index = pc.Index("health-care")

# Embedding model used to encode queries for the hybrid search.
embedding_model = BGEM3FlagModel('BAAI/bge-m3',use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [9]:
# Build the hybrid retriever used by the RAG chain.
retriever = HybridSearchRetriever(
        pinecone_index=pinecone_index,
        embedding_model=embedding_model,
        alpha=0.95,  # dense vector is scaled by alpha, sparse values by (1 - alpha)
        top_k=3,  # number of matches requested from the index
        min_score=0.55,  # matches scoring below this are discarded
    )

In [10]:
def hybrid_score_norm(dense, sparse, alpha: float):
    """Weight a dense/sparse vector pair as a convex combination.

    The dense components are multiplied by ``alpha`` and the sparse values by
    ``1 - alpha``, so ``alpha=1`` zeroes the sparse values and ``alpha=0``
    zeroes the dense components.

    Args:
        dense: Sequence of floats (the dense query vector).
        sparse: Dict with ``indices`` and ``values`` entries.
        alpha: Weight between 0 and 1 inclusive.

    Returns:
        ``(scaled_dense, scaled_sparse)``; ``scaled_sparse`` reuses the
        ``indices`` object from ``sparse`` and carries the re-weighted values.

    Raises:
        ValueError: If ``alpha`` is below 0 or above 1.
    """
    if alpha > 1 or alpha < 0:
        raise ValueError("Alpha must be between 0 and 1")

    sparse_weight = 1 - alpha
    scaled_sparse = {'indices': sparse['indices']}
    scaled_sparse['values'] = [value * sparse_weight for value in sparse['values']]
    scaled_dense = [component * alpha for component in dense]
    return scaled_dense, scaled_sparse

def test(query):
    """Run one hybrid query for ``query`` and print the raw index response.

    Uses the notebook-level ``embedding_model`` and ``pinecone_index`` with a
    fixed ``alpha`` of 0.95 and ``top_k`` of 3; nothing is returned.
    """
    # The encoder is asked for both the dense vector and the sparse lexical weights.
    encoded = embedding_model.encode(
        query,
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=False,
    )
    dense_vector = encoded['dense_vecs'].tolist()
    lexical_weights = encoded['lexical_weights']

    # Only keys made of digits are kept; the original author noted that
    # skipping the isdigit() filter produced an error.
    numeric_items = [
        (token, weight)
        for token, weight in lexical_weights.items()
        if token.isdigit()
    ]
    sparse_vector = {
        'indices': [int(token) for token, _ in numeric_items],
        'values': [float(weight) for _, weight in numeric_items],
    }

    scaled_dense, scaled_sparse = hybrid_score_norm(dense_vector, sparse_vector, alpha=0.95)

    response = pinecone_index.query(
        top_k=3,
        vector=scaled_dense,
        sparse_vector=scaled_sparse,
        include_metadata=True,
    )
    print(response)

In [11]:
# Smoke-test the hybrid query with a short keyword; test() prints the raw response.
test("아데노")

{'matches': [{'id': 'HC-A-02104685',
              'metadata': {'answer_body': '이 질환의 초기 증상은 주로 추위나 스트레스 상황에서 손가락이나 '
                                          '발가락이 창백해지고 감각이 둔화되는 것입니다. 이러한 증상은 '
                                          '일시적으로 발생하며, 전신성 경화증이 동반된 경우 더욱 '
                                          '심해집니다. 레이노현상은 따뜻한 환경에서 더욱 심해질 수 '
                                          '있으므로 환우회나 의사를 방문하여 검사를 받는 것이 좋습니다.',
                           'answer_conclusion': '손발이 차고 수족냉증이 있는 경우 레이노증후군 '
                                                '가능성이 있으므로 의심 증상이 있다면 반드시 검사를 '
                                                '받아 적절한 치료를 받아야 합니다.',
                           'answer_intro': '레이노증후군은 혈관 반응 장애로 인해 손가락이나 발가락에서 '
                                           '창백하거나 비정상적인 혈관의 수축 증상이 나타나는 질환입니다.',
                           'department': '내과',
                           'disease_category': '뇌신경정신질환',
                           'disease_name_kor': '레이노병',
                      

In [12]:
# Checkpoint id of the generator model.
model_id="google/gemma-2-2b-it"

# Load the causal-LM weights and the matching tokenizer for that checkpoint.
gemma_2_model = AutoModelForCausalLM.from_pretrained(model_id)
gemma_2_tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
from __future__ import annotations  # type: ignore[import-not-found]

from typing import Any, Iterator, List, Mapping, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models.llms import BaseLLM
from langchain_core.outputs import Generation, GenerationChunk, LLMResult
from pydantic import ConfigDict

# Defaults used by the HuggingFacePipeline class defined below.
DEFAULT_MODEL_ID = "gpt2"
DEFAULT_TASK = "text-generation"
# Task names listed in the error raised when a pipeline reports an unsupported task.
VALID_TASKS = (
    "text2text-generation",
    "text-generation",
    "summarization",
    "translation",
)
# Number of prompts sent to the pipeline per call when generating.
DEFAULT_BATCH_SIZE = 4


class HuggingFacePipeline(BaseLLM):
    """LLM wrapper that delegates generation to an already-built pipeline object.

    ``pipeline`` is called with batches of prompts in ``_generate`` and, in
    ``_stream``, on a background thread with a ``TextIteratorStreamer`` so
    text can be yielded as it is produced.
    """

    pipeline: Any = None  #: :meta private:
    model_id: str = DEFAULT_MODEL_ID
    """Model name to use."""
    model_kwargs: Optional[dict] = None
    """Keyword arguments passed to the model."""
    pipeline_kwargs: Optional[dict] = None
    """Keyword arguments passed to the pipeline."""
    batch_size: int = DEFAULT_BATCH_SIZE
    """Batch size to use when passing multiple documents to generate."""

    model_config = ConfigDict(
        extra="forbid",
    )

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "model_id": self.model_id,
            "model_kwargs": self.model_kwargs,
            "pipeline_kwargs": self.pipeline_kwargs,
        }

    @property
    def _llm_type(self) -> str:
        """Return the identifier string for this LLM type."""
        return "huggingface_pipeline"

    def _generate(
        self,
        prompts: List[str],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> LLMResult:
        """Run the pipeline over ``prompts`` in batches and collect one text per prompt.

        Args:
            prompts: Prompt strings to complete.
            stop: Accepted for interface compatibility; not applied in this method.
            run_manager: Accepted for interface compatibility; not used in this method.
            **kwargs: ``pipeline_kwargs`` (dict forwarded to the pipeline call,
                default ``{}``) and ``skip_prompt`` (default ``False``; when
                true, the first ``len(prompt)`` characters of each result are
                removed).

        Returns:
            An ``LLMResult`` holding one ``Generation`` per prompt.

        Raises:
            ValueError: If the pipeline reports a task that is not supported.
        """
        # List to hold all results
        text_generations: List[str] = []
        pipeline_kwargs = kwargs.get("pipeline_kwargs", {})
        skip_prompt = kwargs.get("skip_prompt", False)

        for i in range(0, len(prompts), self.batch_size):
            batch_prompts = prompts[i : i + self.batch_size]

            # Process batch of prompts
            responses = self.pipeline(
                batch_prompts,
                **pipeline_kwargs,
            )

            # Process each response in the batch
            for j, response in enumerate(responses):
                if isinstance(response, list):
                    # if model returns multiple generations, pick the top one
                    response = response[0]

                task = self.pipeline.task
                if task == "text-generation":
                    text = response["generated_text"]
                elif task == "text2text-generation":
                    text = response["generated_text"]
                elif task == "summarization":
                    text = response["summary_text"]
                elif task == "translation" or task.startswith("translation_"):
                    # Compare the task name itself: the previous
                    # `task in "translation"` was a substring test on the
                    # literal, which accepted "" and rejected names such as
                    # "translation_en_to_fr".
                    text = response["translation_text"]
                else:
                    raise ValueError(
                        f"Got invalid task {self.pipeline.task}, "
                        f"currently only {VALID_TASKS} are supported"
                    )
                if skip_prompt:
                    # Drop the leading prompt-length prefix from the returned text.
                    text = text[len(batch_prompts[j]) :]
                # Append the processed text to results
                text_generations.append(text)

        return LLMResult(
            generations=[[Generation(text=text)] for text in text_generations]
        )

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Stream the completion of ``prompt`` piece by piece.

        The pipeline runs on a separate thread and writes into a
        ``TextIteratorStreamer``; this generator reads from that streamer.

        Args:
            prompt: Prompt string to complete.
            stop: Optional stop tokens; they are converted to ids with the
                pipeline tokenizer and generation stops when the last
                generated id equals one of them.
            run_manager: If given, ``on_llm_new_token`` is called for every chunk.
            **kwargs: ``pipeline_kwargs`` (dict forwarded to the pipeline call,
                default ``{}``) and ``skip_prompt`` (default ``True``, passed
                to the streamer).

        Yields:
            ``GenerationChunk`` objects wrapping each piece of streamed text.
        """
        from threading import Thread

        import torch
        from transformers import (
            StoppingCriteria,
            StoppingCriteriaList,
            TextIteratorStreamer,
        )

        pipeline_kwargs = kwargs.get("pipeline_kwargs", {})
        skip_prompt = kwargs.get("skip_prompt", True)

        if stop is not None:
            stop = self.pipeline.tokenizer.convert_tokens_to_ids(stop)
        stopping_ids_list = stop or []

        class StopOnTokens(StoppingCriteria):
            """Stopping criterion that fires when the newest token id is a stop id."""

            def __call__(
                self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
                **kwargs: Any,
            ) -> bool:
                # Only the last id of the first sequence is inspected.
                for stop_id in stopping_ids_list:
                    if input_ids[0][-1] == stop_id:
                        return True
                return False

        stopping_criteria = StoppingCriteriaList([StopOnTokens()])

        streamer = TextIteratorStreamer(
            self.pipeline.tokenizer,
            timeout=60.0,
            skip_prompt=skip_prompt,
            skip_special_tokens=True,
        )
        generation_kwargs = dict(
            text_inputs= prompt,
            streamer=streamer,
            stopping_criteria=stopping_criteria,
            **pipeline_kwargs,
        )
        # Generation runs in the background so the streamer can be consumed here.
        t1 = Thread(target=self.pipeline, kwargs=generation_kwargs)
        t1.start()

        for char in streamer:
            chunk = GenerationChunk(text=char)
            if run_manager:
                run_manager.on_llm_new_token(chunk.text, chunk=chunk)

            yield chunk

In [14]:
# Sampling and length settings passed to the pipeline constructor.
pipeline_kwargs = dict(
    max_new_tokens=1024,
    temperature=0.5,
    top_p=0.7,
    repetition_penalty=1.1,
    do_sample=True,
)

# Device index 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Text-generation pipeline around the already-loaded Gemma model and tokenizer.
gen = pipeline(
    task='text-generation',
    model=gemma_2_model,
    tokenizer=gemma_2_tokenizer,
    device=device,
    **pipeline_kwargs,
)

# Wrap the pipeline so it can be used as a step in a LangChain chain.
llm = HuggingFacePipeline(pipeline=gen)

In [15]:
# Instruction text for the user turn; {documents} and {question} are filled in
# later by the PromptTemplate.
template = """
Instructions:
- If the question involves a health-related issue, suggest possible causes and basic steps the user can take for relief, if applicable.
- You should explain in as much detail as possible what you know from the bottom of your heart to the user's questions.
- You can refer to the contents of the documents to create a response.
- Only use information that is directly related to the question.
- If no information is found in the documents, provide an answer based on general knowledge without fabricating details.
- You MUST answer in Korean.


Documents: {documents}

Question: {question}
"""

# Earlier conversation turns to prepend; empty here.
prev_chat = []

# Full chat: previous turns followed by a single user turn carrying the template.
chat = [
    *prev_chat,
     { "role": "user", "content": template}
]

# Render the chat with the tokenizer's chat template as a plain string
# (tokenize=False) that ends with the model-turn generation prompt.
# NOTE(review): the rendered string begins with a literal "<bos>"; confirm the
# pipeline tokenizer does not prepend a second BOS token when encoding it.
prompt_template = gemma_2_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# Wrap the rendered string so {question} and {documents} can be substituted per call.
prompt = PromptTemplate(input_variables=["question", "documents"], template=prompt_template)
# Bare expression: displays the PromptTemplate in the notebook output.
prompt

PromptTemplate(input_variables=['documents', 'question'], input_types={}, partial_variables={}, template="<bos><start_of_turn>user\nInstructions:\n- If the question involves a health-related issue, suggest possible causes and basic steps the user can take for relief, if applicable.\n- You should explain in as much detail as possible what you know from the bottom of your heart to the user's questions.\n- You can refer to the contents of the documents to create a response.\n- Only use information that is directly related to the question.\n- If no information is found in the documents, provide an answer based on general knowledge without fabricating details.\n- You MUST answer in Korean.\n\n\nDocuments: {documents}\n\nQuestion: {question}<end_of_turn>\n<start_of_turn>model\n")

In [16]:
# RAG chain: the input is passed to the retriever to fill "documents" and
# through unchanged as "question"; the filled prompt goes to the LLM and the
# result is parsed to a plain string.
rag_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [17]:
# One-shot call. The returned text includes the rendered prompt, as the cell
# output shows (the non-streaming path defaults skip_prompt to False).
answer = rag_chain.invoke("갑자기 배가 너무 아파")
# Bare expression: displays the answer in the notebook output.
answer

"<bos><start_of_turn>user\nInstructions:\n- If the question involves a health-related issue, suggest possible causes and basic steps the user can take for relief, if applicable.\n- You should explain in as much detail as possible what you know from the bottom of your heart to the user's questions.\n- You can refer to the contents of the documents to create a response.\n- Only use information that is directly related to the question.\n- If no information is found in the documents, provide an answer based on general knowledge without fabricating details.\n- You MUST answer in Korean.\n\n\nDocuments: ['월경전 증후군은 월경 주기 전에 나타나는 다양한 신체적, 정신적 증상을 말합니다. 이는 여성들의 일상적인 생활에 영향을 주며, 월경전 불쾌한 증상으로 나타나기도 합니다.\\n월경전 증후군의 증상 중 하나는 배 아픔입니다. 일부 여성들은 배가 자주 아프고 불편함을 느낄 수 있습니다. 이러한 증상은 월경 전 몇 일에서 2주 전부터 시작되며 월경이 시작되면서 사라집니다. 복부 팽만감, 복통, 불안, 짜증, 우울 등이 월경전 증후군의 증상 중 일부일 수 있습니다.\\n월경전 증후군은 일시적으로 나타날 수도 있고 오랫동안 지속될 수도 있습니다. 월경전 증후군을 완화하기 위해 적절한 휴식과 충분한 수면을 취하는 것이 중요합니다. 일상 생활에서 신체적인 피로를 줄이고 정신적인 안정감을 유지하는 것도 도움이 

In [18]:
# Streaming call: print each chunk as it arrives. Only newly generated text is
# shown, as the cell output shows (the streaming path defaults skip_prompt to True).
answer = rag_chain.stream("갑자기 배가 너무 아파")
for chunk in answer:
    print(chunk, end="", flush=True)

월경전 증후군의 증상 중 하나로 배 아픔이 나타날 수 있습니다. 월경 전 몇 일에서 2주 전부터 시작되며 월경이 시작되면서 사라집니다. 갑작스러운 배 아픔은 월경전 증후군의 증상일 수 있지만, 다른 질환의 증상일 수도 있습니다.  

**월경전 증후군의 경우:**

* **휴식과 충분한 수면**: 월경전 증후군의 증상을 완화하기 위해 적절한 휴식과 충분한 수면을 취하는 것이 중요합니다. 
* **일상생활에서 신체적인 피로를 줄이고 정신적인 안정감을 유지하는 것도 도움이 됩니다.**

**하지만, 갑작스러운 배 아픔은 다른 질환의 증상일 수 있으므로 다음과 같은 방법들을 고려해 볼 수 있습니다.**

1. **의사와 상담**: 갑작스러운 배 아픔이 지속되거나 심각하거나 특별한 증상이 있는 경우, 의사에게 문의하여 진단을 받아보세요. 
2. **복용 가능한 약물**: 의사의 처방을 받아 복용하는 약물이 필요할 수 있습니다.


**참고:** 위 내용은 일반적인 정보 제공 목적이며 의학적 조언이 아닙니다. 궁금한 점이 있다면 의사와 상담하세요. 


END.