In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the MedQuAD question/answer table and discard rows with any missing field.
# dropna() leaves gaps in the index labels, so renumber the rows 0..n-1 to keep
# integer labels and row positions identical for any downstream lookup.
df = pd.read_csv("/kaggle/input/medquad-medical-question-answer-for-ai-research/medquad.csv")
df = df.dropna().reset_index(drop=True)

display(df.shape)
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of questions per focus area in `df`, restricted to focus areas that
# occur more than 15 times.
focus_area_counts = df['focus_area'].value_counts()
focus_areas = focus_area_counts.loc[focus_area_counts > 15].index

fig, ax = plt.subplots(figsize=(20, 8))
sns.set(font_scale=1.2)

ax.pie(
    focus_area_counts.loc[focus_areas],
    labels=focus_areas,
    autopct='%1.1f%%',
)
ax.set(title='Pie Chart of Focus Areas')
ax.axis('equal')  # same scale on both axes so the pie is drawn as a circle

plt.show()

In [None]:
import random


# ANSI foreground colour codes used to tint each printed question/answer pair.
# Several names map to the same code, so those codes are picked more often.
color_codes = {
    "blue": 34,
    "green": 32,
    "red": 31,
    "purple": 35,
    "orange": 33,
    "yellow": 33,
    "pink": 35,
    "brown": 33,
    "gray": 37
}

# Loop-invariant: the pool of codes random.choice() draws from.
available_codes = list(color_codes.values())

# Preview every 7th row; the loop stops after printing the first row whose
# position is greater than 30.
for i in range(0, len(df), 7):
    color = random.choice(available_codes)

    # Positional access: the index labels are not guaranteed to be 0..n-1
    # (rows were removed with dropna()), so the label lookup df['question'][i]
    # can raise KeyError.
    row = df.iloc[i]

    print(f"\033[1;{color}mThe question is: {row['question']}\033[0m\n\033[1;{color}m The answer is: {row['answer']}\033[0m\n")

    if i > 30:
        break

In [None]:
!pip install transformers sentence-transformers faiss-cpu

In [None]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

# Checkpoint names for the two DPR towers.
DPR_QUESTION_CHECKPOINT = 'facebook/dpr-question_encoder-single-nq-base'
DPR_CONTEXT_CHECKPOINT = 'facebook/dpr-ctx_encoder-single-nq-base'

# Query side: tokenizer + encoder that embed a user question for retrieval.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(DPR_QUESTION_CHECKPOINT)
question_encoder = DPRQuestionEncoder.from_pretrained(DPR_QUESTION_CHECKPOINT)

# Document side: tokenizer + encoder that embed the knowledge-base documents.
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(DPR_CONTEXT_CHECKPOINT)
context_encoder = DPRContextEncoder.from_pretrained(DPR_CONTEXT_CHECKPOINT)

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from sentence_transformers import SentenceTransformer
import torch

# Generator: the BART checkpoint (and its tokenizer) that writes the final
# answer from the query plus the retrieved documents.
GENERATOR_CHECKPOINT = 'facebook/bart-large-cnn'
generator_tokenizer = BartTokenizer.from_pretrained(GENERATOR_CHECKPOINT)
generator = BartForConditionalGeneration.from_pretrained(GENERATOR_CHECKPOINT)

In [None]:
# Build a smaller, balanced knowledge base:
#   1. keep focus areas that occur more than MIN_FOCUS_AREA_COUNT times,
#   2. take at most MAX_FOCUS_AREAS of them, most frequent first,
#   3. sample up to SAMPLES_PER_FOCUS_AREA rows from each.
MIN_FOCUS_AREA_COUNT = 15
MAX_FOCUS_AREAS = 1000
SAMPLES_PER_FOCUS_AREA = 20
SAMPLE_SEED = 42  # fixed so that Restart & Run All rebuilds the same knowledge base

unique_focus_areas = df['focus_area'].unique()

# value_counts() is sorted by frequency (descending), so head() keeps the most
# common focus areas. Compute it once and reuse it for the filter.
focus_area_frequencies = df['focus_area'].value_counts()
filtered_focus_areas = focus_area_frequencies[focus_area_frequencies > MIN_FOCUS_AREA_COUNT]
selected_focus_areas = filtered_focus_areas.head(MAX_FOCUS_AREAS)

# Collect the per-area samples and concatenate once: calling pd.concat inside
# the loop re-copies every previously accumulated row on each iteration.
sampled_frames = []
for focus_area in selected_focus_areas.index:
    focus_area_df = df[df['focus_area'] == focus_area]
    sampled_frames.append(
        focus_area_df.sample(
            n=min(SAMPLES_PER_FOCUS_AREA, len(focus_area_df)),  # never ask for more rows than exist
            random_state=SAMPLE_SEED,
        )
    )

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries df's columns when no focus area passed the filter.
new_df = pd.concat(sampled_frames) if sampled_frames else df.iloc[0:0]

# new_df keeps the original row labels of df; address its rows by position (.iloc).
display(new_df.shape)

new_df.head()

In [None]:
 encoded_docs = []
for index, row in new_df.iterrows():

    inputs = context_tokenizer(row['answer'], return_tensors='pt', truncation=True, max_length=512)

    with torch.no_grad():

        doc_embedding = context_encoder(**inputs).pooler_output

    encoded_docs.append(doc_embedding.numpy())

In [None]:
import faiss

import numpy as np



# Stack the per-document embeddings into a single (n_docs, dimension) matrix.
# np.vstack promotes 1-D rows to 2-D on its own, so a separate reshape pass
# over the list is unnecessary.
encoded_docs = np.vstack(encoded_docs)

# FAISS expects C-contiguous float32 input; this is a no-op when the
# embeddings already have that layout.
encoded_docs = np.ascontiguousarray(encoded_docs, dtype=np.float32)

# Build the search index over the document embeddings.
dimension = encoded_docs.shape[1]  # embedding size

index = faiss.IndexFlatIP(dimension)  # inner-product (dot-product) similarity

index.add(encoded_docs)

# Retrieval-Augmented Generation (RAG) 

Retrieval-Augmented Generation (RAG) is a technique that combines the strengths of large language models (LLMs) with the power of information retrieval to produce more accurate, relevant, and informative responses to user queries.

![](https://global.discourse-cdn.com/openai1/original/4X/8/f/f/8ffadd37d90228cdbded952f027d75155fb69391.jpeg)
   
## How RAG Works:

- **Query Understanding:** The LLM first processes the user's query to understand its intent and meaning.
- **Document Retrieval:** Based on the query, a document retrieval system searches a knowledge base or external data source to find relevant information.   
- **Contextual Augmentation:** The retrieved information is then integrated with the original query to create a more comprehensive context for the LLM.   
- **Response Generation:** The LLM uses the augmented context to generate a response that is more accurate, relevant, and informative than it would be without the additional information.


## Benefits of RAG:

- **Improved Accuracy:** RAG helps LLMs access and incorporate factual information, reducing the likelihood of hallucinations or generating incorrect responses.   
- **Enhanced Relevance:** By retrieving relevant information, RAG ensures that the LLM's responses are directly related to the user's query.   
- **Increased Informativeness:** RAG allows LLMs to provide more detailed and comprehensive responses by leveraging external knowledge sources.   
- **Adaptability:** RAG can be applied to various domains and industries, making it a versatile tool for many applications.

## Use Cases of RAG:

- **Customer Service Chatbots:** RAG-powered chatbots can access a knowledge base of product information, FAQs, and customer support guidelines to provide accurate and helpful responses.   
- **Search Engines:** RAG can enhance search engine results by providing more relevant and informative summaries of web pages.   
- **Content Creation:** RAG can assist in content creation by suggesting relevant facts, statistics, and citations.   
- **Healthcare:** RAG can help doctors and researchers access and analyze medical literature to improve diagnosis and treatment.

 
## Challenges and Considerations:

- **Data Quality:** The quality of the retrieved information is crucial for the effectiveness of RAG. It's important to ensure that the knowledge base is accurate, up-to-date, and relevant.
- **Model Bias:** LLMs can inherit biases from the data they are trained on. It's important to be aware of these biases and take steps to mitigate them.   
- **Computational Cost:** RAG can be computationally expensive, especially when dealing with large knowledge bases and complex queries.
- **Privacy Concerns:** When accessing and processing sensitive information, it's important to consider privacy implications and implement appropriate safeguards.

Overall, RAG is a powerful technique that has the potential to significantly improve the capabilities of LLMs. By combining the strengths of both retrieval and generation, RAG can help create more intelligent and informative AI systems.

In [None]:
%%time


def rag(query, top_k=2, max_length=20, num_beams=5):
    """Answer ``query`` with retrieval-augmented generation.

    The question is embedded with the DPR question encoder, the ``top_k``
    closest documents are looked up in the FAISS ``index``, and the generator
    produces a response from the query followed by the retrieved answers.

    Relies on these notebook-level objects: ``question_tokenizer``,
    ``question_encoder``, ``index``, ``new_df``, ``generator_tokenizer`` and
    ``generator``.

    Args:
        query: The user's question; must be a non-empty string.
        top_k: Number of documents to retrieve (default 2).
        max_length: ``max_length`` passed to ``generator.generate`` (default 20).
        num_beams: Beam width passed to ``generator.generate`` (default 5).

    Returns:
        The decoded generator output as a string.

    Raises:
        ValueError: If ``query`` is not a non-blank string or ``top_k`` < 1.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("query must be a non-empty string")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Embed the question. Truncate like the context side does so that an
    # over-long query cannot exceed the 512-token limit used there.
    question_inputs = question_tokenizer(
        query, return_tensors='pt', truncation=True, max_length=512
    )
    with torch.no_grad():
        question_embedding = question_encoder(**question_inputs).pooler_output.numpy()

    # Retrieve the positions of the top_k most similar documents.
    _, indices = index.search(question_embedding, top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; .iloc[-1] would silently return the last row, so skip those ids.
    # FAISS ids are positions in the embedding matrix, hence .iloc on new_df.
    answers = new_df['answer']
    retrieved_docs = [answers.iloc[idx] for idx in indices[0] if idx >= 0]

    # Generator input: the query followed by the retrieved documents.
    context = " ".join(retrieved_docs)
    input_text = query + " " + context

    inputs = generator_tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # NOTE(review): the default max_length=20 caps the response at 20 tokens,
    # which may cut answers short - confirm this is intended.
    output_ids = generator.generate(
        **inputs, max_length=max_length, num_beams=num_beams, early_stopping=True
    )

    return generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Number of rows per focus area in the sampled frame `new_df`.
new_df['focus_area'].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of rows per focus area in the sampled frame `new_df`, restricted to
# focus areas with more than 15 rows.
focus_area_counts = new_df['focus_area'].value_counts()
focus_areas = focus_area_counts.loc[focus_area_counts > 15].index

fig, ax = plt.subplots(figsize=(20, 8))
sns.set(font_scale=1.2)

ax.pie(
    focus_area_counts.loc[focus_areas],
    labels=focus_areas,
    autopct='%1.1f%%',
)
ax.set(title='Pie Chart of Focus Areas')
ax.axis('equal')  # same scale on both axes so the pie is drawn as a circle

plt.show()

In [None]:
# Demo: run one question through the RAG pipeline and show the exchange.
query = "What is the Breast Cancer?"
response = rag(query)

for label, text in (("Question:", query), ("Response:", response)):
    print(label, text)

In [None]:
# Demo: run one question through the RAG pipeline and show the exchange.
query = "What is COPD?"
response = rag(query)

for label, text in (("Question:", query), ("Response:", response)):
    print(label, text)

In [None]:
# Demo: run one question through the RAG pipeline and show the exchange.
query = "What is Osteoporosis?"
response = rag(query)

for label, text in (("Question:", query), ("Response:", response)):
    print(label, text)

In [None]:
# Demo: same flow as the other queries, but with ANSI styling - labels in bold
# red (1;31), the question and response text in bold cyan (1;36).
query = "What is High Blood Pressure?"
response = rag(query)

print(
    f"\033[1;31mQuestion:\033[0m \033[1;36m{query}\033[0m",
    f"\033[1;31mResponse:\033[0m \033[1;36m{response}\033[0m",
    sep="\n",
)