In [1]:
import pandas as pd
import faiss
import numpy as np
from sklearn.preprocessing import normalize
from transformers import BertTokenizer, BertModel
import torch
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora this notebook relies on (stop-word list and WordNet)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)
from langchain_core.prompts import PromptTemplate
from transformers import GPT2LMHeadModel, GPT2Tokenizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jithi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jithi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the question/answer dataset from its CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
DATASET_CSV = r"C:\Users\jithi\Downloads\conversational_dataset.csv"
data = pd.read_csv(DATASET_CSV)

In [3]:
# Show the loaded DataFrame as this cell's output.
data

Unnamed: 0.1,Unnamed: 0,Question,Answer
0,0,can you recommend effective core ab exercises?,"sure! planks, bicycle crunches, and leg raises..."
1,1,how can i incorporate physical activity into m...,take the stairs instead of the elevator whenev...
2,2,why is physical activity important?,regular physical activity can help maintain yo...
3,3,how can i manage stress and maintain a healthy...,move your body. exercise regularly and engage ...
4,4,i have a busy travel schedule. how can i maint...,"travel can disrupt routines, but it's manageab..."
...,...,...,...
960,960,is 10 minutes of stretching enough?,"if youre warming up for a workout, 5 to 10 min..."
961,961,should stretching be painful?,its normal to feel mild to moderate discomfort...
962,962,is there a wrong way to stretch?,"regularly stretching to the point of pain, try..."
963,963,is it okay to stretch every day?,"not only is it okay to stretch every day, its ..."


In [4]:
# `df` is a second name for the same DataFrame object as `data` (no copy is made).
df=data

In [5]:
# Step 1: Pull the question and answer columns out as plain Python lists
# (the two lists stay aligned by row position).
questions, answers = (df[column].tolist() for column in ('Question', 'Answer'))

# Step 2: Load Stop Words and Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Step 3: Preprocessing Function
def preprocess_text(text):
    """Normalise a piece of text before it is embedded.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, the result is split on whitespace,
    tokens found in the module-level ``stop_words`` set are dropped, and the
    remaining tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The string to normalise.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    # Lower-case first, then strip punctuation in one pass.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    kept_tokens = []
    for token in cleaned.split():
        # Stop words are filtered on the raw token, before lemmatization.
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)

# Run every dataset question through the same normalisation used for queries
preprocessed_questions = list(map(preprocess_text, questions))


In [6]:

# Step 4: Initialize the BERT Tokenizer and Model
# Both are loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:

# Step 5: Function to Encode Questions into BERT Embeddings
def encode_questions(questions):
    """Embed each text with the module-level BERT ``tokenizer`` and ``model``.

    Every text is tokenized and run through the model on its own (one
    forward pass per text, with gradients disabled). The embedding is the
    mean of the model's last hidden state along dimension 1.

    Args:
        questions: Iterable of strings to embed.

    Returns:
        A list with one NumPy array per input text, in input order. Each
        array is expected to have shape (1, hidden_size) - TODO confirm.
    """
    def _embed(sentence):
        encoded = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            # Mean-pool over dimension 1 to get one vector for the text.
            return hidden_states.mean(dim=1).numpy()

    return [_embed(sentence) for sentence in questions]

In [8]:
# Step 6: Encode and Normalize the Preprocessed Question Embeddings
question_embeddings = encode_questions(preprocessed_questions)
# Stack the per-question arrays into one 2-D matrix, then rescale each row
# with scikit-learn's `normalize` (default settings).
stacked_embeddings = np.vstack(question_embeddings)
normalized_embeddings = normalize(stacked_embeddings)

In [9]:
# Step 7: Create a FAISS Index Using Inner Product (Cosine Similarity)
# The inner product of normalized vectors is used as the cosine similarity score.
embedding_dim = normalized_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(normalized_embeddings)  # row i of the index corresponds to questions[i]

In [10]:
# Persist the FAISS index to disk (path is relative to the working directory)
INDEX_PATH = 'faiss_index.index'
faiss.write_index(index, INDEX_PATH)
print("FAISS index saved successfully.")

FAISS index saved successfully.


In [11]:
# Step 8: Search Function Returning Cosine Similarity Scores
def search(query, k=5):
    """Return the stored question/answer pairs most similar to ``query``.

    The query goes through the same pipeline as the indexed questions
    (``preprocess_text`` -> ``encode_questions`` -> ``normalize``) and is then
    looked up in the module-level FAISS ``index``.

    Args:
        query: Free-text user question.
        k: Maximum number of matches to return (default 5).

    Returns:
        A list of ``(question, answer, score)`` tuples in the order FAISS
        returns them, where ``score`` is the inner product reported by the
        index. The list has at most ``k`` entries and is empty when ``k`` is
        smaller than 1 or the index holds no vectors.
    """
    # FAISS pads the result with label -1 when fewer than k vectors are
    # available; used as a Python list index, -1 would silently return the
    # LAST dataset row as a bogus match. Never ask for more than the index has.
    k = min(k, index.ntotal)
    if k < 1:
        return []

    # Preprocess and encode the query exactly like the indexed questions
    preprocessed_query = preprocess_text(query)
    query_embedding = encode_questions([preprocessed_query])[0]
    normalized_query = normalize(query_embedding.reshape(1, -1))  # Normalize the query embedding
    D, I = index.search(normalized_query, k)  # Perform the search

    # Pair each hit with its score; skip any remaining "no result" labels.
    return [
        (questions[i], answers[i], score)
        for score, i in zip(D[0], I[0])
        if i >= 0
    ]

In [23]:
# Example usage: retrieve the closest stored Q&A pairs for a sample query
query = "how can i improve flexibility"
results = search(query)

# Print every returned match with its cosine similarity score, and collect the
# formatted question/answer text in `qa` for use in the LLM prompt.
qa = []
for question, answer, score in results:
    entry = f"Question: {question}\nAnswer: {answer}"
    print(f"{entry}\nCosine Similarity: {score}\n")
    qa.append(entry)

Question: how can i improve my flexibility?
Answer: stretch regularly, practice yoga, and incorporate dynamic stretches before workouts.
Cosine Similarity: 1.0000001192092896

Question: how can i improve flexibility?
Answer: flexibility improves with consistency. dynamic stretches before workouts and static stretches after, help. you can also try yoga or pilates. breathe deeply and be patient. over time, flexibility will increase.
Cosine Similarity: 1.0000001192092896

Question: how can i improve my flexibility if i'm not very flexible?
Answer: improving flexibility starts with gentle stretching exercises targeting major muscle groups. you can try basic stretches like toe touches, shoulder rolls, and hamstring stretches. yoga or pilates classes designed for beginners can also be helpful. consistency is key, so aim to stretch regularly and listen to your body's limits.
Cosine Similarity: 0.8801306486129761

Question: how can i improve my posture?
Answer: good posture is essential. stren

In [27]:
# Show the formatted question/answer strings collected in `qa`.
qa

['Question: how can i improve my flexibility?\nAnswer: stretch regularly, practice yoga, and incorporate dynamic stretches before workouts.',
 'Question: how can i improve flexibility?\nAnswer: flexibility improves with consistency. dynamic stretches before workouts and static stretches after, help. you can also try yoga or pilates. breathe deeply and be patient. over time, flexibility will increase.',
 "Question: how can i improve my flexibility if i'm not very flexible?\nAnswer: improving flexibility starts with gentle stretching exercises targeting major muscle groups. you can try basic stretches like toe touches, shoulder rolls, and hamstring stretches. yoga or pilates classes designed for beginners can also be helpful. consistency is key, so aim to stretch regularly and listen to your body's limits.",
 'Question: how can i improve my posture?\nAnswer: good posture is essential. strengthen core muscles with exercises like planks and bridges. be mindful of your sitting and standing 

In [None]:
# Azure OpenAI setup: replace with your own Azure OpenAI credentials.
import getpass
import os

# Prompt for the API key interactively and store it in the process
# environment under AZURE_OPENAI_API_KEY.
os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import AzureChatOpenAI

# NOTE(review): the endpoint, deployment name and API version are read from
# environment variables that this cell does not set; each os.environ[...]
# lookup raises KeyError if its variable is missing.
# Presumably AzureChatOpenAI picks the API key up from AZURE_OPENAI_API_KEY,
# since it is not passed explicitly here - confirm.
llm = AzureChatOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

In [None]:
# Pass the retrieved question/answer pairs to the LLM in the prompt and let it
# pick the one that best answers the user's query.
#
# The template is a plain (non-f) string: {qa} and {query} are PromptTemplate
# placeholders filled in at invoke time, so braces inside the retrieved text
# cannot be misread as template fields. Adjacent string literals are joined by
# Python, which keeps the two-line prompt valid without a multi-line literal.
prompt = PromptTemplate.from_template(
    "Use the following list of questions and answers {qa} to answer the question at the end. "
    "Pick the most relavent answer based on this user query {query}.\n"
    "If you don't know the answer or the relavent answer not found for the query, "
    "just say that you don't know,strictly don't try to make up an answer."
)

chain = prompt | llm
# invoke() requires the input mapping that supplies every template variable.
answer = chain.invoke({"qa": qa, "query": query})

In [None]:
# Evaluation is covered in a separate Jupyter notebook, rag_eval.