In [1]:
from dotenv import load_dotenv
import os
import sys
import logging

import pandas as pd
from llama_index.core import StorageContext, load_index_from_storage

from llama_index.core.retrievers import VectorIndexRetriever

from llama_index.core import Settings

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore
from llama_index.core import get_response_synthesizer


import openai

import nest_asyncio
nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.llms.openai import OpenAI
from IPython.display import Markdown, display

from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import QueryBundle
from llama_index.postprocessor.rankgpt_rerank import RankGPTRerank

import pandas as pd
from IPython.display import display, HTML


load_dotenv()  # Load environment variables from .env file
openai.api_key = os.getenv("OPENAI_API_KEY")

OPENAI_API_KEY = openai.api_key

Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")


logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))


%load_ext autoreload
%autoreload 2




In [2]:
# Load the Excel file
file_path = "../data/processed_data/df_cluster.xlsx"
df = pd.read_excel(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Unnamed: 0                 44 non-null     int64         
 1   Country                    44 non-null     object        
 2   Requester_Type             44 non-null     object        
 3   Product                    44 non-null     object        
 4   Indication                 44 non-null     object        
 5   Question                   44 non-null     object        
 6   Channel                    44 non-null     object        
 7   Date_Time_Open             44 non-null     datetime64[ns]
 8   Date_Time_Closed           44 non-null     datetime64[ns]
 9   Answer_Solution            44 non-null     object        
 10  Duration                   44 non-null     int64         
 11  Day_of_Week                44 non-null     object        
 12  Week_Numbe

In [18]:
from dotenv import load_dotenv
import os
import logging
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
import json

# Load environment variables from .env file
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Configure logging
logging.basicConfig(level=logging.INFO)

class QuestionAugmenter:
    def __init__(self, temperature=0, model="gpt-3.5-turbo"):
        # Set the LLM settings
        Settings.llm = OpenAI(temperature=temperature, model=model)
        self.prompt = """
        You are an AI assistant designed to help generate insightful questions based on a given medical topic, 
        specifically focusing on Keytruda, a medication used in cancer treatment. 
        Your task is to generate new questions that maintain the same core meaning, context, 
        and intent as the original question while providing unique perspectives or phrasings.

        Please follow these guidelines strictly:
        1. Retain the main topic and medical context of the original question.
        2. Do not introduce new information or change the clinical meaning.
        3. Ensure each question addresses the same subject matter in different words, focusing on Keytruda.
        4. Use precise medical language consistent with the original question.

        Here is an example based on these guidelines:

        Given the question: "What are the common side effects of Keytruda?"
        
        Generate 5 unique augmented questions in this JSON format:

        {{
            "question_1": "What are the most frequently reported side effects of Keytruda?",
            "question_2": "Which side effects are commonly associated with Keytruda treatment?",
            "question_3": "What adverse effects should patients be aware of when taking Keytruda?",
            "question_4": "How often do patients experience side effects while on Keytruda?",
            "question_5": "What are the typical side effects observed in patients using Keytruda?"
        }}

        Notice how each augmented question maintains the focus on the side effects of Keytruda 
        without altering the original medical context or introducing unrelated topics.

        Now, based on the following question:
        "{question}"
        
        Please provide {num_questions} augmented questions in the same format, ensuring they preserve the original meaning and medical context.
        """

    def generate_augmented_questions(self, question, num_questions=5):
        # Correctly format the prompt with the actual question and number of questions
        prompt = self.prompt.format(question=question, num_questions=num_questions)
        
        # Use the LLM to generate responses
        response = Settings.llm.complete(prompt)
        
        # Parse JSON into a dictionary
        try:
            augmented_questions_dict = json.loads(response.text)
        except json.JSONDecodeError:
            logging.error("Failed to parse JSON. Please check the response format.")
            return []

        # Extract the questions into a list
        augmented_questions = list(augmented_questions_dict.values())
        
        return augmented_questions


In [19]:
row = df.iloc[0]
original_question = row["Question"]
print(original_question)
augmenter = QuestionAugmenter()
augmented_questions = augmenter.generate_augmented_questions(original_question)
augmented_questions


What are the common side effects of Keytruda?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


['What are the frequently encountered adverse reactions of Keytruda?',
 'Which side effects are typically seen in patients receiving Keytruda therapy?',
 'What are the usual side effects associated with Keytruda treatment?',
 'How common are the side effects experienced by individuals using Keytruda?',
 'What are the standard side effects reported in patients treated with Keytruda?']

In [20]:
df["augmented_questions"] = df.apply(lambda row: augmenter.generate_augmented_questions(row["Question"]) , axis = 1)


df = df.explode("augmented_questions")
df.reset_index(drop=True, inplace=True)

df.to_parquet("../data/processed_data/df_eval.parquet")

df.head




INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 

<bound method NDFrame.head of      Unnamed: 0 Country Requester_Type   Product Indication  \
0             0      UK            HCP  Keytruda      NSCLC   
1             0      UK            HCP  Keytruda      NSCLC   
2             0      UK            HCP  Keytruda      NSCLC   
3             0      UK            HCP  Keytruda      NSCLC   
4             0      UK            HCP  Keytruda      NSCLC   
..          ...     ...            ...       ...        ...   
215          49  Canada     Pharmacist  Keytruda      NSCLC   
216          49  Canada     Pharmacist  Keytruda      NSCLC   
217          49  Canada     Pharmacist  Keytruda      NSCLC   
218          49  Canada     Pharmacist  Keytruda      NSCLC   
219          49  Canada     Pharmacist  Keytruda      NSCLC   

                                              Question Channel  \
0        What are the common side effects of Keytruda?   email   
1        What are the common side effects of Keytruda?   email   
2        What a

In [22]:
row = df.iloc[10]
print(row["Question"])
print(row["augmented_questions"])


Is Keytruda safe for pregnant women?
What is the safety profile of Keytruda in pregnant women?
