In [5]:
import openai
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from dotenv import load_dotenv
import os

In [20]:
# Load the OpenAI credentials from the local .env file into the process
# environment, then fail fast if the key is still missing. Without this check a
# missing/misnamed key only shows up later as one failed request per row.
load_dotenv("/Users/priyal/Documents/honours/.env/key.env")
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; check the .env file path or export the "
        "variable before running the notebook."
    )
openai.api_key = _openai_api_key

In [21]:
def generate_question(
    abstract,
    *,
    client=None,
    model="gpt-4",
    temperature=0.7,
    max_tokens=50,
):
    """Generate a plain-language health question from a paper abstract.

    Args:
        abstract: Abstract text to base the question on. Must be a non-blank
            string; anything else (e.g. a NaN from pandas) is rejected without
            calling the API.
        client: Optional object exposing ``chat.completions.create`` (such as an
            ``openai.OpenAI`` instance). Pass one to reuse a single client across
            many calls; when omitted a new ``openai.OpenAI()`` is created.
        model: Chat model name sent with the request.
        temperature: Sampling temperature sent with the request.
        max_tokens: Completion token limit sent with the request.

    Returns:
        The generated question with surrounding whitespace removed, or ``None``
        if the input was unusable, the request failed, or the reply had no text.
        Failures are reported on stdout rather than raised.
    """
    if not isinstance(abstract, str) or not abstract.strip():
        print("Error generating question: abstract is empty or not text")
        return None

    prompt = (
        "Based on the following abstract, generate a simple and natural-sounding "
        "question that an average person might ask to get practical health advice. "
        "Avoid technical or research-specific phrasing.\n\n"
        f"Abstract: {abstract}\n\n"
        "Example questions:\n"
        "- What are some natural ways to relieve joint pain?\n"
        "- How can Ayurveda help with digestion problems?\n"
        "- What herbs are good for reducing stress?\n\n"
        "Generate a similar question based on the abstract above."
    )

    try:
        # Client construction stays inside the try block so configuration
        # problems (e.g. a missing API key) are reported like request failures.
        if client is None:
            client = openai.OpenAI()
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort by design: the caller skips rows for which None is returned.
        print(f"Error generating question: {e}")
        return None

    # The message content can be None or blank; report that explicitly instead
    # of failing on `.strip()`.
    if not content or not content.strip():
        print("Error generating question: model returned no text")
        return None
    return content.strip()

In [22]:
def generate_answer(
    abstract,
    title,
    *,
    client=None,
    model="gpt-4",
    temperature=0.7,
    max_tokens=150,
):
    """Generate an answer that summarises the remedy in an abstract and cites the paper.

    Args:
        abstract: Abstract text to extract the remedy information from. Must be
            a non-blank string.
        title: Paper title used for the "According to [Paper Title], ..."
            citation. Must be a non-blank string; a missing title (e.g. a NaN
            from pandas) is rejected so the answer never cites "nan".
        client: Optional object exposing ``chat.completions.create`` (such as an
            ``openai.OpenAI`` instance). Pass one to reuse a single client across
            many calls; when omitted a new ``openai.OpenAI()`` is created.
        model: Chat model name sent with the request.
        temperature: Sampling temperature sent with the request.
        max_tokens: Completion token limit sent with the request.

    Returns:
        The generated answer with surrounding whitespace removed, or ``None`` if
        an input was unusable, the request failed, or the reply had no text.
        Failures are reported on stdout rather than raised.
    """
    if not isinstance(abstract, str) or not abstract.strip():
        print("Error generating answer: abstract is empty or not text")
        return None
    if not isinstance(title, str) or not title.strip():
        print("Error generating answer: title is empty or not text")
        return None

    prompt = (
        "Based on the following abstract, extract the key Ayurvedic remedy information "
        "that answers potential user queries. Include the research paper title as a citation "
        "in the answer in the following format: 'According to [Paper Title], ...'.\n\n"
        f"Abstract: {abstract}\n\nPaper Title: {title}\n\nAnswer:"
    )

    try:
        # Client construction stays inside the try block so configuration
        # problems (e.g. a missing API key) are reported like request failures.
        if client is None:
            client = openai.OpenAI()
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort by design: the caller skips rows for which None is returned.
        print(f"Error generating answer: {e}")
        return None

    # The message content can be None or blank; report that explicitly instead
    # of failing on `.strip()`.
    if not content or not content.strip():
        print("Error generating answer: model returned no text")
        return None
    return content.strip()

In [23]:
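# NOTE: this helper is disabled. It still calls the pre-1.0 `openai.ChatCompletion`
# interface, whereas the active cells use the `openai.OpenAI()` client; port it to
# `client.chat.completions.create(...)` before re-enabling.
# def paraphrase_text(text):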
#     """
#     Uses GPT-4 to paraphrase text, which is useful for data augmentation.
#     """
#     prompt = (
#         "Paraphrase the following text to create a variation in phrasing while keeping "
#         "the meaning intact:\n\n"
#         f"{text}\n\nParaphrased version:"
#     )
#     try:
#         response = openai.ChatCompletion.create(
#             model="gpt-4",
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.8,
#             max_tokens=150,
#         )
#         paraphrased = response.choices[0].message["content"].strip()
#         return paraphrased
#     except Exception as e:
#         print(f"Error paraphrasing text: {e}")
#         return None

In [24]:
# def augment_example(instruction, output):
#     """
#     Creates an augmented example by paraphrasing the question (instruction) and the answer (output).
#     """
#     augmented_instruction = paraphrase_text(instruction)
#     augmented_output = paraphrase_text(output)
#     return augmented_instruction, augmented_output

In [25]:
def main(input_csv, output_csv, max_rows=8137):
    """Build a question/answer dataset from a CSV of paper titles and abstracts.

    For each usable row, ``generate_question`` and ``generate_answer`` (defined
    earlier in this notebook) are called; rows for which either returns a falsy
    value are skipped.

    Args:
        input_csv: Path to a CSV containing "Title" and "Abstract" columns.
        output_csv: Path the resulting dataset is written to as CSV. Pass a
            falsy value (``None`` or ``""``) to skip writing.
        max_rows: Maximum number of usable rows to process, counted after rows
            with missing fields are dropped. ``None`` processes every row.

    Returns:
        A list of dicts with the keys "instruction" (question), "input"
        (abstract) and "output" (answer), in input order.
    """
    columns = ["instruction", "input", "output"]

    papers = pd.read_csv(input_csv)
    # The title is required for the citation in the answer, so rows missing it
    # are dropped along with rows missing the abstract.
    papers = papers.dropna(subset=["Abstract", "Title"])
    if max_rows is not None:
        papers = papers.head(max_rows)

    dataset = []
    for title, abstract in zip(papers["Title"], papers["Abstract"]):
        question = generate_question(abstract)
        if not question:
            continue

        answer = generate_answer(abstract, title)
        if not answer:
            continue

        dataset.append({"instruction": question, "input": abstract, "output": answer})

    print(f"Processed {len(dataset)} QA pairs")

    if output_csv:
        # Explicit columns keep the header and column order stable even when no
        # QA pair could be generated.
        pd.DataFrame(dataset, columns=columns).to_csv(output_csv, index=False)

    return dataset

In [26]:
# Source CSV of extracted papers (read by `main`, which expects "Title" and
# "Abstract" columns) and the destination path for the generated QA dataset.
input_csv = "/Users/priyal/Documents/honours/datasets/extracted_papers_info.csv"
output_csv = "/Users/priyal/Documents/honours/datasets/qa_dataset_bigger_normal.csv"

# Run the generation pipeline and keep the resulting list of QA records in
# `dataset` for the cells that follow.
dataset = main(input_csv=input_csv, output_csv=output_csv)

Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error generating question: Connection error.
Error gene

In [27]:
# Persist the generated QA pairs. Columns are listed explicitly so the CSV keeps
# its header and column order even when `dataset` is empty (for example after a
# run in which every API request failed).
dataset_df = pd.DataFrame(dataset, columns=["instruction", "input", "output"])
if dataset_df.empty:
    print("Warning: no QA pairs were generated; writing a header-only CSV")
dataset_df.to_csv(output_csv, index=False)
print(f"QA dataset saved to {output_csv}")

QA dataset saved to /Users/priyal/Documents/honours/datasets/qa_dataset_bigger_normal.csv
