In [27]:
import requests
import json
import pandas as pd
import re

# Endpoint of the local Ollama text-generation API and the model used by default.
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

# Matches a comma that directly precedes a closing bracket/brace (invalid in strict JSON).
_TRAILING_COMMA_RE = re.compile(r",\s*([\]}])")


def _matching_bracket(text, start):
    """Return the index of the ``]`` closing the ``[`` at ``start``, or -1 if it never closes."""
    depth = 0
    in_string = False
    escaped = False
    for index in range(start, len(text)):
        char = text[index]
        if in_string:
            # Brackets inside a JSON string are data, not structure.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
            if depth == 0:
                return index
    return -1


def _iter_array_candidates(text):
    """Yield each top-level bracket-balanced ``[...]`` span of ``text``, left to right."""
    pos = text.find("[")
    while pos != -1:
        end = _matching_bracket(text, pos)
        if end == -1:
            # This "[" never closes; a later one may still open a complete array.
            pos = text.find("[", pos + 1)
        else:
            yield text[pos:end + 1]
            # Resume after the span so fragments nested inside it are never returned.
            pos = text.find("[", end + 1)


def _loads_lenient(raw_json):
    """Decode ``raw_json``, tolerating trailing commas; return None if it is not JSON."""
    try:
        return json.loads(raw_json)
    except json.JSONDecodeError:
        pass
    # Only rewrite the text once strict parsing has failed, so valid JSON is never altered.
    try:
        return json.loads(_TRAILING_COMMA_RE.sub(r"\1", raw_json))
    except json.JSONDecodeError:
        return None


def generate_qa_pairs(chunk: str, model: str = ollama_model_name, timeout=None) -> list:
    """Ask the Ollama model for question-answer pairs covering ``chunk``.

    Args:
        chunk: Text to generate question-answer pairs for.
        model: Name of the Ollama model to query.
        timeout: Optional timeout in seconds forwarded to ``requests.post``;
            ``None`` (the default) waits indefinitely.

    Returns:
        The first JSON array found in the model's reply, decoded into a list.
        The prompt asks for objects with "question" and "answer" keys, but the
        items are returned as the model produced them, without validation.

    Raises:
        RuntimeError: If Ollama answers with a non-200 status code.
        ValueError: If the reply contains no JSON array, or none that can be decoded.
    """
    prompt = f"""
You are an expert assistant skilled at generating question-answer pairs for text comprehension. 
Given a text chunk, generate diverse, relevant, and accurate question-answer pairs that cover the key information in the chunk. 
Your output should be a list of pairs in JSON format, where each pair includes a "question" and its corresponding "answer."

Example:
Chunk: "The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was designed by Gustave Eiffel's engineering company and completed in 1889."

Output:
[
  {{"question": "Where is the Eiffel Tower located?", "answer": "The Eiffel Tower is located on the Champ de Mars in Paris, France."}},
  {{"question": "Who designed the Eiffel Tower?", "answer": "The Eiffel Tower was designed by Gustave Eiffel's engineering company."}},
  {{"question": "When was the Eiffel Tower completed?", "answer": "The Eiffel Tower was completed in 1889."}}
]

Now, process the following chunk:
Chunk: {chunk}
"""

    payload = {"model": model, "prompt": prompt, "stream": False}
    # `json=` serialises the payload and sets the Content-Type header for us.
    response = requests.post(ollama_url_gen, json=payload, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Error from Ollama: {response.text}")

    content = response.json().get("response", "")

    # The model may wrap the array in prose, so try each balanced span in turn.
    found_candidate = False
    for raw_json in _iter_array_candidates(content):
        found_candidate = True
        qa_pairs = _loads_lenient(raw_json)
        if qa_pairs is not None:
            return qa_pairs

    if found_candidate:
        raise ValueError(f"Failed to parse JSON response: {content}")
    raise ValueError(f"Valid JSON not found in response: {content}")

# Sample chunk data
data = {"chunk_id": [1, 2], "chunk_text": ["This is a sample test. Bangladesh was liberated in 1971.", "Mujibur was the main leader for Liberation of Bangladesh."]}
df_chunks = pd.DataFrame(data)

# Fixed output schema: keeps the CSV header present even when no pair was generated.
qa_columns = ["chunk_id", "question", "answer"]

# Collect one record per generated pair, tagged with the chunk it came from.
qa_data = []
for chunk_id, chunk_text in zip(df_chunks["chunk_id"], df_chunks["chunk_text"]):
    try:
        qa_pairs = generate_qa_pairs(chunk_text)
        for pair in qa_pairs:
            qa_data.append({
                "chunk_id": chunk_id,
                "question": pair["question"],
                "answer": pair["answer"]
            })
    except Exception as e:
        # Best effort: report the failing chunk and carry on with the remaining ones.
        print(f"Error processing chunk {chunk_id}: {e}")

# Without explicit columns an empty `qa_data` would produce a header-less CSV
# that a later `pd.read_csv` cannot load.
df_qa = pd.DataFrame(qa_data, columns=qa_columns)
df_qa.to_csv("qa_pairs.csv", index=False)
print("QA pairs saved to qa_pairs.csv")

QA pairs saved to qa_pairs.csv


In [28]:
# Display the collected QA pairs (one row per generated question) as the cell's output.
df_qa

Unnamed: 0,chunk_id,question,answer
0,1,This is a sample test.,This is a sample test.
1,1,When was Bangladesh liberated?,Bangladesh was liberated in 1971.
2,2,Who was the main leader for the Liberation of ...,Mujibur was the main leader for Liberation of ...
