In [30]:
import pickle
with open("scraped_chunks.pkl", "rb") as f:
    data = pickle.load(f)
print(len(data))

299


In [33]:
import requests
import json
import pandas as pd
import re

ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

def generate_qa_pairs(chunk, model=ollama_model_name):
    prompt = f"""
You are an expert assistant skilled at generating question-answer pairs for text comprehension. 
Given a text chunk, generate diverse, relevant, and accurate question-answer pairs that cover the key information in the chunk. 
Your output should be a list of pairs in JSON format, where each pair includes a "question" and its corresponding "answer."

Example:
Chunk: "The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was designed by Gustave Eiffel's engineering company and completed in 1889."

Output:
[
  {{"question": "Where is the Eiffel Tower located?", "answer": "The Eiffel Tower is located on the Champ de Mars in Paris, France."}},
  {{"question": "Who designed the Eiffel Tower?", "answer": "The Eiffel Tower was designed by Gustave Eiffel's engineering company."}},
  {{"question": "When was the Eiffel Tower completed?", "answer": "The Eiffel Tower was completed in 1889."}}
]

Now, process the following chunk:
Chunk: {chunk}
"""
    
    payload = {"model": model, "prompt": prompt, "stream": False}
    headers = {"Content-Type": "application/json"}
    response = requests.post(ollama_url_gen, headers=headers, data=json.dumps(payload))
    
    if response.status_code != 200:
        raise Exception(f"Error from Ollama: {response.text}")
    
    content = response.json().get("response", "")
    try:
        json_match = re.search(r"\[.*?\]", content, re.DOTALL)
        if json_match:
            raw_json = json_match.group()
            sanitized_json = re.sub(r",\s*]", "]", raw_json)
            qa_pairs = json.loads(sanitized_json)
            return qa_pairs
        else:
            raise Exception(f"Valid JSON not found in response: {content}")
    except json.JSONDecodeError as e:
        raise Exception(f"Failed to parse JSON response: {content}")

In [43]:
df_chunks = pd.DataFrame(data)[1:3]

In [44]:
df_chunks

Unnamed: 0,0,1
1,List | War | Death range | Date | Combatants |...,https://en.wikipedia.org/wiki/List_of_wars_by_...
2,List | War | Death range | Date | Combatants |...,https://en.wikipedia.org/wiki/List_of_wars_by_...


In [45]:
from tqdm.notebook import tqdm

qa_data = []
for _, row in tqdm(df_chunks.iterrows(), total=len(df_chunks)):
    chunk_id = row[1]
    chunk_text = row[0]
    try:
        qa_pairs = generate_qa_pairs(chunk_text)
        for pair in qa_pairs:
            qa_data.append({
                "chunk_id": chunk_id,
                "question": pair["question"],
                "answer": pair["answer"]
            })
    except Exception as e:
        print(f"Error processing chunk {chunk_id}: {e}")

df_qa = pd.DataFrame(qa_data)
df_qa.to_csv("qa_pairs.csv", index=False)
print("QA pairs saved to qa_pairs.csv")

  0%|          | 0/2 [00:00<?, ?it/s]

QA pairs saved to qa_pairs.csv


In [46]:
df_qa

Unnamed: 0,chunk_id,question,answer
0,https://en.wikipedia.org/wiki/List_of_wars_by_...,What was the approximate number of deaths caus...,50-85 million
1,https://en.wikipedia.org/wiki/List_of_wars_by_...,What were the combatants involved in World War...,Allied Powers vs. Axis Powers
2,https://en.wikipedia.org/wiki/List_of_wars_by_...,When did World War II take place?,1939–1945
3,https://en.wikipedia.org/wiki/List_of_wars_by_...,Which location was affected by World War II?,Global
4,https://en.wikipedia.org/wiki/List_of_wars_by_...,What were the approximate number of deaths cau...,20-60 million
5,https://en.wikipedia.org/wiki/List_of_wars_by_...,Who did the Mongols invade and conquer during ...,Mongol Empire vs. various states in Eurasia
6,https://en.wikipedia.org/wiki/List_of_wars_by_...,Where did the Mongol invasions and conquests t...,Asia and Europe
7,https://en.wikipedia.org/wiki/List_of_wars_by_...,What was the approximate number of deaths caus...,34 million
8,https://en.wikipedia.org/wiki/List_of_wars_by_...,Which dates mark the beginning and end of the ...,220–280
9,https://en.wikipedia.org/wiki/List_of_wars_by_...,Who were the various sides involved in the Thr...,Multiple sides


In [47]:
import requests
import json
import pandas as pd
import re
from tqdm.notebook import tqdm
import pickle

ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"


def generate_qa_pairs(chunk, model=ollama_model_name):
    prompt = f"""
You are an expert assistant skilled at generating question-answer pairs for text comprehension. 
Given a text chunk, generate diverse, relevant, and accurate question-answer pairs that cover the key information in the chunk. 
Your output should be a list of pairs in JSON format, where each pair includes a "question" and its corresponding "answer."

Example:
Chunk: "The Eiffel Tower is a wrought-iron lattice tower located on the Champ de Mars in Paris, France. It was designed by Gustave Eiffel's engineering company and completed in 1889."

Output:
[
  {{"question": "Where is the Eiffel Tower located?", "answer": "The Eiffel Tower is located on the Champ de Mars in Paris, France."}},
  {{"question": "Who designed the Eiffel Tower?", "answer": "The Eiffel Tower was designed by Gustave Eiffel's engineering company."}},
  {{"question": "When was the Eiffel Tower completed?", "answer": "The Eiffel Tower was completed in 1889."}}
]

Now, process the following chunk:
Chunk: {chunk}
"""
    payload = {"model": model, "prompt": prompt, "stream": False}
    headers = {"Content-Type": "application/json"}
    response = requests.post(ollama_url_gen, headers=headers, data=json.dumps(payload))
    
    if response.status_code != 200:
        raise Exception(f"Error from Ollama: {response.text}")
    
    content = response.json().get("response", "")
    try:
        json_match = re.search(r"\[.*?\]", content, re.DOTALL)
        if json_match:
            raw_json = json_match.group()
            sanitized_json = re.sub(r",\s*]", "]", raw_json)
            qa_pairs = json.loads(sanitized_json)
            return qa_pairs
        else:
            raise Exception(f"Valid JSON not found in response: {content}")
    except json.JSONDecodeError as e:
        raise Exception(f"Failed to parse JSON response: {content}")


with open("scraped_chunks.pkl", "rb") as f:
    data = pickle.load(f)[1:3]
    
df_chunks = pd.DataFrame(data, columns=["chunk_text", "url"])

rag_chunks = []
for _, row in tqdm(df_chunks.iterrows(), total=len(df_chunks)):
    chunk_text = row["chunk_text"]
    url = row["url"]
    try:
        qa_pairs = generate_qa_pairs(chunk_text)
        for pair in qa_pairs:
            new_chunk = {
                "text": f"Q: {pair['question']} A: {pair['answer']}",
                "url": url
            }
            rag_chunks.append(new_chunk)
    except Exception as e:
        print(f"Error processing chunk: {e}")

df_rag_chunks = pd.DataFrame(rag_chunks)
df_rag_chunks.to_pickle("rag_chunks.pkl")
df_rag_chunks.to_csv("rag_chunks.csv", index=False)

print("RAG chunks saved to 'rag_chunks.pkl' and 'rag_chunks.csv'")


  0%|          | 0/2 [00:00<?, ?it/s]

Error processing chunk: Valid JSON not found in response: Here are the question-answer pairs for the given chunk in JSON format:

[
  {"question": "What was the estimated number of deaths in the crusades?", "answer": "1 – 9 million"},
  {"question": "Where did the crusades take place?", "answer": "The crusades took place in Europe and the Middle East."},
  {"question": "Who were the main combatants in the crusades?", "answer": "Originally byzantine empire vs. seljuk empire, but evolved into christians vs. muslims"},
  {"question": "When did the crusades occur?", "answer": "The crusades occurred from 1095 – 1291"},
  {"question": "What was the name of the conflict that followed the Reconquista?", "answer": "Reconquista"},
  {"question": "How many people were killed in the Reconquista?", "answer": "7 million"},
  {"question": "Where did the Reconquista take place?", "answer": "The Reconquista took place in the Iberian Peninsula"},
  {"question": "Who were the main combatants in the Recon

In [48]:
df_rag_chunks

Unnamed: 0,text,url
0,Q: What were the estimated death ranges for Wo...,https://en.wikipedia.org/wiki/List_of_wars_by_...
1,Q: When did World War II take place? A: World ...,https://en.wikipedia.org/wiki/List_of_wars_by_...
2,Q: Which sides fought in World War II? A: Alli...,https://en.wikipedia.org/wiki/List_of_wars_by_...
3,Q: What was the estimated death range for Mong...,https://en.wikipedia.org/wiki/List_of_wars_by_...
4,Q: When did the Mongol invasions and conquests...,https://en.wikipedia.org/wiki/List_of_wars_by_...
5,Q: Which sides fought in the Mongol invasions ...,https://en.wikipedia.org/wiki/List_of_wars_by_...
6,Q: What was the estimated death range for Thre...,https://en.wikipedia.org/wiki/List_of_wars_by_...
7,Q: When did the Three Kingdoms take place? A: ...,https://en.wikipedia.org/wiki/List_of_wars_by_...
8,Q: Which sides fought in the Three Kingdoms? A...,https://en.wikipedia.org/wiki/List_of_wars_by_...
9,Q: What was the estimated death range for Taip...,https://en.wikipedia.org/wiki/List_of_wars_by_...
