In [19]:
# Step 1: Setup and Installation

# Install required libraries
!pip install PyPDF2 pdfplumber transformers torch datasets openai python-dotenv
!pip install sentence-transformers
!pip install --upgrade openai

# Import libraries
import json
import PyPDF2
import pdfplumber
import re
from typing import List, Dict
import pandas as pd
from datetime import datetime



In [20]:
# Step 2: Upload and Extract PDF Text

# Upload PDF file to Colab
from google.colab import files
uploaded = files.upload()

# Get the filename
pdf_filename = list(uploaded.keys())[0]
print(f"Uploaded file: {pdf_filename}")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    text_content = []

    # Using pdfplumber for better text extraction
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        print(f"Total pages: {total_pages}")

        for page_num, page in enumerate(pdf.pages):
            try:
                text = page.extract_text()
                if text:
                    text_content.append({
                        'page': page_num + 1,
                        'text': text.strip()
                    })

                # Progress indicator
                if (page_num + 1) % 50 == 0:
                    print(f"Processed {page_num + 1}/{total_pages} pages")

            except Exception as e:
                print(f"Error processing page {page_num + 1}: {e}")
                continue

    return text_content

# Extract text
print("Extracting text from PDF...")
extracted_text = extract_text_from_pdf(pdf_filename)
print(f"Successfully extracted text from {len(extracted_text)} pages")

Saving Insurance_Handbook_20103.pdf to Insurance_Handbook_20103 (2).pdf
Uploaded file: Insurance_Handbook_20103 (2).pdf
Extracting text from PDF...
Total pages: 205
Processed 50/205 pages
Processed 100/205 pages
Processed 150/205 pages
Processed 200/205 pages
Successfully extracted text from 205 pages


In [21]:
# Step 3: Text Preprocessing and Chunking

# Function to clean and chunk text
def preprocess_and_chunk_text(text_data, chunk_size=500, overlap=200):
    chunks = []

    for page_data in text_data:
        page_num = page_data['page']
        text = page_data['text']

        # Clean text
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = re.sub(r'\n+', '\n', text)  # Remove extra newlines
        text = text.strip()

        # Skip very short texts
        if len(text) < 100:
            continue

        # Split into chunks
        words = text.split()
        for i in range(0, len(words), chunk_size - overlap):
            chunk_words = words[i:i + chunk_size]
            chunk_text = ' '.join(chunk_words)

            if len(chunk_text.strip()) > 50:  # Only keep meaningful chunks
                chunks.append({
                    'page': page_num,
                    'chunk_id': len(chunks),
                    'text': chunk_text,
                    'word_count': len(chunk_words)
                })

    return chunks

# Process and chunk the text
print("Processing and chunking text...")
text_chunks = preprocess_and_chunk_text(extracted_text)
print(f"Created {len(text_chunks)} text chunks")

Processing and chunking text...
Created 327 text chunks


In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Set the API key

import openai
filepath = "/content/drive/MyDrive/Master Thesis/Colab Notebook/"

with open(filepath + "OpenAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [None]:
# Step 4: Generate Questions and Answers (old code for older versions of Open AI)
# Option A: Using OpenAI API

import openai
from google.colab import userdata

# Get API key from Colab secrets
openai.api_key = userdata.get('OPENAI_API_KEY')  # Add API key to Colab secrets


def generate_qa_with_openai(text_chunk, num_questions=5):
    prompt = f"""
Based on the following text, generate {num_questions} question-answer pairs.
Make the questions diverse (factual, analytical, conceptual) and ensure answers are accurate and complete.

Text: {text_chunk}

Format your response as JSON:
{{
  "qa_pairs": [
    {{"question": "question here", "answer": "answer here"}},
    {{"question": "question here", "answer": "answer here"}}
  ]
}}
"""

    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=1500
        )

        result = json.loads(response.choices[0].message.content)
        return result.get('qa_pairs', [])

    except Exception as e:
        print(f"Error generating QA: {e}")
        return []

In [30]:
# Step 4: Generate Questions and Answers (new code for newer versions of OpenAI)
# Option A: Using OpenAI API v1.x.x

import openai
import json
from openai import OpenAI
from google.colab import userdata

# ✅ Set API key (retrieved securely from Colab Secrets)
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

def generate_qa_with_openai(text_chunk, num_questions=5):
    prompt = f"""
Based on the following text, generate {num_questions} question-answer pairs.
Make the questions diverse (factual, analytical, conceptual) and ensure answers are accurate and complete.

Text: {text_chunk}

Format your response as JSON:
{{
  "qa_pairs": [
    {{"question": "question here", "answer": "answer here"}},
    {{"question": "question here", "answer": "answer here"}}
  ]
}}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            #model="gpt-4o",
            #model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            #temperature=0.7,
            temperature=0.4,
            max_tokens=1500
        )

        content = response.choices[0].message.content.strip()

        # Parse the JSON block from the model response
        result = json.loads(content)
        return result.get("qa_pairs", [])

    except Exception as e:
        print(f"Error generating QA: {e}")
        return []


In [27]:
# Option B: Using Hugging Face Transformers (Free alternative)

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Load question generation model
question_generator = pipeline("text2text-generation",
                            model="valhalla/t5-base-qg-hl",
                            tokenizer="valhalla/t5-base-qg-hl")

def generate_qa_with_transformers(text_chunk, num_questions=5):
    qa_pairs = []

    # Split text into sentences for better question generation
    sentences = text_chunk.split('. ')

    for i, sentence in enumerate(sentences[:num_questions]):
        if len(sentence.strip()) < 20:
            continue

        try:
            # Generate question
            # This creates a prompt for the AI model. The model is trained to generate questions when it sees "generate question: [text]"
            input_text = f"generate question: {sentence}"
            question_result = question_generator(input_text,
                                               max_length=100,
                                               num_return_sequences=1)
            question = question_result[0]['generated_text'].strip()

            # Use the original sentence/context as answer
            answer = sentence.strip()

            if question and answer:
                qa_pairs.append({
                    "question": question,
                    "answer": answer
                })

        except Exception as e:
            print(f"Error generating question for sentence {i}: {e}")
            continue

    return qa_pairs

Device set to use cuda:0


In [31]:
# Step 5: Generate Q&A Dataset (openai or transformers)

def create_qa_dataset(text_chunks, method='openai', batch_size=10):
    dataset = []

    print(f"Generating Q&A pairs for {len(text_chunks)} chunks...")
    print("Method used", method)

    for i, chunk in enumerate(text_chunks):
        try:
            # Choose generation method
            if method == 'openai':
                qa_pairs = generate_qa_with_openai(chunk['text'])
            else:
                qa_pairs = generate_qa_with_transformers(chunk['text'])

            # Add metadata to each Q&A pair
            for qa in qa_pairs:
                dataset.append({
                    #'id': len(dataset),
                    'input': qa['question'],
                    'output': qa['answer'],
                    #'source_page': chunk['page'],
                    #'chunk_id': chunk['chunk_id'],
                    #'context': chunk['text'][:500] + "..." if len(chunk['text']) > 500 else chunk['text']
                })

            # Progress update
            if (i + 1) % batch_size == 0:
                print(f"Processed {i + 1}/{len(text_chunks)} chunks. Generated {len(dataset)} Q&A pairs so far.")

        except Exception as e:
            print(f"Error processing chunk {i}: {e}")
            continue

    return dataset

# Generate the dataset
qa_dataset = create_qa_dataset(text_chunks[:], method='openai')  # Start with first 100 chunks
print(f"Generated {len(qa_dataset)} question-answer pairs")

Generating Q&A pairs for 327 chunks...
Method used openai
Processed 10/327 chunks. Generated 50 Q&A pairs so far.
Processed 20/327 chunks. Generated 100 Q&A pairs so far.
Processed 30/327 chunks. Generated 150 Q&A pairs so far.
Processed 40/327 chunks. Generated 200 Q&A pairs so far.
Processed 50/327 chunks. Generated 250 Q&A pairs so far.
Processed 60/327 chunks. Generated 300 Q&A pairs so far.
Processed 70/327 chunks. Generated 350 Q&A pairs so far.
Processed 80/327 chunks. Generated 400 Q&A pairs so far.
Processed 90/327 chunks. Generated 450 Q&A pairs so far.
Processed 100/327 chunks. Generated 500 Q&A pairs so far.
Error generating QA: Expecting value: line 8 column 3 (char 1660)
Processed 110/327 chunks. Generated 545 Q&A pairs so far.
Processed 120/327 chunks. Generated 595 Q&A pairs so far.
Processed 130/327 chunks. Generated 645 Q&A pairs so far.
Processed 140/327 chunks. Generated 695 Q&A pairs so far.
Processed 150/327 chunks. Generated 745 Q&A pairs so far.
Processed 160/32

In [32]:
# Step 6: Save Dataset as JSON
# Create final dataset structure
final_dataset = {
    'metadata': {
        'source_document': pdf_filename,
        'total_pages': len(extracted_text),
        'total_chunks_processed': len(text_chunks),
        'total_qa_pairs': len(qa_dataset),
        'generation_date': datetime.now().isoformat(),
        'generation_method': 'openai' # or 'transformers'
    },
    'qa_pairs': qa_dataset
}

# Save to JSON file
output_filename = f"qa_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(final_dataset, f, indent=2, ensure_ascii=False)

print(f"Dataset saved as: {output_filename}")

# Download the file
files.download(output_filename)

Dataset saved as: qa_dataset_20250715_145607.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Step 7: Quality Check and Statistics. Analyze the generated dataset

def analyze_dataset(dataset):
    qa_pairs = dataset['qa_pairs']

    print("=== Dataset Analysis ===")
    print(f"Total Q&A pairs: {len(qa_pairs)}")

    # Question length statistics
    q_lengths = [len(qa['input'].split()) for qa in qa_pairs]
    print(f"Average question length: {sum(q_lengths)/len(q_lengths):.1f} words")

    # Answer length statistics
    a_lengths = [len(qa['output'].split()) for qa in qa_pairs]
    print(f"Average answer length: {sum(a_lengths)/len(a_lengths):.1f} words")

    # Page coverage
    #pages_covered = set(qa['source_page'] for qa in qa_pairs)
    #print(f"Pages covered: {len(pages_covered)}")

    # Show sample Q&A pairs
    print("\n=== Sample Q&A Pairs ===")
    for i, qa in enumerate(qa_pairs[:3]):
        print(f"\nQ{i+1}: {qa['input']}")
        print(f"A{i+1}: {qa['output']}")
        #print(f"Source: Page {qa['source_page']}")

# Analyze the generated dataset
analyze_dataset(final_dataset)

=== Dataset Analysis ===
Total Q&A pairs: 1625
Average question length: 10.8 words
Average answer length: 20.0 words

=== Sample Q&A Pairs ===

Q1: What is the purpose of an insurance handbook?
A1: The purpose of an insurance handbook is to provide information on what insurance is, what it does, and how it works.

Q2: How can insurance help individuals and businesses?
A2: Insurance can help individuals and businesses by providing financial protection against potential risks and losses.

Q3: What does the Insurance Information Institute do?
A3: The Insurance Information Institute is an organization that provides information and resources about insurance to the public.
