In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then print each one.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datapdf/how to talk anyone.pdf
/kaggle/input/datapdf/Never-Split-the-Difference.pdf
/kaggle/input/datapdf/The_Art_of_Public_Speaking-Dale_Carnegie.pdf
/kaggle/input/pdffff/WHAT TO SAY WHEN YOU TALK TO YOURSELF.pdf
/kaggle/input/pdf-data/Knowledge-Based Social Entrepreneurship.pdf
/kaggle/input/pdf-data/Silicon Valley North A High-Tech Cluster.pdf
/kaggle/input/pdf-data/The Lean Startup How Todays Entrepreneurs.pdf


In [1]:
# Installing the PyMuPDF library to extract content from PDF files.

In [2]:
pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.8-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.8 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.8-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.8-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.24.8-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.8 PyMuPDFb-1.24.8
Note: you may need to restart the kernel to use updated packages.


In [3]:
import fitz  # PyMuPDF

# Function to extract content from a PDF
def extract_content_from_pdf(pdf_path):
    """Return the text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        list[str]: One entry per page, holding ``page.get_text()`` for that page.
    """
    # Open the document as a context manager so it is closed when extraction
    # finishes, and also when a page fails to load part-way through.
    with fitz.open(pdf_path) as document:
        return [
            document.load_page(page_num).get_text()
            for page_num in range(len(document))
        ]

# PDFs to process
pdf_paths = [
    "/kaggle/input/datapdf/Never-Split-the-Difference.pdf",
    "/kaggle/input/datapdf/how to talk anyone.pdf",
    "/kaggle/input/pdffff/WHAT TO SAY WHEN YOU TALK TO YOURSELF.pdf"
]

# Map every PDF path to its list of per-page text
pdf_contents = {path: extract_content_from_pdf(path) for path in pdf_paths}

# Sanity check: report how many pages were read from each file
for path, pages in pdf_contents.items():
    print(f"{path}: {len(pages)} pages extracted.")


/kaggle/input/datapdf/Never-Split-the-Difference.pdf: 375 pages extracted.
/kaggle/input/datapdf/how to talk anyone.pdf: 364 pages extracted.
/kaggle/input/pdffff/WHAT TO SAY WHEN YOU TALK TO YOURSELF.pdf: 258 pages extracted.


In [2]:
# Defining the TreeNode class to represent nodes in the hierarchical tree

In [4]:
import json

class TreeNode:
    """A node of the hierarchical document tree.

    Attributes:
        id: Identifier of the node.
        name: Display name of the node.
        parent: The parent ``TreeNode``, or ``None`` when there is none.
        children: Child nodes, in the order they were added.
        content: Pieces of text attached to this node, in the order they were added.
    """

    def __init__(self, id, name, parent=None):
        # Passing ``parent`` only records the link on this node; it does not
        # register the node in the parent's ``children`` (``add_child`` does that).
        self.id, self.name, self.parent = id, name, parent
        self.children = []
        self.content = []

    def add_child(self, node):
        """Append ``node`` to the children and make this node its parent."""
        node.parent = self
        self.children.append(node)

    def add_content(self, text):
        """Append one piece of text to this node's content."""
        self.content.append(text)

    def to_dict(self):
        """Recursively convert this node and its descendants into plain dicts.

        The parent is represented by its id (``None`` when there is no parent).
        """
        as_dict = {"id": self.id, "name": self.name}
        as_dict["parent"] = self.parent.id if self.parent else None
        as_dict["children"] = [descendant.to_dict() for descendant in self.children]
        as_dict["content"] = self.content
        return as_dict

    def __repr__(self):
        return "TreeNode(id={}, name={})".format(self.id, self.name)

# Function to analyze structure and create hierarchical tree
def create_tree_from_content(content):
    """Build a Textbook -> Chapter -> Section tree from per-page text.

    Every line of every page is classified by its prefix (after stripping
    surrounding whitespace):

    * ``Chapter`` / ``CHAPTER``: starts a new chapter directly under the root.
    * ``Section`` / ``SECTION``: starts a new section under the most recent
      chapter (or under the root when no chapter has been seen yet).
    * any other non-blank line: appended, unmodified, to the content of the
      most recently created node.

    Node ids are ``chapter_<n>`` / ``section_<n>`` where ``n`` is a single
    counter shared by both kinds and starting at 1 for every call.

    Args:
        content: Iterable of page strings; each is split on ``'\\n'``.

    Returns:
        TreeNode: The root node (id ``"root"``, name ``"Textbook"``).
    """
    root = TreeNode(id="root", name="Textbook")
    current_chapter = root  # node that new sections are attached to
    current_node = root     # node that plain text lines are attached to
    node_id = 1

    for page in content:
        for line in page.split('\n'):
            # Classify on the stripped text so indented headings are recognised too.
            heading = line.strip()
            if not heading:
                continue
            if heading.startswith(("Chapter", "CHAPTER")):
                chapter_node = TreeNode(id=f"chapter_{node_id}", name=heading)
                root.add_child(chapter_node)
                current_chapter = current_node = chapter_node
                node_id += 1
            elif heading.startswith(("Section", "SECTION")):
                # Attach to the chapter, not to the previous section; otherwise
                # consecutive sections end up nested inside each other.
                section_node = TreeNode(id=f"section_{node_id}", name=heading)
                current_chapter.add_child(section_node)
                current_node = section_node
                node_id += 1
            else:
                current_node.add_content(line)

    return root

# Build one hierarchical tree per PDF, keyed by the PDF path
pdf_trees = {
    path: create_tree_from_content(pages)
    for path, pages in pdf_contents.items()
}

# Serialise every tree (via its dict form) to a JSON string for storage
pdf_trees_json = {}
for path, root_node in pdf_trees.items():
    pdf_trees_json[path] = json.dumps(root_node.to_dict())


In [3]:
# Storing hierarchical tree structures in an SQLite database


In [5]:
import sqlite3

# Create (or open) the SQLite database that stores the hierarchical tree structures
conn = sqlite3.connect('/kaggle/working/textbook_hierarchical_index.db')
try:
    # `with conn` runs the statements in a transaction: it commits when the
    # block succeeds and rolls back if any statement raises.
    with conn:
        cursor = conn.cursor()

        # One row per PDF; the tree is stored as a JSON string
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS trees (
                id INTEGER PRIMARY KEY,
                pdf_path TEXT UNIQUE,
                tree_structure TEXT
            )
        ''')

        # pdf_path is UNIQUE, so re-running replaces the existing row for a PDF
        cursor.executemany('''
            INSERT OR REPLACE INTO trees (pdf_path, tree_structure) VALUES (?, ?)
        ''', pdf_trees_json.items())
finally:
    # Always release the database file, even when a statement failed
    conn.close()

print("Hierarchical tree structures have been stored in the database.")


Hierarchical tree structures have been stored in the database.


In [4]:
# Install the Whoosh library for indexing and searching text

In [6]:
!pip install whoosh


Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


In [7]:
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
import os

# Define the schema for the index; node_id is stored so that search hits can
# be mapped back to tree nodes
schema = Schema(node_id=ID(stored=True), content=TEXT)

# Make sure the index directory exists (no-op when it is already there, and
# free of the check-then-create race of `if not exists: mkdir`)
os.makedirs("indexdir", exist_ok=True)

# Create the index and open a writer on it
ix = index.create_in("indexdir", schema)
writer = ix.writer()

# Assuming pdf_trees is a dictionary where keys are PDF paths and values are root nodes of the hierarchical tree
def index_tree(writer, node):
    """Add ``node`` and all of its descendants to the index, parents before children.

    Args:
        writer: Object exposing ``add_document(node_id=..., content=...)``.
        node: Tree node exposing ``id``, ``content`` (strings, joined with a
            single space to form the indexed text) and ``children``.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        writer.add_document(node_id=current.id, content=" ".join(current.content))
        # Push the children reversed so they are popped in their original order.
        pending.extend(reversed(current.children))

# Index the hierarchical trees. Using the writer as a context manager commits
# when the block succeeds and cancels the writer if indexing raises, so a
# failure does not leave a half-open writer behind.
with writer:
    for pdf_path, tree in pdf_trees.items():
        index_tree(writer, tree)


In [5]:
# Install the Faiss library with GPU support

In [8]:
pip install faiss-gpu


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m00:01[0mm00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Note: you may need to restart the kernel to use updated packages.


In [23]:
!pip install nltk transformers




In [25]:
# Quick inspection: each PDF path maps to the root TreeNode of its tree
pdf_trees

{'/kaggle/input/datapdf/Never-Split-the-Difference.pdf': TreeNode(id=root, name=Textbook),
 '/kaggle/input/datapdf/how to talk anyone.pdf': TreeNode(id=root, name=Textbook),
 '/kaggle/input/pdffff/WHAT TO SAY WHEN YOU TALK TO YOURSELF.pdf': TreeNode(id=root, name=Textbook)}

In [29]:
!pip install rank-bm25

import sqlite3
import json
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
import os
import faiss
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from transformers import (
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    DPRContextEncoder, DPRContextEncoderTokenizer,
    AutoModelForSequenceClassification, AutoTokenizer,
    T5ForConditionalGeneration, T5Tokenizer
)
from rank_bm25 import BM25Okapi

# Download required NLTK data. The value displayed by the cell (`True` in the
# recorded run) is the return value of the last download call.
nltk.download('wordnet')
nltk.download('omw-1.4')

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Load hierarchical tree structures from an SQLite database

In [30]:
def load_trees_from_db(db_path='/kaggle/working/textbook_hierarchical_index.db'):
    """Load every stored tree from the SQLite database.

    Args:
        db_path: Path of the SQLite file holding the ``trees`` table.
            Defaults to the location the notebook writes to.

    Returns:
        dict: Maps each ``pdf_path`` to its ``tree_structure`` column decoded
        with ``json.loads``.

    Raises:
        sqlite3.OperationalError: For example when the ``trees`` table does
            not exist in the database.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute('SELECT pdf_path, tree_structure FROM trees').fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()
    return {pdf_path: json.loads(tree_json) for pdf_path, tree_json in rows}

# NOTE: this rebinds `pdf_trees`. From here on its values are the plain dicts
# decoded from the stored JSON, no longer the TreeNode objects built earlier.
pdf_trees = load_trees_from_db()

In [7]:
# Index hierarchical tree structures from database using Whoosh

In [31]:
# Index schema; node_id is stored so that search hits can be mapped back to nodes
schema = Schema(node_id=ID(stored=True), content=TEXT)
# Make sure the index directory exists (no-op when it is already there, and
# free of the check-then-create race of `if not exists: mkdir`)
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

def index_tree(writer, node):
    """Add a dict-based tree node and all of its descendants to the index.

    Documents are added parents first, children in their stored order.

    Args:
        writer: Object exposing ``add_document(node_id=..., content=...)``.
        node: Dict with the keys ``'id'``, ``'content'`` (strings, joined with
            a single space to form the indexed text) and ``'children'``
            (list of dicts of the same form).
    """
    pending = [node]
    while pending:
        current = pending.pop()
        writer.add_document(node_id=current['id'], content=" ".join(current['content']))
        # Push the children reversed so they are popped in their original order.
        pending.extend(reversed(current['children']))

# Index every tree. The writer is used as a context manager so it is committed
# when the block succeeds and cancelled if indexing raises.
with ix.writer() as writer:
    for pdf_path, tree in pdf_trees.items():
        index_tree(writer, tree)

In [8]:
# Encode hierarchical tree nodes using DPR and create a FAISS index for context embeddings

In [32]:
# Checkpoints of the two DPR encoders (question side and context side)
DPR_QUESTION_CHECKPOINT = 'facebook/dpr-question_encoder-single-nq-base'
DPR_CONTEXT_CHECKPOINT = 'facebook/dpr-ctx_encoder-single-nq-base'

# Question side: tokenizer + encoder
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(DPR_QUESTION_CHECKPOINT)
question_encoder = DPRQuestionEncoder.from_pretrained(DPR_QUESTION_CHECKPOINT)

# Context side: tokenizer + encoder
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(DPR_CONTEXT_CHECKPOINT)
context_encoder = DPRContextEncoder.from_pretrained(DPR_CONTEXT_CHECKPOINT)

# Filled in lock-step by encode_context: entry i of node_ids names the node
# whose embedding is entry i of context_embeddings
context_embeddings, node_ids = [], []

def encode_context(node):
    """Embed a dict-based tree node and all of its descendants with the context encoder.

    For each node (parents first, children in their stored order) the node's
    content strings are joined with a single space, tokenized (truncated to
    512 tokens) and encoded. The resulting array is appended to the
    module-level ``context_embeddings`` list and the node's id to ``node_ids``.

    Args:
        node: Dict with the keys ``'id'``, ``'content'`` and ``'children'``.
    """
    stack = [node]
    while stack:
        current = stack.pop()
        text = " ".join(current['content'])
        encoded = context_tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        node_embedding = context_encoder(**encoded).pooler_output.detach().numpy()
        context_embeddings.append(node_embedding)
        node_ids.append(current['id'])
        # Reverse so that popping visits the children in their original order.
        stack.extend(reversed(current['children']))

# Encode every node of every tree (fills context_embeddings / node_ids in the same order)
for pdf_path, tree in pdf_trees.items():
    encode_context(tree)

# Stack the per-node embedding arrays into one 2-D matrix whose rows follow node_ids
context_embeddings = np.concatenate(context_embeddings, axis=0)

# DPR encoders are trained to score question/passage pairs by dot product, so
# search by inner product; an L2 index ranks the same vectors differently.
faiss_index = faiss.IndexFlatIP(context_embeddings.shape[1])
faiss_index.add(context_embeddings)

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the

In [9]:
# Expand and stem user queries using synonyms and stemming

In [33]:
# Shared Porter stemmer instance; stem_query calls its .stem() on each query word
stemmer = PorterStemmer()

def expand_query_with_synonyms(query):
    """Expand a query with the WordNet lemma names of each of its words.

    The result starts with the distinct query words in their original order,
    followed by the new synonym words in the order WordNet returns them, so
    the same query always expands to the same string.

    Multi-word lemma names, which use ``_`` between words, are split into
    their individual words so that downstream whitespace tokenisation and
    stemming see real words.

    Args:
        query: Whitespace-separated query string.

    Returns:
        str: The distinct terms joined by single spaces.
    """
    # A dict is used as an insertion-ordered set.
    expanded_terms = dict.fromkeys(query.split())
    for word in list(expanded_terms):
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                for token in lemma.name().split('_'):
                    if token:
                        expanded_terms.setdefault(token)
    return " ".join(expanded_terms)

def stem_query(query):
    """Stem every whitespace-separated word of ``query`` with the shared stemmer.

    Returns:
        str: The stemmed words joined by single spaces.
    """
    stemmed_words = map(stemmer.stem, query.split())
    return " ".join(stemmed_words)

def expand_and_stem_query(query):
    """Expand ``query`` with synonyms, then stem every term of the expanded query."""
    return stem_query(expand_query_with_synonyms(query))

In [10]:
# Setup BM25 for hierarchical tree content retrieval

In [47]:
def setup_bm25(trees):
    """Build a BM25 model with one document per tree.

    A tree's document is the text of its root followed by the text of all
    descendants (parents first, children in stored order), split on
    whitespace.

    Args:
        trees: Mapping whose values are dict-based tree roots with the keys
            ``'content'`` and ``'children'``.

    Returns:
        BM25Okapi: Model built from the tokenized per-tree documents, in the
        iteration order of ``trees.values()``.
    """
    def node_texts(node):
        # The node's own joined content first, then each subtree in child order.
        yield " ".join(node['content'])
        for child in node['children']:
            yield from node_texts(child)

    tokenized_corpus = [
        " ".join(node_texts(root)).split()
        for root in trees.values()
    ]
    return BM25Okapi(tokenized_corpus)

# Build the BM25 model over the current pdf_trees (one BM25 document per tree)
bm25 = setup_bm25(pdf_trees)

In [11]:
# Hybrid retrieval using DPR, BM25, and Whoosh


In [48]:
def hybrid_retrieval(query, k=10):
    """Retrieve up to ``k`` node ids for ``query`` by merging DPR, BM25 and Whoosh results.

    The query is first expanded and stemmed. Each retriever then produces a
    ranked list of node ids:

    * DPR: FAISS row numbers, translated through ``node_ids``.
    * BM25: the BM25 corpus holds one document per tree, so a hit identifies a
      whole tree and is translated to the id of that tree's root.
    * Whoosh: the stored ``node_id`` field of each hit.

    The three lists are merged round-robin (best hit of each retriever first),
    duplicates are dropped, and the first ``k`` ids are returned.

    Known limitation: node ids are not unique across PDFs (every tree has a
    ``"root"`` and its chapter/section counter restarts at 1), so equal ids
    coming from different trees collapse into one entry here.

    Args:
        query: The user's question.
        k: Maximum number of hits taken from each retriever and maximum
            length of the returned list.

    Returns:
        list: Node ids, best-ranked first, without duplicates.
    """
    expanded_query = expand_and_stem_query(query)

    # --- DPR retrieval -------------------------------------------------
    # Truncate like the context side does: synonym expansion can make the
    # query longer than the encoder accepts.
    question_input = question_tokenizer(
        expanded_query, return_tensors='pt', truncation=True, max_length=512
    )
    question_embedding = question_encoder(**question_input).pooler_output.detach().numpy()
    _, dpr_indices = faiss_index.search(question_embedding, k)
    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # without the filter node_ids[-1] would silently return the last node.
    dpr_ids = [node_ids[i] for i in dpr_indices[0].tolist() if 0 <= i < len(node_ids)]

    # --- BM25 retrieval ------------------------------------------------
    # BM25 positions index the per-tree corpus built by setup_bm25, NOT
    # node_ids, so map them to the corresponding tree root.
    bm25_scores = bm25.get_scores(expanded_query.split())
    tree_roots = list(pdf_trees.values())
    bm25_ids = [
        tree_roots[i]['id']
        for i in np.argsort(bm25_scores)[::-1][:k].tolist()
        if i < len(tree_roots)
    ]

    # --- Whoosh retrieval ----------------------------------------------
    # The stored node_id is already the id string (e.g. "chapter_3"); it must
    # not be converted with int().
    # NOTE(review): the parser is used with its default settings; if those
    # require every term to match, the long expanded query will rarely hit.
    # Consider an OR grouping.
    with ix.searcher() as searcher:
        whoosh_query = QueryParser("content", ix.schema).parse(expanded_query)
        whoosh_ids = [hit['node_id'] for hit in searcher.search(whoosh_query, limit=k)]

    # --- Combine: round-robin over the ranked lists, de-duplicated -------
    ranked_lists = [dpr_ids, bm25_ids, whoosh_ids]
    combined, seen = [], set()
    for rank in range(max(map(len, ranked_lists))):
        for ranked in ranked_lists:
            if rank < len(ranked) and ranked[rank] not in seen:
                seen.add(ranked[rank])
                combined.append(ranked[rank])
    return combined[:k]

In [12]:
# Rerank retrieved results using a cross-encoder model

In [49]:
# Cross-encoder checkpoint used to rerank retrieved passages
RERANKER_CHECKPOINT = "cross-encoder/ms-marco-MiniLM-L-12-v2"

reranker_tokenizer = AutoTokenizer.from_pretrained(RERANKER_CHECKPOINT)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_CHECKPOINT)

def rerank_results(query, retrieved_contents):
    """Order retrieved passages by cross-encoder relevance to ``query``.

    Args:
        query: The user's question.
        retrieved_contents: List of passage strings.

    Returns:
        list: The same passages, highest-scoring first. An empty input yields
        an empty list.
    """
    # Nothing to score: avoid pushing an empty batch through tokenizer and model.
    if not retrieved_contents:
        return []

    pairs = [[query, content] for content in retrieved_contents]
    inputs = reranker_tokenizer(pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    # One logit per (query, passage) pair; drop the trailing axis.
    scores = reranker_model(**inputs).logits.squeeze(-1)
    # Convert to plain ints before indexing the Python list.
    reranked_indices = scores.argsort(descending=True).tolist()
    return [retrieved_contents[i] for i in reranked_indices]

In [13]:
# Generate answers using a RAG pipeline with T5 for conditional generation

In [50]:
# T5 checkpoint used to generate the final answer
GENERATOR_CHECKPOINT = "t5-base"

generator_tokenizer = T5Tokenizer.from_pretrained(GENERATOR_CHECKPOINT)
generator_model = T5ForConditionalGeneration.from_pretrained(GENERATOR_CHECKPOINT)

def generate_answer(query, context):
    """Generate an answer to ``query`` from ``context`` with the T5 generator.

    The prompt has the form ``question: <query> context: <context>`` and is
    truncated to 512 tokens. Generation uses sampling (``do_sample=True``),
    so the answer can differ between runs.

    Args:
        query: The user's question.
        context: Supporting text placed after the question in the prompt.

    Returns:
        str: The first generated sequence, decoded without special tokens.
    """
    prompt = f"question: {query} context: {context}"
    encoded_prompt = generator_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    generated = generator_model.generate(
        encoded_prompt.input_ids,
        max_length=100,
        num_return_sequences=1,
        do_sample=True,
    )
    first_sequence = generated[0]
    return generator_tokenizer.decode(first_sequence, skip_special_tokens=True)

def get_node_content(node_id):
    """Return the text of the first node whose id equals ``node_id``.

    The trees in ``pdf_trees`` are searched in order, each one depth-first
    (parents before children).

    Args:
        node_id: Id of the node to look up.

    Returns:
        str: The node's content strings joined with single spaces (possibly
        the empty string for a node without content), or ``""`` when no node
        has that id.
    """
    def find_node(node):
        # Returns the joined content of the match, or None when this subtree
        # has no match.
        if node['id'] == node_id:
            return " ".join(node['content'])
        for child in node['children']:
            result = find_node(child)
            # Compare against None: a matching node with empty content yields
            # "" and must still end the search, otherwise a different node
            # with the same id (e.g. in another PDF's tree) would be returned.
            if result is not None:
                return result
        return None

    for tree in pdf_trees.values():
        content = find_node(tree)
        if content is not None:
            return content
    return ""

def rag_pipeline(query):
    """Answer ``query`` end to end: retrieve, rerank, then generate.

    The text of the retrieved nodes is reranked against the query, the three
    best passages are joined with single spaces to form the context, and the
    generator produces the answer from that context.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``generate_answer`` for the query and context.
    """
    candidate_texts = [get_node_content(node_id) for node_id in hybrid_retrieval(query)]
    best_first = rerank_results(query, candidate_texts)
    # Use top 3 reranked results as context
    top_context = " ".join(best_first[:3])
    return generate_answer(query, top_context)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [51]:
# Run the full pipeline on a sample question and show question and answer
query = "What is BEND THEIR REALITY?"
answer = rag_pipeline(query)
print(f"Query: {query}", f"Answer: {answer}", sep="\n")

Query: What is BEND THEIR REALITY?
Answer: Compromise and concession, even to the truth, feels like defeat
