In [1]:
import pandas as pd
import numpy as np

!pip install transformers



In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.39.0-py3-none-any.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.1-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/92.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.1.1 (from gradio)
  Downloading gradio_client-1.1.1-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.2/318.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [4]:
import re
import json

def create_hierarchical_index(file_path):
    """Parse a plain-text book into a Textbook -> Chapter -> Section tree.

    Every node is a dict with the keys ``"name"``, ``"content"`` and
    ``"children"``. The root node is named ``"Textbook"`` and has empty
    content. Each chapter node holds the full stripped chapter text (its
    sections included) and each section node holds the stripped text of that
    section, starting with its own number.

    Args:
        file_path: Path of the UTF-8 encoded text file to parse.

    Returns:
        The root node of the tree.

    Raises:
        OSError: If the file cannot be opened or read.

    Note:
        Detection is purely textual. Any occurrence of ``Chapter <digits>``
        starts a new chapter and any ``<digits>.<digits>`` token that is not
        the continuation of a longer number starts a new section, so in-text
        references or decimal numbers are treated as headings too. Text before
        the first chapter marker is not stored in the tree.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    root = {"name": "Textbook", "content": "", "children": []}

    # Zero-width split: every piece after the first starts at a chapter marker.
    for chapter in re.split(r'(?=Chapter \d+)', content):
        chapter_match = re.match(r'Chapter (\d+)', chapter)
        if not chapter_match:
            # Front matter (or whitespace) preceding the first chapter marker.
            continue

        chapter_content = chapter.strip()
        chapter_node = {
            "name": f"Chapter {chapter_match.group(1)}",
            "content": chapter_content,
            "children": [],
        }

        # Split in front of each section number without consuming it. The
        # lookbehind stops the split from also firing in the middle of a
        # number (e.g. at "0.1" inside "10.1"). No capturing group is used, so
        # each piece carries its section number exactly once.
        for section in re.split(r'(?<![\d.])(?=\d+\.\d+)', chapter_content):
            section_match = re.match(r'\d+\.\d+', section)
            if not section_match:
                # Chapter heading / introduction before the first section.
                continue
            chapter_node["children"].append({
                "name": f"Section {section_match.group(0)}",
                "content": section.strip(),
                "children": [],
            })

        root["children"].append(chapter_node)

    return root

# Book identifier -> plain-text source file to parse.
book_files = {
    "book1": "/content/book1.txt",
    "book2": "/content/book2.txt",
    "book3": "/content/book3.txt"
}

# Parse every book first; nothing is written until all of them have been parsed.
hierarchical_indices = {
    book_name: create_hierarchical_index(file_path)
    for book_name, file_path in book_files.items()
}

# Write each tree to its own JSON file in the working directory.
for book_name, index in hierarchical_indices.items():
    with open(f'hierarchical_index_{book_name}.json', 'w', encoding='utf-8') as f:
        json.dump(index, f, ensure_ascii=False, indent=2)

In [10]:
import gradio as gr
from transformers import pipeline

# Load the pre-trained extractive question-answering model.
nlp = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Read the saved hierarchical index of every book back from its JSON file.
hierarchical_indices = {}
for book_name in book_files:
    index_path = f'hierarchical_index_{book_name}.json'
    with open(index_path, 'r', encoding='utf-8') as f:
        hierarchical_indices[book_name] = json.load(f)

# Using this function we can extract content from the hierarchical tree
def extract_content(node):
    """Flatten a tree node and all of its descendants into one string.

    The node's own ``"content"`` comes first, followed by the flattened text
    of each child in order, separated by single spaces. Children that yield
    no text are skipped. Because a child's text is always appended, a parent
    whose content already contains its children's text will produce that text
    twice.

    Args:
        node: A dict with optional ``"content"`` and ``"children"`` keys, or
            ``None``.

    Returns:
        The concatenated text; ``''`` when ``node`` is ``None``.
    """
    if node is None:
        return ''

    own_text = node.get('content', '')
    child_texts = [
        text
        for text in map(extract_content, node.get('children', []))
        if text
    ]

    if not child_texts:
        return own_text
    return own_text + ' ' + ' '.join(child_texts)

def answer_question(book_choice, question):
    """Answer a question using the full text of the selected book as context.

    Args:
        book_choice: Key of the book in the module-level
            ``hierarchical_indices`` mapping.
        question: The question text entered by the user.

    Returns:
        The answer span reported by the module-level ``nlp`` pipeline, or a
        short explanatory message when the book is unknown, the question is
        blank, or the book has no text.
    """
    if book_choice not in hierarchical_indices:
        return "Invalid book choice"

    # Reject blank input up front: an empty question makes the pipeline raise
    # instead of returning an answer, which would surface as an opaque error.
    if not question or not question.strip():
        return "Please enter a question."

    context = extract_content(hierarchical_indices[book_choice])
    if not context:
        return "The context for the question is empty."

    # The whole flattened book is handed to the model as a single context.
    result = nlp(question=question, context=context)
    return result['answer']

# Gradio interface
def main():
    """Build the Gradio question-answering UI and launch it.

    The page has a dropdown listing the keys of ``book_files``, a question
    textbox and a read-only answer textbox. Submitting the question textbox
    calls ``answer_question`` with the selected book and the question, and
    shows the returned text in the answer box.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Question Answering with Hierarchical Index")

        book_dropdown = gr.Dropdown(label="Choose a Book", choices=list(book_files.keys()))
        question_box = gr.Textbox(label="Enter your question")
        answer_box = gr.Textbox(label="Answer", interactive=False)

        # Wire the textbox's submit event to the answering function.
        question_box.submit(
            fn=answer_question,
            inputs=[book_dropdown, question_box],
            outputs=answer_box,
        )

    demo.launch()

main()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://ba22aba66eade067b9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
