# Document (PDF, DOCX, PPTX) To HTML

In [81]:
import pdfplumber
from html import escape
from docx import Document
from pptx import Presentation

def pdf_to_html(pdf_path):
    """Convert the text and tables of a PDF into a minimal HTML document.

    Every page contributes one ``<p>`` element holding the page's extracted
    text, followed by one ``<table>`` element per table returned by
    ``page.extract_tables()``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The complete HTML document as a single string.
    """
    parts = ['<html><body>']

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Page text is document content, not markup: escape "&", "<" and
            # ">" so it cannot break the generated HTML. Quotes are left
            # alone because the text is element content, not an attribute.
            # If no text comes back (None or empty), emit an empty paragraph
            # instead of the literal string "None".
            page_text = page.extract_text() or ''
            parts.append(f'<p>{escape(page_text, quote=False)}</p>')

            for table in page.extract_tables():
                parts.append('<table>')
                for row in table:
                    parts.append('<tr>')
                    for cell in row:
                        # A cell may be None; render it as an empty <td>.
                        cell_text = '' if cell is None else escape(cell)
                        parts.append(f'<td>{cell_text}</td>')
                    parts.append('</tr>')
                parts.append('</table>')

    parts.append('</body></html>')
    return ''.join(parts)

def docx_to_html(docx_path):
    """Render a .docx file as a bare-bones HTML string.

    All paragraphs are emitted first (one escaped ``<p>`` each), followed by
    all tables (one ``<table>`` each, with escaped cell text).

    Args:
        docx_path: Path of the Word document, passed to ``Document``.

    Returns:
        The HTML document as a single string.
    """
    document = Document(docx_path)

    paragraph_markup = ''.join(
        f'<p>{escape(paragraph.text)}</p>' for paragraph in document.paragraphs
    )

    table_markup = ''
    for tbl in document.tables:
        row_markup = ''.join(
            '<tr>'
            + ''.join(f'<td>{escape(c.text)}</td>' for c in r.cells)
            + '</tr>'
            for r in tbl.rows
        )
        table_markup += '<table>' + row_markup + '</table>'

    return '<html><body>' + paragraph_markup + table_markup + '</body></html>'

def pptx_to_html(pptx_path):
    """Render the text of a .pptx file as a bare-bones HTML string.

    Walks every shape of every slide and emits one escaped ``<p>`` for each
    shape that exposes a ``text`` attribute; other shapes are ignored.

    Args:
        pptx_path: Path of the presentation, passed to ``Presentation``.

    Returns:
        The HTML document as a single string.
    """
    presentation = Presentation(pptx_path)
    fragments = ['<html><body>']

    for slide in presentation.slides:
        for shape in slide.shapes:
            if not hasattr(shape, 'text'):
                continue
            fragments.append(f'<p>{escape(shape.text)}</p>')

    fragments.append('</body></html>')
    return ''.join(fragments)

def file_to_html(file_path, html_path):
    """Convert a supported document to HTML, save it, and return the markup.

    Args:
        file_path: Path of the input document. Its extension selects the
            converter and is matched case-insensitively, so ``report.PDF``
            is handled the same as ``report.pdf``.
        html_path: Destination file for the generated HTML (written as UTF-8).

    Returns:
        The generated HTML string.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.pptx``.
    """
    # Only the comparison is lower-cased; the converters still receive the
    # path exactly as given.
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        html = pdf_to_html(file_path)
    elif lowered.endswith('.docx'):
        html = docx_to_html(file_path)
    elif lowered.endswith('.pptx'):
        html = pptx_to_html(file_path)
    else:
        raise ValueError("Unsupported file type. Please provide a .pdf, .docx, or .pptx file.")

    # Write HTML to file
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html)
    return html

# Convert data/N_PR_8715_0026.pdf and save the result to data/output.html.
# As the last expression of the cell, the returned HTML string is also shown
# as the cell output.
file_to_html('data/N_PR_8715_0026.pdf', 'data/output.html')

KeyboardInterrupt: 

Exception ignored in: 'zmq.backend.cython._zmq.Frame.__del__'
Traceback (most recent call last):
  File "_zmq.py", line 141, in zmq.backend.cython._zmq._check_rc
KeyboardInterrupt: 


'<html><body><p>NPR 8715.26 -- TOC Page 1 of 35\n| NODIS Library | Program Management(8000s) | Search |\nNASA NPR 8715.26\nEffective Date: February 03,\nProcedural\n2022\nExpiration Date: February 03,\nRequirements\n2027\nCOMPLIANCE IS MANDATORY FOR NASA EMPLOYEES\nNuclear Flight Safety\nResponsible Office: Office of Safety and Mission Assurance\nTable of Contents\nPreface\nP.1 Purpose\nP.2 Applicability\nP.3 Authority\nP.4 Applicable Documents and Forms\nP.5 Measurement/Verification\nP.6 Cancellation\nChapter 1. Overview\n1.1 Introduction\n1.2 Delegation of Responsibilities\n1.3 General Terms\n1.4 Request for Relief\nChapter 2. Roles and Responsibilities\n2.1 Administrator\n2.2 Mission Directorate Associate Administrators\n2.3 NASA Program and Project Managers\n2.4 Office of International and Interagency Relations\n2.5 Office of Protective Services\n2.6 Center Directors\n2.7 Center Radiation Safety Officer\n2.8 Chief, Safety and Mission Assurance\n2.9 Nuclear Flight Safety Officer\n2.

# Split Text Into Tokenized Chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# chunk_size is measured with len(), i.e. in characters of the input text.
rec_text_splitter = RecursiveCharacterTextSplitter(
   chunk_size=128000,
   chunk_overlap=0,
   length_function=len,
)

# Read back the HTML produced by the conversion cell.
with open("data/output.html", "r", encoding="utf8") as html_source:
   text_content = html_source.read()

raw_chunks = rec_text_splitter.split_text(text_content)
print(f"Chunks: {len(raw_chunks)}")

# Show every chunk with its index and character count.
for index, piece in enumerate(raw_chunks):
   print(f"\nchunk # {index}, size: {len(piece)}")
   print(piece)

Chunks: 1

chunk # 0, size: 12833
<html><body><p>NPR 8715.26 -- TOC Page 1 of 35
| NODIS Library | Program Management(8000s) | Search |
NASA NPR 8715.26
Effective Date: February 03,
Procedural
2022
Expiration Date: February 03,
Requirements
2027
COMPLIANCE IS MANDATORY FOR NASA EMPLOYEES
Nuclear Flight Safety
Responsible Office: Office of Safety and Mission Assurance
Table of Contents
Preface
P.1 Purpose
P.2 Applicability
P.3 Authority
P.4 Applicable Documents and Forms
P.5 Measurement/Verification
P.6 Cancellation
Chapter 1. Overview
1.1 Introduction
1.2 Delegation of Responsibilities
1.3 General Terms
1.4 Request for Relief
Chapter 2. Roles and Responsibilities
2.1 Administrator
2.2 Mission Directorate Associate Administrators
2.3 NASA Program and Project Managers
2.4 Office of International and Interagency Relations
2.5 Office of Protective Services
2.6 Center Directors
2.7 Center Radiation Safety Officer
2.8 Chief, Safety and Mission Assurance
2.9 Nuclear Flight Safety Officer
2.10

# Intelligently Split Chunks using OpenAI

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# load the .env file
_ = load_dotenv(find_dotenv())
client = OpenAI(
   api_key = os.environ.get("OPENAI_API_KEY")
)

model = "gpt-4o"
temperature = 0

# Marker the model is asked to place in front of every chunk it produces.
CHUNK_MARKER = "/-/-/-/"

# The instruction is identical for every raw chunk, so it is built once.
prompt = """#
   Given the text provided, split it into smaller, coherent chunks. The goal is to divide the text in a way that each chunk makes sense on its 
   own and is organized logically. Please ensure the chunks maintain topic continuity and logical separation. Do not add or remove any words or reorganize 
   the text, but rather split them logically where each chunk contains all the same relevant information. Label each chunk with "/-/-/-/" before only.
   """


# Make API request
def get_summary(client, model, messages, temperature):
   """Send one chat-completion request and return the first choice's text."""
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


chunks = []

# The loop variable must not reuse the name `raw_chunks`: doing so rebinds the
# list to its last string, and re-running the cell would then iterate over
# single characters (one API call per character).
for raw_chunk in raw_chunks:
   messages = [
      {"role": "system", "content": raw_chunk},
      {"role": "user", "content": prompt},
   ]

   reply = get_summary(client, model, messages, temperature) or ""

   # Everything before the first marker is preamble (or empty) and is dropped.
   pieces = reply.split(CHUNK_MARKER)[1:]
   if not pieces and reply.strip():
      # The model did not emit any marker; keep the reply as a single chunk
      # rather than silently losing the text.
      pieces = [reply]
   chunks.extend(pieces)

# Print Chunks for Testing
for i, chunk in enumerate(chunks, start=1):
   print("Chunk " + str(i) + ":" + chunk)

Chunk 1:
<html><body><p>NPR 8715.26 -- TOC Page 1 of 35
| NODIS Library | Program Management(8000s) | Search |
NASA NPR 8715.26
Effective Date: February 03,
Procedural
2022
Expiration Date: February 03,
Requirements
2027
COMPLIANCE IS MANDATORY FOR NASA EMPLOYEES
Nuclear Flight Safety
Responsible Office: Office of Safety and Mission Assurance
Table of Contents
Preface
P.1 Purpose
P.2 Applicability
P.3 Authority
P.4 Applicable Documents and Forms
P.5 Measurement/Verification
P.6 Cancellation
Chapter 1. Overview
1.1 Introduction
1.2 Delegation of Responsibilities
1.3 General Terms
1.4 Request for Relief
Chapter 2. Roles and Responsibilities
2.1 Administrator
2.2 Mission Directorate Associate Administrators
2.3 NASA Program and Project Managers
2.4 Office of International and Interagency Relations
2.5 Office of Protective Services
2.6 Center Directors
2.7 Center Radiation Safety Officer
2.8 Chief, Safety and Mission Assurance
2.9 Nuclear Flight Safety Officer
2.10 NASA INSRB Representativ

# Transform Document to Triples and T-box

In [None]:
# Marker that separates the triples (Step 1) from the T-box (Step 2) in a reply.
SECTION_MARKER = "/-/-/-/"

# The instruction is identical for every chunk, so it is built once.
prompt = """# Knowledge Graph Instructions for GPT-4
   Step 1: (Label this as "Step 1: Triples" in output, do not add any other characters in this line)
   Split each sentence from the text into a set of entailed clauses that are maximally shortened. Format the clauses into RDF triples that have only two commas and show them only. No explanation needed. 

   For instance, the below sentence:
   This directive defines the roles and responsibilities for managing and overseeing NASA's nuclear flight safety activities. Lions, zebras, and whales are animals.

   Should be split like so:
   This directive, defines, the roles and responsibilities
   The roles and responsibilities, are for, managing and overseeing NASA's nuclear flight safety activities
   Lions, are, animals
   zebras, are, animals
   whales, are, animals

   Step 2: (Label this as "/-/-/-/Step 2: T-Box" in output, do not add any other characters in this line)
   Treat the triples as an A-box ontology and generate a corresponding OWL2-DL T-box ontology in turtle format. Add an ontology individual with the IRI "http://terminology" and use it a namespace 
   for all terms. Derive general names for classes of subjects and objects (avoid using individual names from the triples). However, use predicate names as property names without change. Make 
   sure all classes are used and are related as either domains of ranges of object properties. Do not add ``` around the T-box.

   Note: No need for further explanation under any of the steps.
   """


# Make API request
def get_summary(client, model, messages, temperature):
   """Send one chat-completion request and return the first choice's text."""
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


# Create the output folders up front so a missing directory cannot fail the
# run after API calls have already been made.
os.makedirs("output/tboxes", exist_ok=True)
os.makedirs("output/triples", exist_ok=True)

for i, chunk in enumerate(chunks):
   messages = [
      {"role": "system", "content": chunk},
      {"role": "user", "content": prompt},
   ]

   # Request first, open afterwards: a failed request then leaves no empty file.
   reply = get_summary(client, model, messages, temperature)

   # number all chunks under tbox directory
   with open("output/tboxes/chunk" + str(i) + ".txt", "w") as file:
      file.write(reply + "\n\n\n")

# Split T-boxes and Triples
input_directory = "output/tboxes/"
for file_name in sorted(os.listdir(input_directory)):
   with open(input_directory + file_name, "r") as file:
      # split file content into list of elements, (0) = triples, (1) = tbox
      content = file.read().split(SECTION_MARKER)

   if len(content) < 2:
      # No marker: either the model ignored the format or the file was already
      # split by an earlier run. Leave it untouched instead of overwriting.
      print("Skipping " + file_name + ": section marker not found")
      continue

   # Write both halves under the SAME name as the file that was read. Using a
   # separate counter over an unordered directory listing could overwrite a
   # combined file that has not been read yet.
   with open("output/triples/" + file_name, "w") as file:
      file.write(content[0])
   with open(input_directory + file_name, "w") as file:
      file.write(content[1])

### Generalize T-box

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# load the .env file
_ = load_dotenv(find_dotenv())
client = OpenAI(
   api_key = os.environ.get("OPENAI_API_KEY")
)

model = "gpt-4o"
temperature = 0

input_directory = "output/tboxes/"
output_name = "tbox.txt"

prompt = "Given these two tboxes, generalize them into one tbox. Do not add additional explanation or text."


def get_summary(client, model, messages, temperature):
   """Send one chat-completion request and return the first choice's text."""
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


# List the directory once, in a fixed order, so the seed file and the files
# merged afterwards come from the same snapshot.
tbox_files = sorted(os.listdir(input_directory))
if not tbox_files:
   raise FileNotFoundError("No T-box files found in " + input_directory)

# Open a chunk from tbox directory to seed the running T-box
with open(input_directory + tbox_files[0], "r") as file:
   current_tbox = file.read()

# Generalize each subsequent tbox file with the accumulated one
for file_name in tbox_files[1:]:
   with open(input_directory + file_name, "r") as file:
      content = file.read()

   messages = [
      {"role": "system", "content": current_tbox},
      {"role": "system", "content": content},
      {"role": "user", "content": prompt},
   ]
   current_tbox = get_summary(client, model, messages, temperature)

with open(input_directory + output_name, "w") as file:
   file.write(current_tbox)

# Delete the per-chunk inputs only after the merged T-box is on disk, so a
# failed API call cannot leave the directory with inputs removed and no result.
# The output file itself is never deleted, even if it was one of the inputs.
for file_name in tbox_files:
   if file_name != output_name:
      os.remove(input_directory + file_name)

# Convert to A-box

In [84]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# load the .env file
_ = load_dotenv(find_dotenv())
client = OpenAI(
   api_key = os.environ.get("OPENAI_API_KEY")
)

model = "gpt-4o"
temperature = 0

input_directory = "output/triples/"
current_abox = "Empty A-box"
with open("output/tboxes/tbox.txt", "r") as file:
   tbox = file.read()

# The instruction is identical for every triples file, so it is built once.
prompt = """# Knowledge Graph Instructions for GPT-4
   Parse the triples from Step 1 into a readable A-Box ontology in turtle format using the terms of the given T-box. Combine the newly generated A-Box with the passed A-Box. 
   Add a triple for an ontology individual with the IRI "http://assertions" and also use it a namespace for all generated individuals.
   Group the triples by subject. Use words from the text directly as individual names. Do not encase the file with ```"""


# Make API request
def get_summary(client, model, messages, temperature):
   """Send one chat-completion request and return the first choice's text."""
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


# Create the output folder before the API calls so a missing directory cannot
# discard the accumulated result at the very end.
os.makedirs("output/abox", exist_ok=True)

# Fold each triples file into the running A-box. Files are visited in sorted
# order so repeated runs process them in the same sequence.
# NOTE(review): the whole A-box is sent to and returned by the model on every
# iteration - confirm it stays within the model's output limit for large inputs.
for file_name in sorted(os.listdir(input_directory)):
   with open(input_directory + file_name, "r") as file:
      triples = file.read()

   messages = [
      {"role": "system", "content": triples},
      {"role": "system", "content": tbox},
      {"role": "system", "content": current_abox},
      {"role": "user", "content": prompt},
   ]
   current_abox = get_summary(client, model, messages, temperature)

with open("output/abox/abox.txt", "w") as file:
   file.write(current_abox)

# Visualize Data in GraphDB

In [None]:
import requests

headers = {
    'Content-Type': 'text/turtle',
    'Accept': 'application/json',
}

with open("output/abox/abox.txt", "r") as file:
   data = file.read()

# Send the Turtle document to the "test" repository of the local GraphDB.
# NOTE(review): PUT on the statements endpoint is documented to replace the
# repository content; use POST if the data should be appended - confirm intent.
response = requests.put(
    'http://localhost:7200/repositories/test/statements',
    headers=headers,
    # Encode explicitly as UTF-8 rather than relying on the HTTP stack's
    # default encoding for str bodies, which is version-dependent.
    data=data.encode('utf-8'),
    # requests has no default timeout; do not hang if the server is down.
    timeout=60,
)
# Surface HTTP errors (bad Turtle, unknown repository, ...) instead of
# silently ignoring the response.
response.raise_for_status()

# Integrate Competency Questions (CQs)

1. Ontology creation from questions

    1. Extract concepts from CQs
        1. Code/Concepts_relations_generate.py uses Prompts/Concepts_and_relationships_extraction.txt
        2. Output: Concepts_relations/Concepts_and_relationships.txt
        3. Create ontology: Code/Ontology_creation.py, Prompt: Prompts/Ontology_creation.txt


In [None]:
# Extract concepts and relations from questions by providing the questions to the model 
# and a base ontology such as OWL, RDF.


# ask model to include the above concepts and relations in the tbox ontology

# ask model to include the above concepts and relations in the abox ontology



2. Question answering

In [None]:
# Use RAG model to generate answers to CQs based on the document



3. KG creation

In [None]:
# Prompt with CQs, their answers and the ontology. Ask model to generate key entities 
# and relations from the document that are relevant to the CQs and their answers.

