# Convert PDF, DOCX, and PPTX to HTML

In [8]:
import pdfplumber
from html import escape
from docx import Document
from pptx import Presentation

def pdf_to_html(pdf_path):
    """Convert the text and tables of a PDF file into a simple HTML document.

    Each page contributes one ``<p>`` element holding the page's extracted
    text, followed by one ``<table>`` per table returned by
    ``page.extract_tables()``. All extracted text (page text and table cells)
    is HTML-escaped so characters such as ``<`` and ``&`` in the PDF cannot
    corrupt the generated markup.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The generated HTML document as a single string.
    """
    # Collect fragments and join once instead of repeated string concatenation.
    parts = ['<html><body>']

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against a missing/empty text result so the literal string
            # "None" is never rendered into the page.
            page_text = page.extract_text() or ''
            parts.append(f'<p>{escape(page_text)}</p>')

            for table in page.extract_tables():
                parts.append('<table>')
                for row in table:
                    parts.append('<tr>')
                    for cell in row:
                        # Cells may be None; render those as empty cells.
                        cell_text = escape(cell) if cell is not None else ''
                        parts.append(f'<td>{cell_text}</td>')
                    parts.append('</tr>')
                parts.append('</table>')

    parts.append('</body></html>')
    return ''.join(parts)

def docx_to_html(docx_path):
    """Render the paragraphs and tables of a .docx file as a minimal HTML string.

    All paragraphs are emitted first (one ``<p>`` each), followed by all
    tables, so the relative position of tables within the text is not kept.
    Paragraph and cell text is HTML-escaped.

    Args:
        docx_path: Path of the Word document, passed to ``Document``.

    Returns:
        The generated HTML document as a single string.
    """
    document = Document(docx_path)

    pieces = ['<html><body>']
    pieces.extend(
        f'<p>{escape(paragraph.text)}</p>' for paragraph in document.paragraphs
    )

    for tbl in document.tables:
        pieces.append('<table>')
        for tbl_row in tbl.rows:
            cells_html = ''.join(
                f'<td>{escape(tbl_cell.text)}</td>' for tbl_cell in tbl_row.cells
            )
            pieces.append('<tr>' + cells_html + '</tr>')
        pieces.append('</table>')

    pieces.append('</body></html>')
    return ''.join(pieces)

def pptx_to_html(pptx_path):
    """Render the text of every shape in a .pptx file as a minimal HTML string.

    Slides are visited in order; each shape exposing a ``text`` attribute
    becomes one ``<p>`` element containing the HTML-escaped text. Shapes
    without a ``text`` attribute are ignored.

    Args:
        pptx_path: Path of the presentation, passed to ``Presentation``.

    Returns:
        The generated HTML document as a single string.
    """
    presentation = Presentation(pptx_path)
    paragraphs = [
        f'<p>{escape(shp.text)}</p>'
        for sld in presentation.slides
        for shp in sld.shapes
        if hasattr(shp, 'text')
    ]
    return '<html><body>' + ''.join(paragraphs) + '</body></html>'

def file_to_html(file_path, html_path):
    """Convert a PDF, Word or PowerPoint file to HTML and save the result.

    The converter is selected from the file extension. The comparison is
    case-insensitive, so names such as ``REPORT.PDF`` are accepted, and the
    path is converted with ``str()`` first so path-like objects work too.

    Args:
        file_path: Path of the input ``.pdf``, ``.docx`` or ``.pptx`` file.
        html_path: Path the generated HTML is written to (UTF-8 encoded).

    Returns:
        The generated HTML string.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    lowered = str(file_path).lower()
    if lowered.endswith('.pdf'):
        html = pdf_to_html(file_path)
    elif lowered.endswith('.docx'):
        html = docx_to_html(file_path)
    elif lowered.endswith('.pptx'):
        html = pptx_to_html(file_path)
    else:
        raise ValueError("Unsupported file type. Please provide a .pdf, .docx, or .pptx file.")

    # Write HTML to file
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html)
    return html

# Example usage: convert one sample document and write the HTML to
# data/output.html. The commented-out calls are alternative sample inputs;
# every call writes to the same output path.
#file_to_html('data/N_PR_8715_0026.pdf', 'data/output.html')
#file_to_html('data/system document.docx', 'data/output.html')
file_to_html('data/Chocolate Cake Recipe.pptx', 'data/output.html')


'<html><body><p></p><p>Chocolate Cake Recipe</p><p>Delicious doesn’t have to be at odds with difficult!</p><p>Here’s what you’ll find in this Slidesgo template: \nA slide structure based on a multi-purpose presentation, which you can easily adapt to your needs. For more info on how to edit the template, please visit Slidesgo School or read our FAQs.\nAn assortment of pictures that are suitable for use in the presentation can be found in the alternative resources slide.\nA thanks slide, which you must keep so that proper credits for our design are given.\nA resources slide, where you’ll find links to all the elements used in the template.\nInstructions for use.\nFinal slides with: \nThe fonts and colors used in the template.\nMore infographic resources, whose size and color can be edited. \nSets of customizable icons of the following themes: general, business, avatar, creative process, education, help &amp; support, medical, nature, performing arts, SEO &amp; marketing, and teamwork.\nY

# Split text into chunks

In [3]:
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with a chunk size of 6000 (measured with len) and no
# overlap between consecutive chunks.
rec_text_splitter = RecursiveCharacterTextSplitter(
   chunk_size=6000,
   chunk_overlap=0,
   length_function=len,
)

# NOTE(review): text_content is not defined in this cell; presumably it comes
# from an earlier cell - confirm before running in isolation.
chunks = rec_text_splitter.split_text(text_content)

print(f"Chunks: {len(chunks)}")

# Dump every chunk together with its index and length for a quick visual check.
for i, chunk_text in enumerate(chunks):
   print(f"\nchunk # {i}, size: {len(chunk_text)}")
   print(chunk_text)

Chunks: 17

chunk # 0, size: 5964
<html><body><p>NPR 8715.26 -- TOC Page 1 of 35
| NODIS Library | Program Management(8000s) | Search |
NASA NPR 8715.26
Effective Date: February 03,
Procedural
2022
Expiration Date: February 03,
Requirements
2027
COMPLIANCE IS MANDATORY FOR NASA EMPLOYEES
Nuclear Flight Safety
Responsible Office: Office of Safety and Mission Assurance
Table of Contents
Preface
P.1 Purpose
P.2 Applicability
P.3 Authority
P.4 Applicable Documents and Forms
P.5 Measurement/Verification
P.6 Cancellation
Chapter 1. Overview
1.1 Introduction
1.2 Delegation of Responsibilities
1.3 General Terms
1.4 Request for Relief
Chapter 2. Roles and Responsibilities
2.1 Administrator
2.2 Mission Directorate Associate Administrators
2.3 NASA Program and Project Managers
2.4 Office of International and Interagency Relations
2.5 Office of Protective Services
2.6 Center Directors
2.7 Center Radiation Safety Officer
2.8 Chief, Safety and Mission Assurance
2.9 Nuclear Flight Safety Officer
2.10

# Transform Document to Triples and T-box

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv(),
# then build the OpenAI client from the OPENAI_API_KEY environment variable.
_ = load_dotenv(find_dotenv())
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Chat model name and sampling temperature used by the requests in this cell.
model, temperature = "gpt-4o", 0

# Read the full text of data/output.txt into memory.
with open("data/output.txt", mode="r", encoding="utf8") as file:
   context = file.read()

step1_list = []


# Make API request
def get_summary(client, model, messages, temperature):
   """Send *messages* to the chat completions API and return the reply text.

   Returns the ``content`` of the first choice in the completion.
   """
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


# The instructions are the same for every chunk, so they are built once
# outside the loop. The "/-/-/-/" marker requested in Step 2 is what the
# later splitting step uses to separate the triples from the T-box.
prompt = """# Knowledge Graph Instructions for GPT-4
   Step 1: (Label this as "Step 1: Triples" in output, do not add any other characters in this line)
   Split each sentence from the text into a set of entailed clauses that are maximally shortened. Format the clauses into RDF triples that have only two commas and show them only. No explanation needed. 

   For instance, the below sentence:
   This directive defines the roles and responsibilities for managing and overseeing NASA's nuclear flight safety activities. Lions, zebras, and whales are animals.

   Should be split like so:
   This directive, defines, the roles and responsibilities
   The roles and responsibilities, are for, managing and overseeing NASA's nuclear flight safety activities
   Lions, are, animals
   zebras, are, animals
   whales, are, animals

   Step 2: (Label this as "/-/-/-/Step 2: T-Box" in output, do not add any other characters in this line)
   Treat the triples as an A-box ontology and generate a corresponding OWL2-DL T-box ontology in turtle format. Add an ontology individual with the IRI "http://terminology" and use it a namespace 
   for all terms. Derive general names for classes of subjects and objects (avoid using individual names from the triples). However, use predicate names as property names without change. Make 
   sure all classes are used and are related as either domains of ranges of object properties. Do not add ``` around the T-box.

   Note: No need for further explanation under any of the steps.
   """

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("output/tboxes", exist_ok=True)

# Only the first two chunks are processed (chunks[0:2]); widen the slice to
# process more of the document.
for i, chunk in enumerate(chunks[0:2]):
   messages = [
      {"role": "system", "content": chunk},
      {"role": "user", "content": prompt},
   ]

   # Request the completion before opening the file so that a failed API call
   # does not leave an empty, truncated chunk file behind.
   summary = get_summary(client, model, messages, temperature)

   # number all chunks under tbox directory
   with open("output/tboxes/chunk" + str(i) + ".txt", "w") as file:
      file.write(summary + "\n\n\n")

# Split T-boxes and Triples
#
# Every file in output/tboxes/ holds "<triples>/-/-/-/<tbox>". The triples
# part is written to output/triples/ under the SAME file name and the source
# file is rewritten to contain only the T-box part. Reusing the file name
# (rather than an independent counter) keeps each triples file paired with
# its own T-box regardless of the order os.listdir() returns, and prevents a
# not-yet-read file from being overwritten.
input_directory = "output/tboxes/"
triples_directory = "output/triples/"
delimiter = "/-/-/-/"

os.makedirs(triples_directory, exist_ok=True)

# Sorted only to make the processing order deterministic.
for file_name in sorted(os.listdir(input_directory)):
   source_path = input_directory + file_name
   if not os.path.isfile(source_path):
      # Ignore sub-directories (e.g. notebook checkpoint folders).
      continue

   with open(source_path, "r") as file:
      # Split at the first delimiter: triples before it, T-box after it.
      triples, found, tbox = file.read().partition(delimiter)

   if not found:
      # File was already split or the model omitted the marker; leave it
      # untouched instead of failing half-way through.
      print(f"Skipping {source_path}: delimiter {delimiter!r} not found.")
      continue

   # write chunks to their respective folders
   with open(triples_directory + file_name, "w") as file:
      file.write(triples)
   with open(source_path, "w") as file:
      file.write(tbox)

### Generalize T-box

In [None]:
def get_summary(client, model, messages, temperature):
   """Send *messages* to the chat completions API and return the reply text.

   Returns the ``content`` of the first choice in the completion.
   """
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


# Fold all T-box files in output/tboxes/ into one generalized T-box:
# the first file seeds current_tbox, every following file is merged into it
# with one model request. Each input file is deleted once it has been
# consumed, and the final result is written to output/tboxes/tbox.txt.
input_directory = "output/tboxes/"
current_tbox = ""
prompt = "Given these two tboxes, generalize them into one tbox. Do not add additional explanation or text."

# Snapshot the (sorted) file list first, because files are removed while
# looping; sub-directories are skipped.
file_names = sorted(
   name for name in os.listdir(input_directory)
   if os.path.isfile(input_directory + name)
)

# enumerate() supplies the index; the previous hand-maintained counter was
# never incremented, so no merge request was ever issued.
for i, file_name in enumerate(file_names):
   # open a chunk from tbox directory
   with open(input_directory + file_name, "r") as file:
      content = file.read()

   if i == 0:
      current_tbox = content
   else:
      messages = [
         {"role": "system", "content": current_tbox},
         {"role": "system", "content": content},
         {"role": "user", "content": prompt},
      ]
      current_tbox = get_summary(client, model, messages, temperature)

   # Remove the input only after it was merged successfully.
   os.remove(input_directory + file_name)

with open(input_directory + "tbox.txt", "w") as file:
   file.write(current_tbox)

# Convert to A-box

In [7]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# load the .env file
_ = load_dotenv(find_dotenv())
client = OpenAI(
   api_key = os.environ.get("OPENAI_API_KEY")
)

model = "gpt-4-turbo-preview"
temperature = 0

# The instructions are identical for every chunk, so they are built once.
prompt = """# Knowledge Graph Instructions for GPT-4
   Parse the triples from Step 1 into a readable A-Box ontology in turtle format using the terms of the T-box in Step 2. Add a triple for an ontology individual with the IRI "http://assertions" 
   and also use it a namespace for all generated individuals. Group the triples by subject. Use words from the text directly as individual names. Do not encase the file with ```"""


# Make API request
def get_summary(client, model, messages, temperature):
   """Send *messages* to the chat completions API and return the reply text.

   Returns the ``content`` of the first choice in the completion.
   """
   completion = client.chat.completions.create(
      model=model,
      messages=messages,
      temperature=temperature,
   )
   return completion.choices[0].message.content


def writeToOutput(client, model, messages, temperature, mode="w"):
   """Request a completion and store it in output/aboxes/abox.txt.

   Args:
      client, model, messages, temperature: Forwarded to ``get_summary``.
      mode: File mode for the output file. ``"w"`` (default) replaces the
         file; ``"a"`` appends, separated from earlier content by a blank
         line.

   A failure to create or write the file is reported with a printed message
   rather than raised.
   """
   output_file = "output/aboxes/abox.txt"
   # Request first so a failed API call cannot truncate existing output.
   summary = get_summary(client, model, messages, temperature)
   try:
      os.makedirs(os.path.dirname(output_file), exist_ok=True)
      with open(output_file, mode) as file:
         if mode == "a":
            file.write("\n\n")
         file.write(summary)
   except IOError: # unable to open file
      print("Error: Unable to write to the file.")


tboxes_directory = "output/tboxes/"
triples_directory = "output/triples/"

# The first chunk replaces any previous abox.txt; later chunks are appended.
# Opening with "w" for every chunk kept only the last chunk's A-box.
write_mode = "w"

# Pair each T-box with the triples file of the same name instead of
# rebuilding names from a counter, so stray or non-contiguous files cannot
# cause mismatched pairs.
# NOTE(review): if the T-box generalization cell has been run, only tbox.txt
# remains in output/tboxes/ and nothing will be paired here - confirm the
# intended cell order.
for file_name in sorted(os.listdir(tboxes_directory)):
   triples_path = triples_directory + file_name
   if not os.path.isfile(triples_path):
      print(f"Skipping {file_name}: no matching file in {triples_directory}")
      continue

   with open(triples_path, "r") as file:
      triples = file.read()
   with open(tboxes_directory + file_name, "r") as file:
      tbox = file.read()

   messages = [
      {"role": "system", "content": triples},
      {"role": "system", "content": tbox},
      {"role": "user", "content": prompt},
   ]

   writeToOutput(client, model, messages, temperature, write_mode)
   write_mode = "a"

# Load Data into GraphDB for Visualization

In [8]:
import requests

# Upload the Turtle A-box to the local GraphDB repository "myrepo".
headers = {
    'Content-Type': 'text/turtle',
    'Accept': 'application/json',
}

# NOTE(review): the A-box cell writes to output/aboxes/abox.txt while this
# cell reads data/abox.txt - confirm which path is intended.
with open("data/abox.txt", "r") as file:
   data = file.read()

# NOTE(review): PUT on the /statements endpoint presumably replaces the
# repository contents rather than adding to them - confirm this is intended.
response = requests.put(
    'http://localhost:7200/repositories/myrepo/statements',
    headers=headers,
    # Send explicit UTF-8 bytes: a str body is encoded as Latin-1 by the HTTP
    # layer and fails on characters outside that range.
    data=data.encode('utf-8'),
    # (connect, read) timeouts so an unreachable server cannot hang the cell.
    timeout=(10, 300),
)
# Surface HTTP errors (4xx/5xx) instead of silently ignoring them.
response.raise_for_status()