# PDF To HTML

In [None]:
import pdfplumber
from html import escape

def pdf_to_html(pdf_path, html_path):
    """Convert a PDF into a minimal HTML document and save it to disk.

    For every page, the extracted text is emitted as one ``<p>`` element,
    followed by one ``<table>`` per table extracted from that page.

    Args:
        pdf_path: Path of the PDF file to read.
        html_path: Path of the HTML file to write (UTF-8 encoded).

    Returns:
        The generated HTML document as a string.
    """
    # Collect fragments and join once instead of growing a string with "+=".
    parts = ['<html><body>']

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Page text is document content, not markup: escape it so that
            # characters such as '<' or '&' cannot break the generated HTML.
            # A page whose extraction yields None/empty text becomes an empty
            # paragraph rather than the literal string "None".
            page_text = page.extract_text() or ''
            parts.append(f'<p>{escape(page_text)}</p>')

            for table in page.extract_tables():
                parts.append('<table>')
                for row in table:
                    parts.append('<tr>')
                    for cell in row:
                        # Cells may be None; render those as empty cells and
                        # escape html tags in the text of all other cells.
                        cell_text = '' if cell is None else escape(cell)
                        parts.append(f'<td>{cell_text}</td>')
                    parts.append('</tr>')
                parts.append('</table>')

    parts.append('</body></html>')
    html = ''.join(parts)

    # Write HTML to file
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(html)
    return html

# Convert the sample PDF; the HTML is both written to disk and kept in memory.
text_content = pdf_to_html(pdf_path='big sample.pdf', html_path='output.html')

# Transform Document to T-box

In [1]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv(), then build the API client
# from the OPENAI_API_KEY environment variable.
_ = load_dotenv(find_dotenv())
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

#with open("output.html", "r") as file:
#   context = file.read()
   
# Source text that is sent as the system message. This is a plain (non-f)
# string on purpose: the text is pasted verbatim, and an f-string would try to
# interpolate any "{...}" that a pasted document happens to contain.
context = """This directive defines the roles and responsibilities for managing and overseeing NASA's nuclear
flight safety activities. It provides the requirements to implement NASA's policy to protect the
public, NASA workforce, high-value equipment and property, and the environment from potential
harm as a result of NASA activities and operations, by factoring safety as an integral feature of
programs, projects, technologies, operations, and facilities.
b. This directive also describes NASA's implementation of Federal requirements under National
Security Presidential Memorandum (NSPM)-20, “Presidential Memorandum on Launch of
Spacecraft Containing Space Nuclear Systems,” dated August 20, 2019, radiological contingency
planning (RCP) as a part of broader NASA emergency management activities (see NPD 8710.1 and
NPR 8715.2) and other factors, as well as agency-specific activities relating to ensuring safety and
mission success for NASA-sponsored payloads containing space nuclear systems (SNS) or other
radioactive material (note that these terms are defined in Appendix A).
c. This directive establishes a framework where other requirements, guidance, and processes (e.g.,
Department of Energy (DOE) nuclear safety and security requirements, U.S. Air and Space Force
range safety requirements, NASA payload safety processes) relevant to nuclear flight safety can be
implemented in to the overall Safety and Mission Assurance (SMA) process."""

# Instructions sent as the user message. This is a plain (non-f) string: it has
# no placeholders, and an f-string would misread any "{...}" added to it later
# (for example a Turtle or JSON sample).
prompt = """# Knowledge Graph Instructions for GPT-4
Step 1: (Label this as "Step 1: Triples" in output)
Split each sentence from the text into a set of entailed clauses that are maximally shortened. Format the clauses into RDF triples that have only two commas and show them only. No explanation needed. 

For instance, the below sentence:
This directive defines the roles and responsibilities for managing and overseeing NASA’s nuclear flight safety activities. Lions, zebras, and whales are animals.

Should be split like so:
This directive, defines, the roles and responsibilities
The roles and responsibilities, are for, managing and overseeing NASA’s nuclear flight safety activities
Lions, are, animals
zebras, are, animals
whales, are, animals

Step 2: (Label this as "Step 2: T-Box" in output)
Treat the triples as an A-box ontology and generate a corresponding OWL2-DL T-box ontology in turtle format. Add an ontology individual with the IRI "http://terminology" and use it a namespace 
for all terms. Derive general names for classes of subjects and objects (avoid using individual names from the triples). However, use predicate names as property names without change. Make 
sure all classes are used and are related as either domains of ranges of object properties. Do not add ``` around the T-box.

Note: No need for further explanation under any of the steps.
"""

# Request settings: the source text goes in as the system message and the
# instructions as the user message.
model = "gpt-4-turbo-preview"
temperature = 0
messages = [
   {"role": "system", "content": context},
   {"role": "user", "content": prompt},
]
   
# Make API request        
def get_summary(client, model, messages, temperature):
   """Send one chat-completion request and return the reply text.

   Args:
      client: Object exposing ``chat.completions.create`` (the OpenAI client here).
      model: Model name forwarded to the API.
      messages: Chat messages forwarded to the API.
      temperature: Sampling temperature forwarded to the API.

   Returns:
      The ``message.content`` of the first choice in the response.
   """
   request = {
      "model": model,
      "messages": messages,
      "temperature": temperature,
   }
   response = client.chat.completions.create(**request)
   first_choice = response.choices[0]
   return first_choice.message.content

def writeToOutput(client, model, messages, temperature, output_file="data/tbox.txt"):
   """Request a completion and save the reply text to ``output_file``.

   Args:
      client, model, messages, temperature: Forwarded unchanged to ``get_summary``.
      output_file: Destination path; defaults to the T-box file ``data/tbox.txt``.

   Errors raised by ``get_summary`` propagate to the caller. Failures to
   create the directory or write the file are reported on stdout.
   """
   # Fetch the reply BEFORE opening the file: opening in "w" mode truncates it,
   # so a failed request would otherwise leave an empty file behind, and a
   # network-level OSError could be misreported as a file-write problem.
   summary = get_summary(client, model, messages, temperature)
   try:
      # Create the target directory on first use instead of failing on it.
      parent_dir = os.path.dirname(output_file)
      if parent_dir:
         os.makedirs(parent_dir, exist_ok=True)
      with open(output_file, "w") as file:
         file.write(summary)
   except IOError as err: # unable to create the directory or write the file
      print("Error: Unable to write to the file.", err)

# Run the T-box request and store the reply on disk.
writeToOutput(client=client, model=model, messages=messages, temperature=temperature)

# Convert to A-box

In [2]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv(), then build the API client
# from the OPENAI_API_KEY environment variable.
_ = load_dotenv(find_dotenv())
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Reuse the saved T-box output as the system message for this step.
with open("data/tbox.txt") as file:
   context = file.read()
   
# Instructions sent as the user message. This is a plain (non-f) string: it has
# no placeholders, and an f-string would misread any "{...}" added to it later.
prompt = """# Knowledge Graph Instructions for GPT-4
Parse the triples from Step 1 into a readable A-Box ontology in turtle format using the terms of the T-box in Step 2. Add a triple for an ontology individual with the IRI "http://assertions" 
and also use it a namespace for all generated individuals. Group the triples by subject. Use words from the text directly as individual names. Do not encase the file with ```"""

# Request settings: the saved T-box output goes in as the system message and
# the A-box instructions as the user message.
model = "gpt-4-turbo-preview"
temperature = 0
messages = [
   {"role": "system", "content": context},
   {"role": "user", "content": prompt},
]
   
# Make API request        
def get_summary(client, model, messages, temperature):
   """Send one chat-completion request and return the reply text.

   Args:
      client: Object exposing ``chat.completions.create`` (the OpenAI client here).
      model: Model name forwarded to the API.
      messages: Chat messages forwarded to the API.
      temperature: Sampling temperature forwarded to the API.

   Returns:
      The ``message.content`` of the first choice in the response.
   """
   request = dict(model=model, messages=messages, temperature=temperature)
   response = client.chat.completions.create(**request)
   reply = response.choices[0].message
   return reply.content

 #open("data/abox.txt", "r")
 #     file.readlines()

def writeToOutput(client, model, messages, temperature, output_file="data/abox.txt"):
   """Request a completion and save the reply text to ``output_file``.

   Args:
      client, model, messages, temperature: Forwarded unchanged to ``get_summary``.
      output_file: Destination path; defaults to the A-box file ``data/abox.txt``.

   Errors raised by ``get_summary`` propagate to the caller. Failures to
   create the directory or write the file are reported on stdout.
   """
   # Fetch the reply BEFORE opening the file: opening in "w" mode truncates it,
   # so a failed request would otherwise leave an empty file behind, and a
   # network-level OSError could be misreported as a file-write problem.
   summary = get_summary(client, model, messages, temperature)
   try:
      # Create the target directory on first use instead of failing on it.
      parent_dir = os.path.dirname(output_file)
      if parent_dir:
         os.makedirs(parent_dir, exist_ok=True)
      with open(output_file, "w") as file:
         file.write(summary)
   except IOError as err: # unable to create the directory or write the file
      print("Error: Unable to write to the file.", err)
  


# Run the A-box request and store the reply on disk.
writeToOutput(client=client, model=model, messages=messages, temperature=temperature)

# Visualize Data in GraphDB

In [4]:
import requests

# Upload the generated A-box (Turtle) to the local GraphDB repository.
headers = {
    'Content-Type': 'text/turtle',
    'Accept': 'application/json',
}

with open("data/abox.txt", "r") as file:
   data = file.read()

# Send the body as UTF-8 bytes. Passing a str leaves the wire encoding to the
# HTTP stack (http.client encodes str bodies as ISO-8859-1 and raises on
# characters outside it, such as curly quotes), while Turtle is UTF-8.
# NOTE(review): PUT on /statements replaces the statements already in the
# repository (RDF4J REST API); POST would append instead -- confirm intent.
response = requests.put(
    'http://localhost:7200/repositories/myrepo/statements',
    headers=headers,
    data=data.encode('utf-8'),
)

# Surface rejected uploads (e.g. Turtle syntax errors) instead of ignoring them.
response.raise_for_status()

@prefix : <http://terminology#> .
@prefix assertions: <http://assertions#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

assertions:ontologyIndividual rdf:type owl:NamedIndividual .

assertions:ThisDirective rdf:type :Directive ;
    :defines assertions:RolesAndResponsibilities ;
    :provides assertions:TheRequirements ;
    :describes assertions:NASAsImplementation ;
    :includes assertions:RadiologicalContingencyPlanning ;
    :relates_to assertions:AgencySpecificActivities ;
    :establishes assertions:TheFramework .

assertions:RolesAndResponsibilities rdf:type :RoleAndResponsibility ;
    :are_for assertions:NASAsNuclearFlightSafetyActivities .

assertions:NASAsNuclearFlightSafetyActivities rdf:type :Activity ;
    :factor assertions:Safety .

assertions:TheRequirements rdf:type :Requirement ;
    :is_to_protect assertions:ThePublic, assertions:NASAWorkforce, asse