# PDF- and prompt-extracted tables

In [25]:
import csv, rdflib, re

# Build the parts-catalog graph from two extracted tables and write it out as Turtle.
g = rdflib.Graph()
base = rdflib.Namespace('http://Ameneh.org/aircraft.owl#')
# Bind the ontology namespace without naming a prefix.
g.bind(None, base)

# --- Parts catalog (PDF-extracted CSV) -------------------------------------
# The file is opened in a `with` block so the handle is closed as soon as all
# rows have been consumed, instead of staying open for the life of the kernel.
with open('pdf-extracted/parts-catalog.csv') as catalog_file:
    parts_catalog = csv.DictReader(catalog_file)
    for line in parts_catalog:
        # URI local names: title-case the cell, then drop every character that
        # is not an ASCII letter (spaces, digits and punctuation all vanish).
        # NOTE(review): because digits are removed, two 'Figure' values that
        # differ only by a number map to the same URI - confirm this is wanted.
        system, assembly, cls = [
            base[re.sub('[^a-zA-Z]', '', line[key].title())]
            for key in ['Section', 'Figure', 'Type']
        ]
        label = line['Specifics'].strip() + ' ' + line['Type'].lower()
        # One class per part number.
        s = base['partnr-' + line['Part Number']]
        triples = [
            # The part itself.
            (s, rdflib.RDFS['subClassOf'], cls),
            (s, base['partOf'], assembly),
            (s, base['partOf'], system),
            (s, base['partNumber'], rdflib.Literal(line['Part Number'])),
            (s, rdflib.RDFS['label'], rdflib.Literal(label)),

            # The assembly, labelled with the raw 'Figure' cell.
            (assembly, rdflib.RDFS['subClassOf'], base['Assembly']),
            (assembly, rdflib.RDFS['label'], rdflib.Literal(line['Figure'])),

            # The system, labelled with the raw 'Section' cell.
            (system, rdflib.RDFS['subClassOf'], base['System']),
            (system, rdflib.RDFS['label'], rdflib.Literal(line['Section'])),
        ]
        for t in triples:
            g.add(t)

# --- Part class hierarchy (prompt-extracted TSV) ---------------------------
with open('prompt-extracted/part-classes.tsv') as classes_file:
    lines = csv.DictReader(classes_file, delimiter='\t')
    for line in lines:
        # Same slug rule as above, except that anything from the first '('
        # onward is discarded before title-casing.
        part, cls = [
            base[re.sub('[^a-zA-Z]', '', line[key].split('(')[0].title())]
            for key in ['Part', 'subClassOf']
        ]

        triples = [
            (part, rdflib.RDFS['label'], rdflib.Literal(line['Part'])),
            (part, rdflib.RDFS['subClassOf'], cls),
            (cls, rdflib.RDFS['label'], rdflib.Literal(line['subClassOf'])),
            (cls, rdflib.RDFS['subClassOf'], base['Part']),
        ]
        for t in triples:
            g.add(t)

# Last expression of the cell: writes the Turtle file and displays the graph repr.
g.serialize('generated-rdf/parts-catalog.ttl', format='ttl')

<Graph identifier=N7aa5537f44b24bfe9d8a78a5d168d750 (<class 'rdflib.graph.Graph'>)>

In [26]:
import csv, rdflib, re

# Build the troubleshooting graph (trouble -> cause <- remedy) and write it as Turtle.
g = rdflib.Graph()
base = rdflib.Namespace('http://Ameneh.org/aircraft.owl#')
# Bind the ontology namespace without naming a prefix.
g.bind(None, base)

# Open the table in a `with` block so the file handle is released once the
# rows are consumed rather than leaking until the kernel exits.
with open('pdf-extracted/troubleshooting.csv') as troubleshooting_file:
    troubleshooting = csv.DictReader(troubleshooting_file)
    for line in troubleshooting:
        # URI local names: cut at the first '(', title-case, keep ASCII letters only.
        trouble, cause = [
            base[re.sub('[^a-zA-Z]', '', line[key].split('(')[0].title())]
            for key in ['TROUBLE', 'PROBABLE CAUSE']
        ]
        # Remedies get a fresh blank node per row; they are never given a URI.
        remedy = rdflib.BNode()

        triples = [
            # Both the trouble and its probable cause are modelled as Problem subclasses.
            (trouble, rdflib.RDFS['subClassOf'], base['Problem']),
            (trouble, rdflib.RDFS['label'], rdflib.Literal(line['TROUBLE'])),
            (cause, rdflib.RDFS['subClassOf'], base['Problem']),
            (cause, rdflib.RDFS['label'], rdflib.Literal(line['PROBABLE CAUSE'])),
            (trouble, base['hasCause'], cause),

            # The remedy is linked to the cause, not to the trouble.
            (remedy, rdflib.RDFS['label'], rdflib.Literal(line['REMEDY'])),
            (remedy, rdflib.RDFS['subClassOf'], base['Solution']),
            (remedy, base["solves"], cause),
        ]
        for t in triples:
            g.add(t)

# Last expression of the cell: writes the Turtle file and displays the graph repr.
g.serialize('generated-rdf/troubleshooting.ttl', format='ttl')

<Graph identifier=N158fda171aa34d949667eab45c8d268f (<class 'rdflib.graph.Graph'>)>

In [27]:
import csv, rdflib, re

# Build the component / function / problem graph from four prompt-extracted
# TSV tables and write it out as Turtle.
g = rdflib.Graph()
base = rdflib.Namespace('http://Ameneh.org/aircraft.owl#')
# Bind the ontology namespace without naming a prefix.
g.bind(None, base)


def _name_to_uri(text):
    """Turn a free-text cell into a URI inside ``base``.

    Everything from the first '(' onward is dropped, the remainder is
    title-cased, and every character that is not an ASCII letter is removed.
    """
    return base[re.sub('[^a-zA-Z]', '', text.split('(')[0].title())]


def _read_tsv(path):
    """Return all rows of a tab-separated file as dicts.

    The file is read eagerly inside a ``with`` block so the handle is closed
    before the rows are processed.
    """
    with open(path) as handle:
        return list(csv.DictReader(handle, delimiter='\t'))


# --- problem / component / function ----------------------------------------
lines = _read_tsv('prompt-extracted/problem-component-function.tsv')
for line in lines:
    problem, component, function = [
        _name_to_uri(line[key])
        for key in ['defines', 'functionOf', 'Function']
    ]

    triples = [
        (function, rdflib.RDFS['subClassOf'], base['Function']),
        (function, rdflib.RDFS['label'], rdflib.Literal(line['Function'])),
        (function, base['defines'], problem),

        # The problem only receives a label here; it is not typed in this cell.
        (problem, rdflib.RDFS['label'], rdflib.Literal(line['defines'])),

        (component, rdflib.RDFS['subClassOf'], base['Component']),
        (component, rdflib.RDFS['label'], rdflib.Literal(line['functionOf'])),
        (component, base['hasFunction'], function),
    ]
    for t in triples:
        g.add(t)


# --- component -> function --------------------------------------------------
lines = _read_tsv('prompt-extracted/functions.tsv')
for line in lines:
    component, function = [
        _name_to_uri(line[key])
        for key in ['Component', 'hasFunction']
    ]

    triples = [
        (function, rdflib.RDFS['label'], rdflib.Literal(line['hasFunction'])),
        (function, rdflib.RDFS['subClassOf'], base['Function']),
        (component, base['hasFunction'], function),
        (component, rdflib.RDFS['subClassOf'], base['Component']),
    ]
    for t in triples:
        g.add(t)


# --- function hierarchy -----------------------------------------------------
lines = _read_tsv('prompt-extracted/subfunction.tsv')
for line in lines:
    # Column 'subFunctionOf' names the parent, column 'Function' the child.
    function, subfunction = [
        _name_to_uri(line[key])
        for key in ['subFunctionOf', 'Function']
    ]

    triples = [
        (subfunction, base['subFunctionOf'], function),
    ]
    for t in triples:
        g.add(t)

# --- component dependencies -------------------------------------------------
lines = _read_tsv('prompt-extracted/dependsOn.tsv')
for line in lines:
    c1, c2 = [
        _name_to_uri(line[key])
        for key in ['Component', 'dependsOn']
    ]

    triples = [
        (c1, base['dependsOn'], c2),
    ]
    for t in triples:
        g.add(t)


# Last expression of the cell: writes the Turtle file and displays the graph repr.
g.serialize('generated-rdf/functions.ttl', format='ttl')

<Graph identifier=N8c715219340a4ffda44d4f4ae3ddf9ae (<class 'rdflib.graph.Graph'>)>

# Maintenance logbook extraction tables

In [11]:
import csv, rdflib, re, tqdm
# Short aliases for the RDF / RDFS terms used when building triples in this cell.
a = rdflib.RDF['type']  # the 'type' term of the RDF namespace
subclassof = rdflib.RDFS['subClassOf']  # the 'subClassOf' term of the RDFS namespace
label = rdflib.RDFS['label']  # the 'label' term of the RDFS namespace

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemma(v):
  """Return the verb lemma of ``v``, upper-cased.

  ``v`` is lower-cased, passed to the module-level ``lemmatizer`` with
  ``pos='v'``, and the result is converted to upper case.
  """
  lowered = v.lower()
  base_form = lemmatizer.lemmatize(lowered, pos='v')
  return base_form.upper()

# Index every logbook row by its IDENT column so the extraction rows below can
# look up the original PROBLEM / ACTION text. 'utf-8-sig' strips a leading BOM
# if the file has one. The `with` block closes the file once the dict is
# built; the original left the handle open.
with open('Aircraft_Annotation_DataFile.csv', encoding='utf-8-sig') as events_file:
    events = {row['IDENT']: row for row in csv.DictReader(events_file)}

def _cylinder_digits(raw):
    """Return the individual digits found in ``raw``, or ``[None]`` when it is empty.

    ``[None]`` makes the caller emit exactly one part node without an
    ``atCylinder`` value. A non-empty ``raw`` containing no digit yields an
    empty list, i.e. no part node at all (unchanged from the original logic).
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return re.findall(r'\d', raw) if raw else [None]


def _add_involved_parts(graph, ns, subject, part_name, engine, cylinders):
    """Attach one blank part node per entry of ``cylinders`` to ``subject``.

    Each node gets the lower-cased ``part_name`` as label and is typed with a
    class named after the last word of ``part_name``; that class is declared a
    subclass of ``ns['Part']``. ``engine`` and the cylinder number are added
    when present. Nothing is added when ``part_name`` is empty or contains
    only whitespace.
    """
    words = part_name.split() if part_name else []
    if not words:
        return
    part_type = ns[words[-1].title()]
    part_label = rdflib.Literal(part_name.lower())
    for cyl in cylinders:
        part = rdflib.BNode()
        graph.add((subject, ns['involves'], part))
        graph.add((part, label, part_label))
        graph.add((part, a, part_type))
        graph.add((part_type, subclassof, ns['Part']))
        if engine:
            graph.add((part, ns['atEngine'], rdflib.Literal(engine)))
        if cyl:
            graph.add((part, ns['atCylinder'], rdflib.Literal(int(cyl))))


# One output graph per extraction method.
for source in ['regex', 'chatgpt_4o']:

    g = rdflib.Graph()
    base = rdflib.Namespace('http://Ameneh.org/aircraft.owl#')
    g.bind(None, base)

    # ---- problems ----------------------------------------------------------
    fname = f'log-extracted/problem_extractions_{source}.csv'
    # `with` closes the CSV as soon as its rows are consumed.
    with open(fname) as problems_file:
        lines = csv.DictReader(problems_file)
        for line in tqdm.tqdm(lines, desc=fname):
            # A row whose id is missing from `events` raises KeyError here.
            event = events[line['id']]
            event_uri = base['event-' + str(event['IDENT'])]
            problem = rdflib.BNode()
            g.add((event_uri, base['hasProblem'], problem))
            g.add((problem, rdflib.DC['description'], rdflib.Literal(event['PROBLEM'])))

            # id,part,problem,rest,engine,cyl
            # NOTE(review): the column list above says `cyl`, but the code
            # reads a column named `cylinders` - confirm the header.
            if line['problem']:
                # Lemmatise so inflected forms of a verb share one problem class.
                problem_type_label = lemma(line['problem']).title() + ' Problem'
                problemtype = base[problem_type_label.replace(' ', '')]
                g.add((problem, a, problemtype))
                g.add((problemtype, subclassof, base['Problem']))
                g.add((problemtype, label, rdflib.Literal(problem_type_label)))

            # Strip a single trailing 'S' exactly once per row. Doing this
            # inside the per-cylinder loop removed one more 'S' on every
            # iteration, so the parts of one row could end up with different
            # labels and types.
            part_name = re.sub('S$', '', line['part']) if line['part'] else line['part']
            _add_involved_parts(
                g, base, problem, part_name, line['engine'],
                _cylinder_digits(line['cylinders']),
            )

    # ---- actions -----------------------------------------------------------
    fname = f'log-extracted/action_extractions_{source}.csv'
    with open(fname) as actions_file:
        lines = csv.DictReader(actions_file)
        for line in tqdm.tqdm(lines, desc=fname):
            event = events[line['id']]
            event_uri = base['event-' + str(event['IDENT'])]
            action = rdflib.BNode()
            g.add((event_uri, base['hasAction'], action))
            g.add((action, rdflib.DC['description'], rdflib.Literal(event['ACTION'])))

            # id,part,action,engine,cyl
            if line['action']:
                action_type_label = line['action'].title() + ' Action'
                actiontype = base[action_type_label.replace(' ', '')]
                g.add((action, a, actiontype))
                g.add((actiontype, subclassof, base['Action']))
                g.add((actiontype, label, rdflib.Literal(action_type_label)))

            # NOTE(review): unlike the problem rows, the action part name is
            # used as-is (no trailing-'S' stripping) and the action verb is not
            # lemmatised. Kept unchanged - confirm whether that is intended.
            _add_involved_parts(
                g, base, action, line['part'], line['engine'],
                _cylinder_digits(line['cylinders']),
            )

    g.serialize(f'generated-rdf/extractions_{source}.ttl', format='ttl')

log-extracted/problem_extractions_regex.csv: 6169it [00:02, 2233.36it/s]
log-extracted/action_extractions_regex.csv: 6169it [00:01, 4475.45it/s]
log-extracted/problem_extractions_chatgpt_4o.csv: 6102it [00:03, 1719.19it/s]
log-extracted/action_extractions_chatgpt_4o.csv: 6065it [00:02, 2573.48it/s]
