# Knowledge Graphs Workshop: Pattern exercise

First, we'll load the dataset and do some basic pre-processing.
Then we'll show the table in a handy interface.

In [None]:
import pandas as pd
from itables import show
df = pd.read_csv('Aircraft_Annotation_DataFile.csv')
# Lower-case the column names so the text columns can be addressed as 'problem' / 'action'
df.columns = [c.lower() for c in df.columns]
# Trim whitespace *before* removing periods: stripping '.' first leaves the period
# in place whenever it is followed by a space (e.g. "... LEAKING. ").
# The last strip removes whitespace that was sitting between the text and a removed period.
df['problem'] = df['problem'].str.strip().str.strip('.').str.strip()
df['action'] = df['action'].str.strip().str.strip('.').str.strip()
show(df)

## Define a pattern for Problem strings

In this example pattern, we extract the location of the problem, the part, and a problem keyword.

Try to modify the pattern to extract more problem types, or make a different kind of problem pattern!

In [None]:
# Regular expression for problem descriptions, written as adjacent string pieces
# that Python concatenates into one pattern. The pieces are raw strings so that
# regex escapes such as \d and \w reach the regex engine untouched instead of
# being treated as (invalid) Python string escapes.
problem_pat = (
    r'^'
    # The location often mentions the engine side and cylinder number.
    # NOTE: the '.' in '(?: ?. \d)*' matches ANY single character between two
    # numbers (presumably meant for lists such as "#1 & 2" - confirm).
    r'(?P<location>(?:(?:L|R)/H (?:ENG )?)?(?:CYL ?)?(?:#?\d(?: ?. \d)*)(?: CYL ?)?)? ?'

    # A part name ends with a word character (letter, digit or underscore);
    # ignore the last "S" (for plural words)
    r'(?P<part>.*?\w)S? '

    # Match the verb but don't extract it
    r'(?:IS |ARE |HAS |HAVE )?(?:A )?'

    # Some pre-defined problem keywords to match
    r'(?P<problem>LEAK|LOOSE|TORN|CRACKED|BROKEN|DAMAGED|WORN|BAD|SHEAR|DIRTY)'
)
# One row per problem description, one column per named group of the pattern
problem_extractions = df['problem'].str.extract(problem_pat)

# Show the most common problem extractions
problem_counts = problem_extractions.fillna('').value_counts().rename('count')
show(problem_counts)

In [None]:
# Show non-matching problems: rows where every extracted group is missing
problem_unmatched = problem_extractions.isna().all(axis=1)
show(df.loc[problem_unmatched, 'problem'])

## Define a pattern for Action strings

In this example pattern, we extract the location of the action, the part, and an action keyword.

Try to modify the pattern to extract more action types, or make a different kind of action pattern!

In [None]:
# Regular expression for action descriptions, written as adjacent string pieces
# that Python concatenates into one pattern. The pieces are raw strings so that
# regex escapes such as \d and \w reach the regex engine untouched instead of
# being treated as (invalid) Python string escapes.
action_pat = (
    # Skip an optional "REMOVED & " lead-in and an optional "RE" prefix on the keyword
    r'^(?:REMOVED & )?(?:RE)?'
    # Pre-defined action keywords
    r'(?P<action>REPLACED|TIGHTENED|SECURED|ATTACHED|FASTENED|TORQUED|CLEANED|STOP DRILLED) ?'

    # The location often mentions the engine side and cylinder number.
    # NOTE: the '.' in '(?: ?. \d)*' matches ANY single character between two
    # numbers (presumably meant for lists such as "#1 & 2" - confirm).
    r'(?P<location>(?:(?:L|R)/H (?:ENG )?)?(?:CYL ?)?(?:#?\d(?: ?. \d)*)(?: CYL ?)?)? ?'

    # Often, replacements mention "W/ NEW"; ignore it
    r'(?:W/ )?(?:NEW )?'

    # A part name ends with a word character (letter, digit or underscore);
    # ignore the last "S" (for plural words)
    r'(?P<part>.*?\w)S?'

    # Ignore an optional trailing " W/ ..." remark; the pattern must reach the end of the string
    r'(?: W/ .*)?$'
)
# One row per action description, one column per named group of the pattern
action_extractions = df['action'].str.extract(action_pat)

# Show the most common action extractions
action_counts = action_extractions.fillna('').value_counts()
show(action_counts)

In [None]:
# Show non-matching actions: rows where every extracted group is missing
action_unmatched = action_extractions.isna().all(axis=1)
show(df.loc[action_unmatched, 'action'])

## Loading extractions into graph

Now, we'll transform our extractions into graphs and load them into the Knowledge Graph.

In [None]:
from helperFunctions import obj_to_triples
import re
from rdflib import Graph, URIRef, BNode, Literal, RDF, RDFS, DC, Namespace
# Namespace used below to build the IRIs of the entities, classes and properties in the graph
ZORRO = Namespace("https://zorro-project.nl/example/")

def create_problem_obj(row):
    """Turn one maintenance record into a nested dict for `obj_to_triples`.

    Args:
        row: Record exposing `ident`, `problem` and `action` attributes, as
            passed by `df.apply(create_problem_obj, axis=1)`.

    Returns:
        dict: '@id' holds the problem IRI (`ZORRO['problem<ident>']`); the
        other keys are RDF predicates. Class names are derived from the regex
        extractions and fall back to the bare `Problem` / `Action` class when
        the corresponding pattern did not match. An `involvedPart` value is
        None when no part was extracted.
    """
    def extract_fields(pattern, text):
        # Missing cells reach us as NaN (a float), which re.search rejects with
        # a TypeError; treat anything that is not a string as "no match"
        match = re.search(pattern, text) if isinstance(text, str) else None
        return match.groupdict() if match else {}

    def camelcase(fields, name):
        # Convert string into a clean CamelCase name. `or ''` also covers a named
        # group that exists in the pattern but did not take part in the match (None)
        return re.sub(r'\W', '', (fields.get(name) or '').title())

    def part_node(fields):
        # Describe the extracted part, or return None when no part was extracted
        if not fields.get('part'):
            return None
        return {
            RDF.type: ZORRO[camelcase(fields, 'part') + 'Part'],
            ZORRO.location: Literal((fields.get('location') or '').strip()),
        }

    problem_fields = extract_fields(problem_pat, row.problem)
    action_fields = extract_fields(action_pat, row.action)

    return {
        '@id': ZORRO[f'problem{row.ident}'],
        RDF.type: ZORRO[camelcase(problem_fields, 'problem') + 'Problem'],
        DC.description: Literal(row.problem),

        ZORRO.involvedPart: part_node(problem_fields),

        ZORRO.requiredAction: {
            DC.description: Literal(row.action),
            RDF.type: ZORRO[camelcase(action_fields, 'action') + 'Action'],

            ZORRO.involvedPart: part_node(action_fields),
        },
    }

# Show the turtle serialization of the first 5 extractions
g = Graph()
# Bind the empty prefix so ZORRO terms are abbreviated as ":name" in the output
g.namespace_manager.bind('', ZORRO)
sample_objs = df.head(5).apply(create_problem_obj, axis=1)
for problem_obj in sample_objs:
    for triple in obj_to_triples(problem_obj):
        g.add(triple)
print(g.serialize())

In [None]:
# Run on entire dataset, takes a few seconds!
g = Graph()
# Bind the empty prefix to the ZORRO namespace (names are case-sensitive: `zorro` is undefined)
g.namespace_manager.bind('', ZORRO)
for obj in df.apply(create_problem_obj, axis=1):
    for t in obj_to_triples(obj):
        g.add(t)
# Name the format explicitly so the .ttl file is Turtle whatever the rdflib default is
g.serialize('pattern_graph.ttl', format='turtle')

In [None]:
%load_ext ipython_sparql_pandas
from helperFunctions import GraphDB

db = GraphDB()
repo_name = 'zorro'
# Print the server's reply: a bare `.text` expression in the middle of a cell
# is evaluated and thrown away, so the outcome of the creation was never shown
print(db.create_repo(repo_name).text)

# Load the generated graph file into its own named graph
response = db.load_data(repo_name, 'pattern_graph.ttl', 
          graph_name = "https://zorro-project.nl/example/PatternGraph")
print(response.text)

In [None]:
%%sparql http://localhost:{db.port}/repositories/{repo_name} -s result
PREFIX : <https://zorro-project.nl/example/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

# Count the resources that match "?prob a :Problem" (the rdfs: prefix is declared but not used here)
select (count(*) as ?c) where { 
	?prob a :Problem .
}

### But didn't we add many more instances ..?

These are only the ones for which we couldn't extract a more specific problem class!

To retrieve all of our problem instances, we also load our maintenance schema:

In [None]:
# Load the maintenance schema into its own named graph and print the server's reply
response = db.load_data(
    repo_name,
    'maintenance.ttl',
    graph_name="https://zorro-project.nl/example/MaintenanceGraph",
)
print(response.text)

In [None]:
%%sparql http://localhost:{db.port}/repositories/{repo_name} -s result
PREFIX : <https://zorro-project.nl/example/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

# Same count of "?prob a :Problem" matches, run again now that the maintenance schema is loaded
# (any change in the count presumably comes from the repository's inference over the schema - confirm)
select (count(*) as ?c) where { 
	 ?prob a :Problem .
}

In [None]:
%%sparql http://localhost:{db.port}/repositories/{repo_name} -s result
PREFIX : <https://zorro-project.nl/example/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

# For every resource matching "?prob a :Problem", list each of its types and count the resources per type
select ?problemClass (count(*) as ?count) where { 
    ?prob a :Problem .
    ?prob a ?problemClass . 
}
GROUP BY ?problemClass

In [None]:
# One horizontal bar per problem class, sized by its count
# (`result` is presumably the table stored by the `-s result` option of the %%sparql cell above)
class_counts = result.set_index('problemClass')['count']
class_counts.plot.barh()