In [41]:
import pandas as pd
import string
import math
import numpy as np
from rdflib import Graph, URIRef, BNode, Literal, Namespace, plugins
from rdflib.namespace import RDF, RDFS, OWL, XSD
from rdflib.serializer import Serializer
import rdflib
import owlrl
from lookup import DBpediaLookup
from stringcmp import isub
from owlready2 import get_ontology
import Levenshtein as Lev
import re

## Initial Exploratory Data Analysis

In [42]:
df = pd.read_csv('/Users/Jake/Documents/uni/Semantic Web Technologies/coursework/INM713_coursework_data_pizza_8358_1_reduced.csv')
df.sample(10)

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
1616,J & G Restaurant,35 S Main St,East Granby,US,6026,CT,"American Restaurant,Italian Restaurant,America...",White Pizza (medium),12.5,USD,"Ricotta and parmesan cheese, garlic and herbs."
1909,Valentino Pizza I,5536 Torresdale Ave,Philadelphia,US,19124,PA,Pizza Place,Pizza Skins,3.95,USD,"With sour cream, sauce and mozzarella cheese"
3051,Bella Piazza,286 Us Highway 46,Fairfield,US,7004,NJ,Italian Restaurant,Gnocchi Bella Pizza,8.99,USD,
315,D. Vino Italian Food and Wine Bar - Monte Carlo,3770 Las Vegas Blvd S,Las Vegas,US,89109,NV,Restaurant,Caprese Pizza,14.0,USD,"Fresh Mozzarella, Slices of Fresh Roma Tomatoe..."
1541,Giovanni's Pizzeria,250 Sicklerville Rd,Sicklerville,US,8081,NJ,"Restaurants,Pizza Place,Pizza",Stuffed Cheese Pizza,13.5,USD,
1536,Giovanni's Pizzeria,250 Sicklerville Rd,Sicklerville,US,8081,NJ,"Restaurants,Pizza Place,Pizza",1 Ingredient Pizza,17.2,USD,
2075,Pasta Loft,241 Union Sq,Milford,US,3055,NH,"Italian Restaurant,Restaurant",White Pizza,11.99,USD,
497,Woolworth Tower Kitchen,233 Broadway,New York,US,10279,Nyc,"Restaurant,New York City",Grilled Pizza,10.0,USD,
3105,Casey's General Store,400 W Mondamin St,Minooka,US,60447,IL,"Restaurants,Food & Entertainment,Gas Station",Breakfast Pizza Slice,1.99,USD,
1256,Jerry's Pizza & Subs,924 W 436,Altamonte Springs,US,32714,FL,Pizza Place,Pizza By The Slice,2.0,USD,


In [43]:
df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Bianca Pizza,22.5,USD,
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Cheese Pizza,18.95,USD,
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes"


It can be seen that there a multiple different pizzas/items for one specific establishment, and so it may be necessary to combine the item name and the establishment name in to one variable to load in to the ontology, in order for easy reading and distinction, as well as not getting lots of duplicate named pizzas in each part of the pizza part of the ontology.

In [44]:
print(df['country'].unique())

['US']


All restaurants in this data set are in the US.

In [45]:
df.shape

(3510, 11)

There are 3510 observations.

In [46]:
# dropping the unnecessary currency column

df.drop('currency', axis = 1, inplace = True)

In [47]:
# checking for missing values

df.isna().sum()

name                   0
address                0
city                   0
country                0
postcode              65
state                  0
categories             0
menu item              0
item value           562
item description    1984
dtype: int64

There are 1984 observations which have missing item descriptions, something that may need addressing to retrieve suitable information about them. Postcodes and prices are also missing, but these cannot be deduced from other features.

## Data Wrangling

Before features can be engineered, which will likely involve some simple text processing, it is necessary to strip the data of punctuation and capitalisation of words, as well as any non_ascii characters.

In [48]:
# removal of non-ascii characters. solution found at:
# https://stackoverflow.com/questions/36340627/remove-non-ascii-characters-from-pandas-column

df['name'] = df['name'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
df['city'] = df['city'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
df['country'] = df['country'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
df['state'] = df['state'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
df['categories'] = df['categories'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
df['menu item'] = df['menu item'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))


# punctuation translator solution found from:
# https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate/34294022

# removing punctuation and upper case letters from all necessary string variables to make suitable for
# URI formatting formatting later.

# intializing translator
translator = str.maketrans('', '', string.punctuation)

# applying to cols (cols with NaN values will have to be dealt with differently)
df['name'] = [string.translate(translator).lower().replace('-', '_').replace(' ', '_') for string in df['name']]
df['city'] = [string.translate(translator).lower().replace('-', '_').replace(' ', '_') for string in df['city']]
df['country'] = [string.translate(translator).lower().replace('-', '_').replace(' ', '_') for string in df['country']]
df['state'] = [string.translate(translator).lower().replace('-', '_').replace(' ', '_') for string in df['state']]
df['categories'] = [string.translate(translator).lower().replace(',', '_').replace('-', '_').replace(' ', '_') for string in df['categories']]
df['menu item'] = [string.translate(translator).lower().replace('-', '_').replace(' ', '_') for string in df['menu item']]

# replacing 'us' with 'USA':
df['country'] = df['country'].apply(lambda x: x.replace('us', 'USA'))

In [49]:
# Applying the same changes to the string columns that contain NaN values

for idx, row in df.iterrows():
    if row['postcode'] == row['postcode']:
        df.iloc[idx, 4] = row['postcode'].translate(translator).lower().replace('-', '').replace(' ', '')
    if row['item description'] == row['item description']:
        df.iloc[idx, 9] = row['item description'].translate(translator).lower().replace('-', '').replace(' ', '_')

In [50]:
# Merging the name of establishment and item name in new col

df['item_name'] = df['menu item']+'_at_'+df['name']

In [51]:
# Creating dictionary with all 50 states with their respective abbreviations, in order to be able to link state column
# more successfully to Dbpedia:

states = {'al': 'Alabama', 'ak': 'Alaska', 'as': 'Arkansas', 'az': 'Arizona', 'ca': 'California', 'co': 'Colorado',
         'ct': 'Connecticut', 'de': 'Delaware', 'fl': 'Florida', 'ga': 'Georgia', 'hi': 'Hawaii', 'id': 'Idaho', 
         'il': 'Illinois', 'in': 'Indiana', 'ia': 'Iowa', 'ks': 'Kansas', 'ky': 'Kentucky', 'la': 'Louisianna', 
         'ma': 'Maryland', 'mi': 'Michigan', 'mn': 'Minnesota', 'mo': 'Missouri', 'ms': 'Mississipi', 'mt': 'Montana',
         'nv': 'Nevada', 'ne': 'Nebraska', 'nh': 'New_Hampshire', 'nj': 'New_Jersey', 'nm': 'New_Mexico', 
         'ny': 'New_York', 'nc': 'North_Carolina', 'nd': 'North_Dakota', 'oh': 'Ohio', 'ok': 'Oklahoma', 'or': 'Oregon',
         'pa': 'Pennsylvania', 'ri': 'Rhode_Island', 'sc': 'South_Carolina', 'sd': 'South_Dakota', 'tx': 'Texas',
         'tn': 'Tennessee', 'ut': 'Utah', 'vt': 'Vermont', 'va': 'Virginia', 'wy': 'Wyoming', 'wi': 'Wisconsin',
         'wa': 'Washington', 'wv': 'West_Virginia'}

# iterating over the df to change the states that have abbreviations to full state names:

for idx, row in df.iterrows():
    if row['state'] in states.keys():
        full_state = states[row['state']]
        df.iloc[idx, 5] = full_state


In [52]:
# intitializing new columns to one-hot encode different types of pizzas.

df[['Vegetarian_pizza', 'Meat_pizza', 'Seafood_pizza', 'Bianca', 'Hawaiian', 'Americana', 'Margherita', 'Dessert_pizza',
   'Thin_crust', 'Deep_dish']] = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## One-hot encoding of different classes/data properties

In [53]:
# initialising lit of buzzwords to search for for each category

bianca_search = ['bianca', 'white pizza', 'tomato free']
vegetarian_search = ['vegetarian', 'veggie', 'meat_free', 'meatfree', 'no_meat', 'vegetable', 'margherita', 'four_cheese',
                    'bufalina', 'cheese_pizza', 'pizza_mushroom', 'mushroom_pizza', 'pizza_putanesca',]
americana_search = ['american_pizza', 'americana', 'pepperoni_pizza', 'pizza_americana']
margherita_search = ['margherita']
hawaiian_search = ['hawaiian', 'pineapple']
meat_search = ['meat', 'chicken', 'beef', 'pork', 'lamb', 'pepperoni', 'pastrami', 'salami', 'sausage', 'bacon',
              'steak', 'chorizo', 'prosciutto', 'pancetta', 'sarda', 'diavola', 'capricciosa', 'mare_e_monti', 'ham']
seafood_search = ['tuna', 'prawn', 'shrimp', 'oyster', 'salmon', 'anchov', 'sardine', 'clams', 'cuttlefish',
                 'seafood', 'scallop', 'mussel', 'frutti_di_mare', 'crab']
thin_crust_search = ['thin_crust', 'thin_base', 'crispy_base', 'crispy_crust']
deep_dish_search = ['deep_dish', 'chicago_']
dessert_search = ['dessert', 'chocolate', 'nutella', 'marshmallow']

# applying one-hot encoding of different kinds of pizza through buzzword search
for idx, row in df.iterrows():
    
    for each in bianca_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 14] = 1
    
    for each in vegetarian_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 11] = 1
            
    for each in americana_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 16] = 1
            
    for each in margherita_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 17] = 1
            
    for each in hawaiian_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 15] = 1
            
    for each in meat_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 12] = 1
            
    for each in seafood_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 13] = 1
            
    for each in thin_crust_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 19] = 1
            
    for each in vegetarian_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 20] = 1
            
    for each in dessert_search:
        if each in row['menu item'] or each in str(row['item description']):
            df.iloc[idx, 18] = 1

            
# re-iterating to ensure no disjoint properties have been created e.g. meat & seafood pizza with vegetarian pizza

for idx, row in df.iterrows():
    if ((row['Vegetarian_pizza'] == 1) &
        (row['Meat_pizza'] == 1)):
        
        df.iloc[idx, 11] = 0
    
    if ((row['Vegetarian_pizza'] ==1) &
        (row['Seafood_pizza'] ==1)):
        
        df.iloc[idx, 11] = 0
        
    if ((row['Vegetarian_pizza'] ==1) &
        (row['Hawaiian'] ==1)):
        
        df.iloc[idx, 11] = 0
        
    if ((row['Meat_pizza'] ==1) &
        (row['Margherita'] ==1)):
        
        df.iloc[idx, 17] = 0
        
    if ((row['Seafood_pizza'] ==1) &
        (row['Margherita'] ==1)):
        
        df.iloc[idx, 17] = 0

In [54]:
df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,item description,...,Vegetarian_pizza,Meat_pizza,Seafood_pizza,Bianca,Hawaiian,Americana,Margherita,Dessert_pizza,Thin_crust,Deep_dish
0,little_pizza_paradise,Cascade Village Mall Across From Target,bend,USA,97701,Oregon,pizza_place,bianca_pizza,22.5,,...,0,0,0,1,0,0,0,0,0,0
1,little_pizza_paradise,Cascade Village Mall Across From Target,bend,USA,97701,Oregon,pizza_place,cheese_pizza,18.95,,...,1,0,0,0,0,0,0,0,0,1
2,the_brentwood,148 S Barrington Ave,los_angeles,USA,90049,brentwood,american_restaurantbarbakery,pizza_margherita,12.0,,...,1,0,0,0,0,0,1,0,0,1
3,the_brentwood,148 S Barrington Ave,los_angeles,USA,90049,brentwood,american_restaurantbarbakery,pizza_mushroom,13.0,,...,1,0,0,0,0,0,0,0,0,1
4,the_brentwood,148 S Barrington Ave,los_angeles,USA,90049,brentwood,american_restaurantbarbakery,pizza_puttenesca,13.0,olives_onions_capers_tomatoes,...,0,0,0,0,0,0,0,0,0,0


Any duplicates can now be dropped using the newly adjusted item name column which has standardised information for both the item and the estblishment that sells it. This will hopefully leave all unique entries in the data set.

In [55]:
print("before dropping duplicates: {}".format(df.shape))
df.drop_duplicates(subset = ['item_name'], inplace = True)
print("after dropping duplicates: {}".format(df.shape))

before dropping duplicates: (3510, 21)
after dropping duplicates: (3161, 21)


# 2.3 Tabular Data to Knowledge Graph (Task RDF)

### Subtask RDF.2

In [56]:
# initialising graph and namespace with prefix

g = Graph()
namespace = 'http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/'
jb = Namespace(namespace)
g.bind("jb", jb)


# adding creator annotation property

g.add((URIRef('http://www.semanticweb.org/jake/ontologies/2021/2/jbrown'), jb.Created_by, Literal('Jakob Brown', datatype = RDFS.Literal)))


# iterating over the df to create triples

for idx, row in df.iterrows():
    
    
# initalising URIs
    
    # name
    establishment = URIRef(namespace+row['name'])
    # city
    #city = URIRef(namespace+row['city'])
    # country
    #country = URIRef(namespace+row['country'])
    # state
    #state = URIRef(namespace+row['state'])
    # pizza
    pizza = URIRef(namespace+row['item_name'])
    
    
    
# adding classes for non-geographic domains
    
    # establishment
    g.add((establishment, RDF.type, jb.Establishment))
    
    
# differentiating the different kinds of pizza on prior encoding, allocating to their respective classes
    
    if row['Vegetarian_pizza'] == 1:
        g.add((pizza, RDF.type, jb.Vegetarian_pizza))
    if row['Meat_pizza'] == 1:
        g.add((pizza, RDF.type, jb.Meat_pizza))
    if row['Seafood_pizza'] ==1:
        g.add((pizza, RDF.type, jb.Seafood_pizza))
    if row['Bianca'] == 1:
        g.add((pizza, RDF.type, jb.Bianca))
    if row['Hawaiian'] == 1:
        g.add((pizza, RDF.type, jb.Hawaiian))
    if row['Americana'] ==1:
        g.add((pizza, RDF.type, jb.Americana))
    if row['Margherita'] == 1:
        g.add((pizza, RDF.type, jb.Margherita))
    if row['Dessert_pizza'] == 1:
        g.add((pizza, RDF.type, jb.Dessert_pizza))
        
    if ((row['Vegetarian_pizza'] == 0) &
    (row['Meat_pizza'] == 0) & 
    (row['Seafood_pizza'] ==0) &
    (row['Bianca'] == 0) &
    (row['Hawaiian'] == 0) &
    (row['Margherita'] == 0) &
    (row['Dessert_pizza'] == 0)):
        g.add((pizza, RDF.type, jb.Pizza))
        
        
# specifying base type object property
    
    if row['Thin_crust'] == 1:
        g.add((pizza, jb.Has_base, jb.Thin_crust))
    if row['Deep_dish'] == 1:
        g.add((pizza, jb.Has_base, jb.Deep_dish))
        
        
# other object properties (locations)
    
    # item to establishment
    g.add((pizza, jb.Served_at, establishment))
    g.add((establishment, jb.Serves, pizza))
    
    
# adding data properties
    
    # address
    g.add((establishment, jb.Has_address, Literal(row['address'], datatype = RDFS.Literal)))
    
    # price
    if row['item value'] == row['item value']:
        g.add((pizza, jb.Has_price, Literal(row['item value'], datatype = XSD.float)))
    
    # postcode
    if row['postcode'] == row['postcode']:
        g.add((establishment, jb.Has_postcode, Literal(row['postcode'], datatype = RDFS.Literal)))
    
    # establishment name
    g.add((establishment, jb.Has_name, Literal(row['name'], datatype = RDFS.Literal))) 
    
    # pizza name
    g.add((pizza, jb.Has_name, Literal(row['menu item'], datatype = RDFS.Literal)))

In [57]:
print("the number of triples at this stage (without reasoning or parsing ontology) are {}".format(len(g)))

the number of triples at this stage (without reasoning or parsing ontology) are 19952


### Subtask RDF.3

### For the cells in the columns city, country and state; instead of creating new URIs (e.g., new individuals) for the information in the table cells, reuse an entity URI from DBPedia, Wikidata or Google’s Knowledge Graph

In [58]:
# Slightly adapting the original function provided in lab 6 for non-OOP approach
# with the hope of filtering out irrelevant resources that lexical similarity measures may return
# by specifying types of information desired e.g. geographical/locations/places/populatedplaces:

def getExternalKGURI(name, place_type, attempts = 1):
        
        
        # "US" is the only country in the df, so the United States resource can always be returned
        if place_type == 'Country':
            uri = "http://dbpedia.org/resource/United_States"
            return uri
        else:
            # intialising DBpedia lookup
            dbpedia = DBpediaLookup() 
        
            # identifying the entities that are similar lexically
            entities = dbpedia.getKGEntities(name, 5)
            if entities == []: # <-- if there are no results to iterate over
                #print("no matches found")
                outcome = ''
                return outcome
        
            else:  
                current_sim = -1
                current_uri=''
                for ent in entities:
            
                    types = ent.types #  isolating the set of "types" each resource belongs to
            
            
                    if place_type == "City":
                
                        # filters out returned resources which do not have "City" as one of its types
                        if 'http://dbpedia.org/ontology/City' in types:
                            isub_score = isub(name, ent.label) 
                            if current_sim < isub_score:
                                current_uri = ent.ident
                                current_sim = isub_score
                                     
            
            
                    elif place_type == "State":
                
                        # filters out returned resources which do not have "Location" as one of its types
                        # as there did not seem to be a "state" type, for US states at least
                        if 'http://dbpedia.org/ontology/Place' in types:
                            isub_score = isub(name, ent.label) 
                            if current_sim < isub_score:
                                current_uri = ent.ident
                                current_sim = isub_score
                    
                
                        
            
                return current_uri


In [59]:
# adding all locatable DBpedia resources using the above function

    
# initialising DBpedia namespace

dbp_namespace = "http://dbpedia.org/resource/"
dbp = Namespace(dbp_namespace)
g.bind("dbp", dbp)
    
    
# iterating over the geographical features (city, state, and country)
# to check for relevant DBP URIs which can be entwined and added to the present KG:
    
for idx, row in df.iterrows():
        
    try:
        # city
        dbp_URI_city = getExternalKGURI(row['city'], place_type = "City")
        #print(dbp_URI_city)
        if dbp_URI_city != '': # <- if a similar dbpedia uri was found
            city = URIRef(dbp_URI_city) # <- assign it as the URI 
        else:
            city = URIRef(namespace+row['city']) # <- else use the name found in the data set with the jb namespace
            
        # state
        dbp_URI_state = getExternalKGURI(row['state'], place_type = "State")
        #print(dbp_URI_state)
        if dbp_URI_state != '': # <- if a similar dbpedia uri was found
            state = URIRef(dbp_URI_state) # <- assign it as the URI 
        else:
            state = URIRef(namespace+row['state']) # <- else use the name found in the data set with the jb namespace
            

        # country    
        dbp_URI_country = getExternalKGURI(row['country'], place_type = "Country")
        #print(dbp_URI_country)
        if dbp_URI_country != '': # <- if a similar dbpedia uri was found
            country = URIRef(dbp_URI_country) # <- assign it as the URI
        else:
            country = URIRef(namespace+row['country']) # <- else use the name found in the data set with the jb namespace
            
    except Exception as e:
        print(e)
    
    
    # intialising establishment URI to draw object property connections with new DBpedia URIs
    establishment = URIRef(namespace+row['name'])

        
    # adding triples (classes)
    g.add((city, RDF.type, jb.City))
    g.add((state, RDF.type, jb.State))
    g.add((country, RDF.type, jb.Country))
    
    # adding object properties
    g.add((establishment, jb.Is_located_in, city))
    g.add((city, jb.Is_located_in, state))
    g.add((state, jb.Is_located_in, country))
    g.add((country, jb.Has_location, state))
    g.add((state, jb.Has_location, city))
    g.add((city, jb.Has_location, establishment))
    
    
    # adding data properties 
    g.add((city, jb.Has_name, Literal(row['city'], datatype = RDFS.Literal)))
    g.add((state, jb.Has_name, Literal(row['state'], datatype = RDFS.Literal)))
    g.add((country, jb.Has_name, Literal(row['country'], datatype = RDFS.Literal)))

In [60]:
print("the number of triples at this stage (without reasoning or parsing ontology) are {}".format(len(g)))

g.serialize('RDF_data.ttl', format = 'ttl')
# parsing the ontology
g.parse("base_ontology.ttl", format = 'ttl')
g.serialize("ontology_with_data.ttl", format = 'ttl')
print("the number of triples at this stage (with parsing ontology) are {}".format(len(g)))

the number of triples at this stage (without reasoning or parsing ontology) are 25704
the number of triples at this stage (with parsing ontology) are 26021


# 2.4 SPARQL and Reasoning (Task SPARQL)

### Subtask SPARQL.1 Perform reasoning with the created ontology and the generated data. Save the extended graph in turtle format 

In [61]:
# defining function implementing SQL ask query
def ask_query(g, fact, number):
    
    qres = g.query(
    
    """ASK {""" + fact + """ }""")
    
    number = number
    for row in qres:
        print("does triple {} hold? Answer: {}".format(number, row))
        
# defining reasoning function to perform reasoning with owlrl semantics, as well as check some specified entailments 
def perform_reasoning():
    
    g = Graph()
    g.parse('ontology_with_data.ttl', format = 'ttl')
    
    print("the number of triples before reasoning: {}".format(len(g)))
    
    # RDFS reasoning using owlrl semantics:
    owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=True, datatype_axioms=False).expand(g)

    
    print("the number of triples after reasoning: {}".format(len(g)))

    # intialising some triples to check their entailment:
    
    t1 = "jb:Bianca rdfs:subClassOf jb:Specific_pizza ."
    t2 = "dbp:United_States a jb:Country ."
    t3 = "jb:Vegetarian_pizza rdfs:subClassOf jb:Pizza ."
    t4 = "jb:Anchovy rdfs:subClassOf jb:Seafood_topping ."
    
    
    # checking entailments using SPARQL ASK query:
    print("\nChecking entailments")
    ask_query(g, t1, 1)
    ask_query(g, t2, 2)
    ask_query(g, t3, 3)
    ask_query(g, t4, 4)
    
    # save new extended graph:
    print("Saving extended graph...")
    g.serialize(destination = 'ontology_with_data_post_reasoning.ttl', format = 'ttl')
    print("graph saved")

    
perform_reasoning() 

the number of triples before reasoning: 26021
the number of triples after reasoning: 109005

Checking entailments
does triple 1 hold? Answer: True
does triple 2 hold? Answer: True
does triple 3 hold? Answer: True
does triple 4 hold? Answer: True
Saving extended graph...
graph saved


### Subtask SPARQL.2 Return all the details of the restaurants that sell pizzas without tomate (i.e. pizza bianca). Return the results as a CSV file 

In [62]:
g = Graph()
g.parse('ontology_with_data_post_reasoning.ttl', format = 'ttl')

<Graph identifier=N415d1b66a113493f8cca738833150715 (<class 'rdflib.graph.Graph'>)>

In [63]:
qres = g.query(

""" 
PREFIX jb: <http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?establishment ?address ?postcode (SAMPLE(?city) AS ?city) (SAMPLE(?state) AS ?state) ?country
WHERE {
?establishment jb:Serves ?pizza .
?pizza rdf:type jb:Bianca .
?establishment jb:Has_address ?address .
?establishment jb:Has_postcode ?postcode .
?establishment jb:Is_located_in ?city .
?city rdf:type jb:City .
?city jb:Is_located_in ?state .
?state rdf:type jb:State .
?state jb:Is_located_in ?country .
?country rdf:type jb:Country .

}
GROUP BY ?establishment

""")

# creating and opening csv file
f_out = open("SPARQL_task2_query_results.csv","w+")

count = 0
for row in qres:
    
    count+=1
    
    # writing results to csv file
    line_str = '\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"\n' % (row.establishment, row.address, row.postcode, row.city, row.state, row.country)
    f_out.write(line_str)
            
     
f_out.close()

print("There are a total of {} restaurants that serve a bianca pizza. Details of these establishments have\
 been written to the above csv file.".format(count))

There are a total of 18 restaurants that serve a bianca pizza. Details of these establishments have been written to the above csv file.


### Subtask SPARQL.3 Return the average prize of a Margherita pizza

In [64]:
qres = g.query(

""" 
PREFIX jb: <http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (AVG(?price) AS ?avgprice) 
WHERE
{
?pizza rdf:type jb:Margherita .
?pizza jb:Has_price ?price . }

""")

# printing result
for row in qres:
    print("The average price, as can be seen above, of a Margherita pizza, is {} USD.\
 This result will not be returned as a CSV file, seeing as there is only one value\
 to return and it can be clearly seen above.".format(round(float(row.avgprice), 2)))

The average price, as can be seen above, of a Margherita pizza, is 12.44 USD. This result will not be returned as a CSV file, seeing as there is only one value to return and it can be clearly seen above.


### Subtask SPARQL.4 Return number of restaurants by city, sorted by state and number of restaurants

In [65]:
qres = g.query(

""" 
PREFIX jb: <http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT (COUNT(DISTINCT ?restaurant) AS ?count) ?city ?state
WHERE {
?restaurant jb:Is_located_in ?city .
?city rdf:type jb:City .
?city jb:Is_located_in ?state .
?state rdf:type jb:State .
}
GROUP BY ?city
ORDER BY ?state DESC(?count)

""")
    

row_limit = 30
for row in qres:
    
    # printing results
    if row_limit > 0:
        print("restaurants: "+str(row[0])+" -city: "+str(row[1])+" -state: "+str(row[2]))
        row_limit -=1

restaurants: 2 -city: http://dbpedia.org/resource/Bayonne,_New_Jersey -state: http://dbpedia.org/resource/45th_Street_station_(Hudson–Bergen_Light_Rail)
restaurants: 1 -city: http://dbpedia.org/resource/Novato,_California -state: http://dbpedia.org/resource/Al-Sanamayn
restaurants: 1 -city: http://dbpedia.org/resource/Mobile,_Alabama -state: http://dbpedia.org/resource/Alabama
restaurants: 1 -city: http://dbpedia.org/resource/Montgomery,_Alabama -state: http://dbpedia.org/resource/Alabama
restaurants: 1 -city: http://dbpedia.org/resource/Fairbanks,_Alaska -state: http://dbpedia.org/resource/Alaska
restaurants: 4 -city: http://dbpedia.org/resource/Phoenix,_Arizona -state: http://dbpedia.org/resource/Arizona
restaurants: 3 -city: http://dbpedia.org/resource/Scottsdale,_Arizona -state: http://dbpedia.org/resource/Arizona
restaurants: 1 -city: http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/cave_creek -state: http://dbpedia.org/resource/Arizona
restaurants: 1 -city: http://dbpedia

The query results in their entirety have been saved to the above csv file.

### Subtask SPARQL.5 Return the list of restaurants with missing postcode

In [66]:
qres = g.query(

""" 
PREFIX jb: <http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT DISTINCT ?res
WHERE {
?res rdf:type jb:Establishment .
FILTER NOT EXISTS {?res jb:Has_postcode ?postcode}
}


""")



count = 0
for row in qres:
    
    count +=1
            
    

print("There appear to be a total of {} restaurants that do not have a postcode.\
 Details in the csv file made above".format(count))

There appear to be a total of 12 restaurants that do not have a postcode. Details in the csv file made above


# 2.5 Ontology Alignment (Task OA)

### Subtask OA.1 Compute equivalences between the entities of the input ontologies

In [67]:
# defining ontology loading function from lab 8 code "AccessEntityLabels.py"
# slightly adapted to suit present needs

def getClasses(onto):        
    return onto.classes()
    
def getDataProperties(onto):        
    return onto.data_properties()
    
def getObjectProperties(onto):        
    return onto.object_properties()
    
def getIndividuals(onto):    
    return onto.individuals()


def getRDFSLabelsForEntity(entity):
    #if hasattr(entity, "label"):
    return entity.label


def getRDFSLabelsForEntity(entity):
    #if hasattr(entity, "label"):
    return entity.label    


def processClasses(uri):
    
    #Method from owlready
    onto = get_ontology(uri).load()
    # number of classes
    print("Classes in {}: {}".format(str(uri), len(list(getClasses(onto)))))
    
    class_list = []
    for cls in getClasses(onto): 
        # class names
        print("\t"+cls.name)
        class_list.append(cls.name)
    
    return class_list

def processDataProperties(uri):
    
    #Method from owlready
    onto = get_ontology(uri).load()
    # number of data properties
    print("Data Properties in {}: {}".format(str(uri), len(list(getDataProperties(onto))))) 
    
    data_properties_list = []
    for dp in getDataProperties(onto): 
        # data property names
        print("\t"+dp.name)
        data_properties_list.append(dp.name)
    
    return data_properties_list

def processObjectProperties(uri):
    
    #Method from owlready
    onto = get_ontology(uri).load()
    # number of object properties
    print("Object Properties in {}: {}".format(str(uri), len(list(getObjectProperties(onto)))))
    
    object_properties_list = []
    for op in getObjectProperties(onto): 
        # object property names
        print("\t"+op.name)
        object_properties_list.append(op.name)
    
    return object_properties_list    

In [68]:
# creating lists of classes for each ontology
pi_uri="pizza.owl"
jb_uri = "base_ontology.owl"

pi_classes = processClasses(pi_uri)
jb_classes = processClasses(jb_uri)

Classes in pizza.owl: 99
	Pizza
	PizzaBase
	Food
	Spiciness
	PizzaTopping
	American
	NamedPizza
	MozzarellaTopping
	PeperoniSausageTopping
	TomatoTopping
	AmericanHot
	HotGreenPepperTopping
	JalapenoPepperTopping
	AnchoviesTopping
	FishTopping
	ArtichokeTopping
	VegetableTopping
	Mild
	AsparagusTopping
	Cajun
	OnionTopping
	PeperonataTopping
	PrawnsTopping
	TobascoPepperSauce
	CajunSpiceTopping
	HerbSpiceTopping
	Hot
	RosemaryTopping
	CaperTopping
	Capricciosa
	HamTopping
	OliveTopping
	Caprina
	GoatsCheeseTopping
	SundriedTomatoTopping
	CheeseTopping
	CheeseyPizza
	CheeseyVegetableTopping
	ChickenTopping
	MeatTopping
	Country
	DomainConcept
	DeepPanBase
	ThinAndCrispyBase
	ValuePartition
	Fiorentina
	GarlicTopping
	ParmesanTopping
	SpinachTopping
	FourCheesesTopping
	FourSeasons
	MushroomTopping
	FruitTopping
	FruttiDiMare
	MixedSeafoodTopping
	Medium
	Giardiniera
	LeekTopping
	PetitPoisTopping
	SlicedTomatoTopping
	GorgonzolaTopping
	GreenPepperTopping
	PepperTopping
	HotSpicedBeefTo

In [69]:
# defining function to adapt lexical formatting of created ontology entities to that of pizza.owl
# e.g. removing underscores between words, and capitalizing the start of each new word
def jb2pi_syntax(list):
    for idx, each in enumerate(list):
        if '_' in each:
            words = each.split('_')
            for i in range(1, len(words)):
                words[i] = words[i].capitalize()
            adapted_word = ''.join(words)
            list[idx] = adapted_word
    return list

# defining a function to revert a word from pizza.owl syntax to jb syntax
# e.g. only capitalizing the first letter in the entitiy name and placing underscores between each word
def pi2jb_syntax(word):
    words = re.findall('[A-Z][a-z]*', word)
    words = [s.lower() for s in words]
    joined = '_'.join(words)
    return joined.capitalize()

# defining a function to look at the suggested matches yielded for the below lexical matching code
# which can then be adapted accordingly to find the best matcher.
def evaluate_matcher(dic):
    for key, value in dic.items():
        print("Is {} equal to {}?".format(key, value))

In [70]:
g = Graph()

pizza_owl_namespace = 'http://www.co-ode.org/ontologies/pizza#'
jb_namespace = 'http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/'
pi = Namespace(pizza_owl_namespace)
jb = Namespace(jb_namespace)
g.bind("pi", pi)
g.bind("jb", jb)


# matching classes

# modifying the terms in jb_classes to bear better lexical similarity to pizza.owl in order to find more matches
jb_classes = jb2pi_syntax(jb_classes)

# intialising dictionary to allow evalution of lexical matcher and look at proposed equivalent classes

jw = {}
match_count = 0

# iterating over the classes, creating triples for apparent matches
for each in pi_classes:
    
    # if there is an identical match:
    if each in jb_classes:
        
        pi_version = URIRef(pizza_owl_namespace+str(each))
        jb_version = URIRef(jb_namespace+str(pi2jb_syntax(each)))
        
        # adding owl:equivalentClass triple
        g.add((pi_version, OWL.equivalentClass, jb_version))
        
        
    
    elif (("Topping" in each) & (len(str(each)) > 7)):
        if each == "PizzaTopping":
            
            pi_version = URIRef(pizza_owl_namespace+str(each))
            jb_version = URIRef(jb_namespace+str("Pizza_topping"))
            
            # adding owl:equivalentClass triple
            g.add((pi_version, OWL.equivalentClass, jb_version))
            

        else:
            topping = each.split("Topping")[0]
        
            if topping in jb_classes:
                
                pi_version = URIRef(pizza_owl_namespace+str(each))
                jb_version = URIRef(jb_namespace+str(topping))
                
                # adding owl:equivalentClass triple
                g.add((pi_version, OWL.equivalentClass, jb_version))
                
            else:
                
                # leaving the rest to relatively high standard lexical matching, to prevent mismatches
                for word in jb_classes:
                    jw_score = Lev.jaro_winkler(topping, word)
                    
                    # ascertained the best threshold with limited to no mismatches through trial and error
                    threshold = 0.9
                    
                    # filtering out one condition which was not a correct matching of equivalent classes:
                    if ((topping == "Vegetarian") & (word == "VegetarianPizza")):
                        continue
                    else:
                        if jw_score > threshold:
                        
                            jw[each] = word
                        
                            pi_version = URIRef(pizza_owl_namespace+str(each))
                            jb_version = URIRef(jb_namespace+str(pi2jb_syntax(word)))
                        
                            # adding owl:equivalentClass triple
                            g.add((pi_version, OWL.equivalentClass, jb_version))
    else: 
        
        # utilising the same lexical matcher for classes that are not toppings
        for word in jb_classes:
            
            jw_score = Lev.jaro_winkler(each, word)
            # ascertaining the best threshold with limited to no mismatches through trial and error
            threshold = 0.9
            
            # filtering out one condition which was passing through some mismatched entities as equivalent classes
            if (("Equivalent" in each) & (word == "VegetarianFriendlyTopping")):
                continue
            else:
                
                if jw_score > threshold:
                
                    jw[each] = word
                    pi_version = URIRef(pizza_owl_namespace+str(each))
                    jb_version = URIRef(jb_namespace+str(pi2jb_syntax(word)))
                    g.add((pi_version, OWL.equivalentClass, jb_version))

In [71]:
len(g)

20

In [72]:
evaluate_matcher(jw)

Is American equal to Americana?
Is AmericanHot equal to Americana?
Is AnchoviesTopping equal to Anchovy?
Is PepperTopping equal to Pepperoni?
Is MeatyPizza equal to MeatPizza?
Is VegetarianPizzaEquivalent1 equal to VegetarianPizza?
Is VegetarianTopping equal to VegetarianFriendlyTopping?
Is VegetarianPizzaEquivalent2 equal to VegetarianPizza?


In [73]:
# creating list of object properties for each ontology

pi_op = processObjectProperties(pi_uri)
jb_op = processObjectProperties(jb_uri)

Object Properties in pizza.owl: 8
	hasBase
	hasIngredient
	isBaseOf
	hasCountryOfOrigin
	isIngredientOf
	hasSpiciness
	hasTopping
	isToppingOf
Object Properties in base_ontology.owl: 10
	Has_base
	Has_ingredient
	Has_location
	Has_topping
	Is_base_of
	Is_ingredient_of
	Is_located_in
	Is_topping_of
	Served_at
	Serves


In [74]:
# matching object properties

# defining function to transform ontology entity syntax to match the other, increase likelihood of correct matching
# slightly different to the class syntax (no capital letter for first word with properties)
def jb2pi_syntax(list):
    for idx, each in enumerate(list):
        if '_' in each:
            words = each.split('_')
            for i in range(0, len(words)):
                if i < 1:
                    words[i] = words[i].lower()
                else:
                    words[i] = words[i].capitalize()
            adapted_word = ''.join(words)
            list[idx] = adapted_word
    return list

In [75]:
jb_op = jb2pi_syntax(jb_op)

# iterating over pi object properties
for each in pi_op:
    
    # if the same object property is present in jb object properties:
    if each in jb_op:
        
        # create URIs
        pi_version = URIRef(pizza_owl_namespace+str(each))
        jb_version = URIRef(jb_namespace+str(pi2jb_syntax(each)))
        
        # adding owl:equivalentProperty triple
        g.add((pi_version, OWL.equivalentProperty, jb_version))  
    

In [76]:
print(len(g))
g.serialize('ontology_alignments.ttl', format = 'ttl')

26


### Subtask OA.2 Perform reasoning with (i) the created ontology, (ii) the pizza.owl ontology and (iii) the computed alignment (without the data) and list the number of unsatisfiable classes

In [77]:
def perform_reasoning(file, parsing_format = 'ttl'):
    
    g = Graph()
    g.parse(file, format = parsing_format)
    
    print("the number of triples before reasoning: {}".format(len(g)))
    
    # RDFS reasoning using owlrl semantics:
    owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=True, datatype_axioms=False).expand(g)
    
    print("the number of triples after reasoning: {}".format(len(g)))
    
    unsatisfiable_count = 0
    for s, p, o in g:
        if OWL.Nothing in o:
            unsatisfiable_count +=1
        if OWL.Nothing in p:
            unsatisfiable_count +=1
        if OWL.Nothing in s:
            unsatisfiable_count +=1
    
    print("the number of unsatisfiable classes in this reasoned ontology are {}".format(unsatisfiable_count))

In [78]:
# reasoning with i: the created ontology:

perform_reasoning('base_ontology.ttl')

the number of triples before reasoning: 317
the number of triples after reasoning: 2579
the number of unsatisfiable classes in this reasoned ontology are 113


In [80]:
# reasoning with ii: the pizza.owl ontology

perform_reasoning('pizza_owl.ttl')

the number of triples before reasoning: 1944
the number of triples after reasoning: 13638
the number of unsatisfiable classes in this reasoned ontology are 397


In [81]:
# reasoning with iii: the aligned ontology

g = Graph()
g.parse('base_ontology.ttl', format = 'ttl')
g.parse('pizza_owl.ttl', format = 'ttl')
g.parse('ontology_alignments.ttl', format = 'ttl')

print("the number of triples before reasoning: {}".format(len(g)))
    
# RDFS reasoning using owlrl semantics:
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=True, datatype_axioms=False).expand(g)
    
print("the number of triples after reasoning: {}".format(len(g)))
    
unsatisfiable_count = 0
for s, p, o in g:
    if OWL.Nothing in o:
         unsatisfiable_count +=1
    if OWL.Nothing in p:
         unsatisfiable_count +=1
    if OWL.Nothing in s:
        unsatisfiable_count +=1
    
print("the number of unsatisfiable classes in this reasoned ontology are {}".format(unsatisfiable_count))


the number of triples before reasoning: 2287
the number of triples after reasoning: 16240
the number of unsatisfiable classes in this reasoned ontology are 491


reasoning with iv: the aligned ontology, including the RDF data.

*for some reason, although there appear to be empty interpretations of things for all 3 previous ontologies,
loading them in to protege proved no problem with no unsatisfiables other than ice cream
so, Subtask OA.2.b was carried out, querying for the 'MeatyPizzas' using the pizza.owl ontology term.

In [84]:
g = Graph()
g.parse('RDF_data.ttl', format = 'ttl')
g.parse('base_ontology.ttl', format = 'ttl')
g.parse('pizza_owl.ttl', format = 'ttl')
g.parse('ontology_alignments.ttl', format = 'ttl')

g.serialize('aligned_ontology_with_data.ttl', format = 'ttl')

print("the number of triples before reasoning: {}".format(len(g)))
    
# RDFS reasoning using owlrl semantics:
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=True, datatype_axioms=False).expand(g)
    
print("the number of triples after reasoning: {}".format(len(g)))
    
unsatisfiable_count = 0
for s, p, o in g:
    if OWL.Nothing in o:
         unsatisfiable_count +=1
    if OWL.Nothing in p:
         unsatisfiable_count +=1
    if OWL.Nothing in s:
        unsatisfiable_count +=1
    
print("the number of unsatisfiable classes in this reasoned ontology are {}".format(unsatisfiable_count))

the number of triples before reasoning: 27991
the number of triples after reasoning: 128665
the number of unsatisfiable classes in this reasoned ontology are 491


### Subtask OA.2.b - Create a query to return the pizzas with type pizza:MeatyPizza.

In [85]:
qres = g.query(
""" 
PREFIX jb: <http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/>
PREFIX pizza: <http://www.co-ode.org/ontologies/pizza#>

SELECT DISTINCT ?pizzas
WHERE {

?pizzas rdf:type pizza:MeatyPizza .

}


""")

# creating and opening csv file
f_out = open("OA_meatypizza_query_results.csv","w+")

row_limit = 10
print("Sample of returned meaty pizzas")
for row in qres:
    
    # printing results
    if row_limit > 0:
        print(str(row[0]))
        row_limit -=1
    
    # writing results to csv file
    line_str = '\"%s\"\n' % (str(row[0]))
    f_out.write(line_str)
            
     
f_out.close()



Sample of returned meaty pizzas
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/pepperoni_pizza_at_marzanos
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/bbq_chicken_pizza_at_bearnos_pizza
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/sunny_side_up_farm_egg_pizza_at_mc_kitchen
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/chicken_pesto_gourmet_pizza_at_nolita
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/steak_and_cheese_pizza_at_stone_and_paddle
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/hawaiian_pizza_at_good_fellas_pizza
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/chicken_and_broccoli_pizza_medium_at_j__g_restaurant
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/cheesesteak_pizza_at_riccardos_pizza
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/philly_cheesesteak_pizza_at_phat_boyz_new_york_style_pizzeria
http://www.semanticweb.org/jake/ontologies/2021/2/jbrown/buffalo_chicken_pizza_a