In [1]:
# !pip install owlready2

## Retrieve ontology

In [2]:
from owlready2 import *
import pandas as pd
import json
import math



In [None]:
# change path below to match your local copy of the .owl file you want to use

In [3]:
# Load the business ontology into `onto`.
# NOTE(review): the URI below is an absolute, machine-specific file:// path; it has to be
# edited to point at the local copy of business.owl before this cell runs elsewhere.
onto = get_ontology("file:///Users/kevinlin/Documents/classes/cs270/final-project/cs270-final-project/ontology/business.owl").load()

## Retrieve & prepare dataset

In [4]:
# Path to the pickled business table, relative to the notebook's working directory.
business_pkl = '../yelp_dataset/business.pkl'

In [5]:
# Read the pickled business table into a DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
business_df = pd.read_pickle(business_pkl)

In [6]:
def _python_dict_text_to_json(column):
    """Rewrite Python-dict-style text (single quotes, bare True/False) as JSON text.

    The boolean flags are deliberately turned into the *strings* "True"/"False",
    because later cells compare the expanded columns against the string 'True'.
    All replacements are literal (regex=False), so the result does not depend on
    the pandas version's default for `regex`.
    """
    return (
        column.str.replace("'", '"', regex=False)
              .str.replace("False", '"False"', regex=False)
              .str.replace("True", '"True"', regex=False)
    )


# Defaults used when a business has no value for the attribute at all.
NO_DIETARY_RESTRICTIONS = "{'dairy-free': False, 'gluten-free': False, 'vegan': False, 'kosher': False, 'halal': False, 'soy-free': False, 'vegetarian': False}"
NO_AMBIENCE = "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}"

# retrieve businesses whose 'categories' text contains both 'Restaurants' and 'Food'
df = business_df[business_df['categories'].notnull()]
df = df[df['categories'].str.contains('Restaurants')]
# .copy() makes df an independent frame, so the column assignments below do not
# trigger pandas' chained-assignment (SettingWithCopy) warning.
df = df[df['categories'].str.contains('Food')].copy()

# parse 'attributes.DietaryRestrictions': missing / 'None' values become the all-False default,
# then every flag is expanded into its own column ('vegan', 'vegetarian', ...)
df['attributes.DietaryRestrictions'] = df['attributes.DietaryRestrictions'].replace([float('nan'), 'None'], NO_DIETARY_RESTRICTIONS)
df['attributes.DietaryRestrictions'] = _python_dict_text_to_json(df['attributes.DietaryRestrictions'])
df = df.join(df['attributes.DietaryRestrictions'].apply(json.loads).apply(pd.Series))

# parse 'attributes.Ambience' attribute: same idea, but here a 'None' anywhere in the text
# (including as an individual flag value) is additionally mapped to "False"
df['attributes.Ambience'] = df['attributes.Ambience'].replace(float('nan'), NO_AMBIENCE)
df['attributes.Ambience'] = _python_dict_text_to_json(df['attributes.Ambience']).str.replace("None", '"False"', regex=False)
df = df.join(df['attributes.Ambience'].apply(json.loads).apply(pd.Series))

# ... add more as necessary

In [32]:
#df.columns

In [35]:
# Display the prepared restaurant frame; the recorded output is a truncated rendering
# that ends with the ambience flag columns added by the join above.
df

Unnamed: 0,hours.Wednesday,attributes.RestaurantsDelivery,attributes.Open24Hours,attributes.DogsAllowed,attributes.CoatCheck,postal_code,attributes.Smoking,hours.Thursday,attributes.DietaryRestrictions,city,...,0,casual,classy,divey,hipster,intimate,romantic,touristy,trendy,upscale
0,11:0-23:0,,,False,,80302,,11:0-23:0,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Boulder,...,,True,False,False,False,False,False,False,False,False
12,11:0-21:0,True,,True,,01960,,11:0-21:0,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Peabody,...,,True,False,False,False,False,False,False,False,False
13,11:0-18:0,False,,False,,32806,,11:0-18:0,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Orlando,...,,False,False,False,False,False,False,False,False,False
16,7:0-22:0,,,,,32830,,7:0-22:0,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Orlando,...,,False,False,False,False,False,False,False,False,False
19,,True,,,,32809,,,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Pine Castle,...,,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160535,10:0-21:0,True,,,,43230,,10:0-21:0,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Columbus,...,,True,False,False,False,False,False,False,False,False
160544,11:0-21:30,True,,True,,32828,,11:0-21:30,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Orlando,...,,True,True,False,True,False,False,False,True,False
160552,8:0-17:0,,,,,02130,,8:0-17:0,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Jamaica Plain,...,,False,False,False,False,False,False,False,False,False
160572,10:30-21:30,True,,False,,32803,,10:30-21:30,"{""dairy-free"": ""False"", ""gluten-free"": ""False""...",Orlando,...,,True,True,False,False,False,False,False,False,False


## Validate dataset properties

In [8]:
# Distinct 'stars' values present in the data, and the type of one of them.
star_values = business_df['stars'].unique()
print(star_values)
print(type(star_values[0]))

[4.  4.5 3.  3.5 5.  2.5 2.  1.  1.5]
<class 'numpy.float64'>


In [9]:
# Which Python/numpy types occur among the distinct 'review_count' values?
print({type(value) for value in business_df['review_count'].unique()})

{<class 'numpy.int64'>}


In [10]:
# Which types occur among the distinct 'name' values?
print({type(value) for value in business_df['name'].unique()})

{<class 'str'>}


In [33]:
#print(business_df['hours.Monday'].unique())
# Which types occur among the distinct 'city' values?
print({type(value) for value in business_df['city'].unique()})

{<class 'str'>}


In [12]:
# Which types occur among the distinct 'latitude' values?
print({type(value) for value in business_df['latitude'].unique()})

{<class 'numpy.float64'>}


In [13]:
# Which types occur among the distinct 'longitude' values?
print({type(value) for value in business_df['longitude'].unique()})

{<class 'numpy.float64'>}


In [14]:
# Which types occur among the distinct 'categories' values? (float here means a missing value)
print({type(value) for value in business_df['categories'].unique()})

{<class 'float'>, <class 'str'>}


In [15]:
# Inspect 'attributes.Ambience': print any float entries, the set of value types,
# and the first row's raw value.
ambience_values = business_df['attributes.Ambience'].unique()
for x in ambience_values:
    if not isinstance(x, float):
        continue
    print(x)
print({type(value) for value in ambience_values})
print(business_df['attributes.Ambience'][0])

nan
{<class 'float'>, <class 'str'>}
{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': True}


In [16]:
# Which types occur among the distinct 'name' values? (repeat of the earlier check)
print({type(value) for value in business_df['name'].unique()})

{<class 'str'>}


In [17]:
# Distinct 'RestaurantsPriceRange2' values, and the type of the last one listed.
price_range_values = business_df['attributes.RestaurantsPriceRange2'].unique()
print(price_range_values)
print(type(price_range_values[-1]))  # float or str

# note: not every business has a price range!

['2' '1' nan '3' '4' 'None' 1.0 2.0 3.0 4.0]
<class 'float'>


## Create instances

In [34]:
# Sanity check: the loaded ontology exposes its Business class through attribute access.
onto.Business

business.Business

In [37]:
# Classes can also be looked up by name with item access on the ontology.
onto['CajunRestaurant']

business.CajunRestaurant

In [18]:
# Loop through the prepared dataset and create one ontology individual per restaurant row.

# ---- loop-invariant configuration (hoisted out of the row loop) ----
DEBUG_ROW_LIMIT = 1000   # the loop stops as soon as more than this many rows were processed
SEARCH_RADIUS_KM = 100   # half-width of the bounding box stored on every business
EARTH_RADIUS_KM = 6371
r = SEARCH_RADIUS_KM / EARTH_RADIUS_KM   # search radius as an angle, in radians

# (dataset hours column, ontology property prefix) -> '<prefix>OpenTime' / '<prefix>CloseTime'
HOURS_COLUMNS = (
    ('hours.Monday', 'mon'),
    ('hours.Tuesday', 'tues'),
    ('hours.Wednesday', 'wed'),
    ('hours.Thursday', 'thurs'),
    ('hours.Friday', 'fri'),
    ('hours.Saturday', 'sat'),
    ('hours.Sunday', 'sun'),
)

# (category label in the dataset, ontology class name); matching classes are added to is_a
CATEGORY_CLASSES = (
    # American restaurants
    ('American (Traditional)', 'TraditionalAmericanRestaurant'),
    ('American (New)', 'NewAmericanRestaurant'),
    ('Cajun/Creole', 'CajunRestaurant'),
    ('Tex-Mex', 'TexMexRestaurant'),
    ('Southern', 'SouthernRestaurant'),
    ('Hawaiian', 'HawaiianRestaurant'),
    # Asian restaurants
    ('Pan Asian', 'PanAsianRestaurant'),
    ('Taiwanese', 'TaiwaneseRestaurant'),
    ('Hakka', 'HakkaRestaurant'),
    ('Singaporean', 'SingaporeanRestaurant'),
    ('Korean', 'KoreanRestaurant'),
    ('Japanese', 'JapaneseRestaurant'),
    ('Chinese', 'ChineseRestaurant'),
    ('Shanghainese', 'ShanghaineseRestaurant'),
    # Yelp spells this category with spaces; the unspaced label that was checked
    # before is kept as well so nothing that matched previously stops matching.
    ('Hong Kong Style Cafe', 'HongKongStyleCafe'),
    ('HongKongStyleCafe', 'HongKongStyleCafe'),
    ('Cantonese', 'CantoneseRestaurant'),
    ('Asian Fusion', 'AsianFusionRestaurant'),
)

# (ambience flag column produced by the dataset-preparation cell, ontology class name)
AMBIENCE_CLASSES = (
    ('casual', 'CasualAmbience'),
    ('classy', 'ClassyAmbience'),
    ('divey', 'DiveyAmbience'),
    ('hipster', 'HipsterAmbience'),
    ('intimate', 'IntimateAmbience'),
    ('romantic', 'RomanticAmbience'),
    ('touristy', 'TouristyAmbience'),
    ('trendy', 'TrendyAmbience'),
    ('upscale', 'UpscaleAmbience'),
)


def _append_unique(values, item):
    """Append `item` to `values` unless it is already there.

    Keeps is_a / specializesIn / hasAmbience free of duplicate entries, both within
    one row (vegetarian + vegan) and when this cell is executed more than once.
    """
    if item not in values:
        values.append(item)


i = 0
for _, row in df.iterrows():
    individual = onto.Business(row['business_id'])

    # fill 'characteristic' data properties
    individual.businessName = row['name']
    individual.stars = row['stars']
    individual.reviewCount = row['review_count']

    # fill 'location' data properties
    individual.city = row['city']
    individual.latitude = row['latitude']
    individual.longitude = row['longitude']
    ## min/max lat/long of the bounding box for places within SEARCH_RADIUS_KM of the business
    lat_rad = math.radians(row['latitude'])
    long_rad = math.radians(row['longitude'])
    individual.minLat = math.degrees(lat_rad - r)
    individual.maxLat = math.degrees(lat_rad + r)
    d_lon = math.asin(math.sin(r) / math.cos(lat_rad))
    individual.minLon = math.degrees(long_rad - d_lon)
    individual.maxLon = math.degrees(long_rad + d_lon)

    # fill in 'operations' data properties; times are encoded as hour + minute/100 (11:45 -> 11.45)
    for hours_column, day_prefix in HOURS_COLUMNS:
        hours = row[hours_column]
        if not isinstance(hours, str):
            continue  # no hours recorded for this day
        openTime, closeTime = hours.split('-')
        openHour, openMinute = [int(part) for part in openTime.split(':')]
        closeHour, closeMinute = [int(part) for part in closeTime.split(':')]
        setattr(individual, day_prefix + 'OpenTime', openHour + (openMinute * 0.01))
        # handle next day scenario: a close time earlier than the open time means the
        # business closes after midnight, so clamp to the end of the day. Hour AND minute
        # are compared, so a close in the same hour but an earlier minute is caught too.
        # NOTE(review): identical open/close times such as '0:0-0:0' are left as-is —
        # presumably they mean "open all day"; confirm before special-casing them.
        if (closeHour, closeMinute) < (openHour, openMinute):
            setattr(individual, day_prefix + 'CloseTime', 23.59)
        else:
            setattr(individual, day_prefix + 'CloseTime', closeHour + (closeMinute * 0.01))

    # make multi-class individual (assign relevant parent classes)
    ## categories + dietary restriction (specialization & restaurant type)
    categories = row['categories']
    if isinstance(categories, str):
        categories = categories.split(', ')

        for category_label, class_name in CATEGORY_CLASSES:
            if category_label in categories:
                _append_unique(individual.is_a, getattr(onto, class_name))

        # Specializations
        if 'Dumplings' in categories:
            _append_unique(individual.specializesIn, onto.Dumplings)
        if 'Dim Sum' in categories:
            _append_unique(individual.specializesIn, onto.Dimsum)

        if 'Vegetarian' in categories or row['vegetarian'] == 'True':
            _append_unique(individual.specializesIn, onto.Vegetarian)
        # NOTE(review): vegan businesses are mapped to onto.Vegetarian, exactly as before.
        # This may be a copy/paste slip for a dedicated vegan class — confirm against the ontology.
        if 'Vegan' in categories or row['vegan'] == 'True':
            _append_unique(individual.specializesIn, onto.Vegetarian)

    ## ambience (flag columns hold the strings 'True' / 'False')
    for ambience_column, class_name in AMBIENCE_CLASSES:
        if row[ambience_column] == 'True':
            _append_unique(individual.hasAmbience, getattr(onto, class_name))

    # debug
    i += 1
    if i > DEBUG_ROW_LIMIT:
        break # full run takes a long time... save for later

## Save

In [19]:
#onto.save(file = '../ontology/businessWithInstances.owl', format = 'rdfxml')

## Querying
Goal: input -> best restaurants
Generate [SPARQL queries](https://owlready2.readthedocs.io/en/latest/sparql.html)

[SPARQL documentation](https://www.w3.org/TR/sparql11-query/)

Input format:

`
{
    "lat": double,
    "lon": double,
    "day": str ('mon'...'sun'),
    "time": double (format: hour.minute e.g. 11.45),
    "categories": list of categories,
    "ambiences": list of Ambience classes,
    "minStars": double,
    "minReviewCount": int
}
`

### Querying Experimentation

In [20]:
# Property-path experiment: `rdfs:subClassOf*` follows zero or more subClassOf steps,
# so the match includes business:RomanticAmbience itself plus any of its subclasses.
list(default_world.sparql("""
    SELECT ?x
    WHERE {
        ?x rdfs:subClassOf* business:RomanticAmbience
    }
"""))

[[business.RomanticAmbience]]

In [21]:
# Select individuals ?x that have a type which is (a subclass of) both
# business:Restaurant and business:TaiwaneseRestaurant.
list(default_world.sparql("""
    SELECT ?x
    WHERE {
        ?x rdf:type ?type
        ?type rdfs:subClassOf* business:Restaurant
        ?type rdfs:subClassOf* business:TaiwaneseRestaurant
    }
"""))

[[business.2SbmgX5eHK4EMaIJmO1qbw],
 [business.s7k9cZiNLmA9NGYJcCnMbQ],
 [business.H1TOSeQyuK-edXGWIRgG5g],
 [business.P6u9VBwU20tkfEKIlmrOTA],
 [business.3Womhy8g-3J-VREWO31d9Q],
 [business.tVNzsMS5WsEZZ0AN7r9utw],
 [business.RPnSUIZMRS5T8KsSI1dwMA],
 [business.5hbOXgrvCYsD6m3HDdX4Ug],
 [business.O0dujTET71iNknXA9Kbwfw]]

In [22]:
# example template: a hand-written query combining every kind of constraint
# (category, bounding box around lat 49 / lon -123, Monday opening hours at 11.45,
# minimum stars / review count, and two required ambiences).
# The recorded result for this combination is an empty list.
list(default_world.sparql("""
    SELECT ?x ?businessName
    WHERE {
        ?x rdf:type ?type
        ?type rdfs:subClassOf* business:TaiwaneseRestaurant
        
        ?x business:businessName ?businessName
        
        ?x business:minLat ?minLat
        ?x business:maxLat ?maxLat
        ?x business:minLon ?minLon
        ?x business:maxLon ?maxLon
        FILTER (49 < ?maxLat && 49 > ?minLat && -123 < ?maxLon && -123 > ?minLon)
        
        ?x business:monOpenTime ?openTime
        ?x business:monCloseTime ?closeTime
        FILTER (11.45 > ?openTime && 11.45 < ?closeTime)
        
        ?x business:stars ?stars
        ?x business:reviewCount ?reviewCount
        FILTER (?stars > 2.0 && ?reviewCount > 0)
        
        ?x business:hasAmbience business:CasualAmbience
        ?x business:hasAmbience business:RomanticAmbience
    }
"""))

[]

### Querying Template

In [53]:
def build_restaurant_query(lat=None, lon=None, day=None, time=None,
                           categories=(), ambiences=(),
                           minStars=None, minReviewCount=None):
    """Assemble the SPARQL query text for a restaurant search.

    Every constraint is optional; a constraint whose input is None (or an empty
    list) simply contributes no lines to the query.

    Parameters
    ----------
    lat, lon : number or None
        Search position; both must be given for the bounding-box filter to be added.
    day : str or None
        Day prefix used in the ontology property names ('mon' ... 'sun').
    time : number or None
        Time of day encoded as hour.minute (e.g. 11.45); needs `day` as well.
    categories : iterable of str
        Ontology class names; each adds a `?type rdfs:subClassOf*` pattern.
        NOTE(review): all categories constrain the same ?type variable, so several
        categories narrow the result rather than widen it — confirm that is intended.
    ambiences : iterable of str
        Ambience class names; each adds a required `hasAmbience` pattern.
    minStars, minReviewCount : number or None
        Thresholds. NOTE(review): the filters use a strict '>' although the names
        say "min" — confirm whether '>=' is what is wanted.

    Returns
    -------
    str
        The query text, one pattern per tab-indented line.
    """
    lines = [
        "SELECT ?x",
        "WHERE {",
        "\t?x rdf:type ?type",
        "\t?x business:businessName ?businessName",
    ]
    if lat is not None and lon is not None:
        lines += [
            "\t?x business:minLat ?minLat",
            "\t?x business:maxLat ?maxLat",
            "\t?x business:minLon ?minLon",
            "\t?x business:maxLon ?maxLon",
            "\tFILTER (" + str(lat) + " < ?maxLat && " + str(lat) + " > ?minLat && "
            + str(lon) + " < ?maxLon && " + str(lon) + " > ?minLon)",
        ]
    if day is not None and time is not None:
        lines += [
            "\t?x business:" + day + "OpenTime ?openTime",
            "\t?x business:" + day + "CloseTime ?closeTime",
            "\tFILTER (" + str(time) + " > ?openTime && " + str(time) + " < ?closeTime)",
        ]
    if minStars is not None:
        lines += [
            "\t?x business:stars ?stars",
            "\tFILTER(?stars > " + str(minStars) + ")",
        ]
    if minReviewCount is not None:
        lines += [
            "\t?x business:reviewCount ?reviewCount",
            "\tFILTER(?reviewCount > " + str(minReviewCount) + ")",
        ]
    # idea, not implemented yet: categories that name a dish could map to a
    # `?x business:hasDish...` pattern instead of a class constraint
    for cat in categories:
        lines.append("\t?type rdfs:subClassOf* business:" + cat)
    for amb in ambiences:
        lines.append("\t?x business:hasAmbience business:" + amb)
    lines.append("}")
    return "\n".join(lines)


# lat = 42.0
# lon = -123.0
lat = None
lon = None
day = 'mon'
time = 11.45
categories = ['TaiwaneseRestaurant']
ambiences = []
minStars = 2.0
minReviewCount = 1

# json = parse(json that nicholas gives us)
# lat, lon ,day ... = json
# lat = json['lat']

query = build_restaurant_query(
    lat=lat, lon=lon, day=day, time=time,
    categories=categories, ambiences=ambiences,
    minStars=minStars, minReviewCount=minReviewCount,
)

print(query)
results = list(default_world.sparql(query))
print(results)

# Sketch of the planned relaxation loop (not implemented):
# while len(results) < 10:
#     # before re-running: modify the constraints (e.g. categories.append(children of ancestors))
#     # lower minStars
#     # lower minReviewCount
#     # etc.
#     # re-run the query and add the new results

SELECT ?x
WHERE {
	?x rdf:type ?type
	?x business:businessName ?businessName
	?x business:monOpenTime ?openTime
	?x business:monCloseTime ?closeTime
	FILTER (11.45 > ?openTime && 11.45 < ?closeTime)
	?x business:stars ?stars
	FILTER(?stars > 2.0)
	?x business:reviewCount ?reviewCount
	FILTER(?reviewCount > 1)
	?type rdfs:subClassOf* business:TaiwaneseRestaurant
}


[[business.H1TOSeQyuK-edXGWIRgG5g],
 [business.RPnSUIZMRS5T8KsSI1dwMA],
 [business.5hbOXgrvCYsD6m3HDdX4Ug],
 [business.O0dujTET71iNknXA9Kbwfw]]

In [None]:
# TODO: develop specific algorithms / strategies for PSM
# (e.g. what happens when user wants to find more results or there are no results at all)

In [49]:
# Direct parent classes of the target: the ancestors that list it among their own subclasses.
targetClass = onto.HakkaRestaurant
parents = [
    ancestor
    for ancestor in targetClass.ancestors()
    if targetClass in list(ancestor.subclasses())
]
print(parents)

[business.ChineseRestaurant, business.TaiwaneseRestaurant]


In [50]:
# For every direct parent found above, print that parent's subclasses.
for parent in parents:
    children = list(parent.subclasses())
    print(children)

[business.CantoneseRestaurant, business.HakkaRestaurant, business.HongKongStyleCafe, business.ShanghaineseRestaurant]
[business.HakkaRestaurant]


## Debugging / Scratch Work

In [28]:
# check inconsistent classes: ask the default world which classes it reports as inconsistent
# (the recorded output is an empty list).
# NOTE(review): presumably this is only informative after a reasoner has been run — confirm.
list(default_world.inconsistent_classes())

[]