In [1]:
# !pip install owlready2

## Retrieve ontology

In [2]:
from owlready2 import *
import pandas as pd
import json



In [3]:
# file:// IRI of the business ontology that the rest of the notebook populates.
ontology_iri = "file:///Users/kevinlin/Documents/classes/cs270/final-project/cs270-final-project/ontology/business.owl"
# Parse the OWL file; `onto` is the handle later cells use to reach its classes.
onto = get_ontology(ontology_iri).load()

## Retrieve & prepare dataset

In [4]:
# Relative path of the pickled business table read into `business_df` by the next cell.
business_pkl = '../yelp_dataset/business.pkl'

In [5]:
# Load the business table from the pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
business_df = pd.read_pickle(business_pkl)

In [6]:
# Retrieve businesses whose 'categories' text mentions both 'Restaurants' and 'Food'.
# Rows without categories are dropped first so the .str accessor only sees strings.
categorized = business_df[business_df['categories'].notnull()]
is_restaurant = categorized['categories'].str.contains('Restaurants', regex=False)
is_food = categorized['categories'].str.contains('Food', regex=False)
df = categorized[is_restaurant & is_food]

# Dict literals substituted for businesses that carry no value for the attribute.
DEFAULT_DIETARY_RESTRICTIONS = "{'dairy-free': False, 'gluten-free': False, 'vegan': False, 'kosher': False, 'halal': False, 'soy-free': False, 'vegetarian': False}"
DEFAULT_AMBIENCE = "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}"


def expand_flag_column(frame, column, default_literal):
    """Expand a column of Python-dict literals into one flag column per key.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding `column`; it is not modified.
    column : str
        Name of a column whose entries are dict literals stored as text
        (e.g. "{'casual': True, 'classy': False}"), NaN, or the text 'None'.
    default_literal : str
        Dict literal used in place of NaN / 'None' entries.

    Returns
    -------
    pd.DataFrame
        A new frame in which `column` holds the JSON form of each entry and
        one extra column exists per dict key. Flag values are the strings
        'True' / 'False'; a nested None is treated as 'False'.

    Raises
    ------
    json.JSONDecodeError
        If an entry is still not valid JSON after normalization.
    ValueError
        If a dict key collides with an existing column name (raised by join).
    """
    # Missing values (NaN or the literal text 'None') fall back to the all-False default.
    literals = frame[column].fillna(default_literal).replace('None', default_literal)
    # Turn the Python literal into JSON text. regex=False keeps every pattern literal,
    # independent of the pandas version's default for str.replace.
    as_json = (
        literals
        .str.replace("'", '"', regex=False)
        .str.replace('False', '"False"', regex=False)
        .str.replace('True', '"True"', regex=False)
        .str.replace('None', '"False"', regex=False)
    )
    # Build all flag columns in one pass instead of one pd.Series per row.
    flags = pd.DataFrame([json.loads(text) for text in as_json], index=frame.index)
    return frame.assign(**{column: as_json}).join(flags)


# parse 'attributes.DietaryRestrictions'
df = expand_flag_column(df, 'attributes.DietaryRestrictions', DEFAULT_DIETARY_RESTRICTIONS)

# parse 'attributes.Ambience' attribute
df = expand_flag_column(df, 'attributes.Ambience', DEFAULT_AMBIENCE)

# ... add more as necessary

In [7]:
# Display every column of the prepared frame, including the flag columns joined in by the previous cell.
df.columns

Index([                      'hours.Wednesday',
              'attributes.RestaurantsDelivery',
                      'attributes.Open24Hours',
                      'attributes.DogsAllowed',
                        'attributes.CoatCheck',
                                 'postal_code',
                          'attributes.Smoking',
                              'hours.Thursday',
              'attributes.DietaryRestrictions',
                                        'city',
                               'hours.Tuesday',
         'attributes.RestaurantsGoodForGroups',
                             'attributes.BYOB',
                      'attributes.AgesAllowed',
                                'hours.Friday',
                                       'stars',
                                    'latitude',
                          'attributes.Alcohol',
           'attributes.RestaurantsPriceRange2',
          'attributes.RestaurantsTableService',
                'attributes.HairSpeciali

## Validate dataset properties

In [8]:
# Distinct 'stars' ratings, followed by the type of the first one
star_values = business_df['stars'].unique()
print(star_values)
print(type(star_values[0]))

[4.  4.5 3.  3.5 5.  2.5 2.  1.  1.5]
<class 'numpy.float64'>


In [9]:
# Distinct Python types found among the unique 'review_count' values
review_count_types = set(map(type, business_df['review_count'].unique()))
print(review_count_types)

{<class 'numpy.int64'>}


In [10]:
# Distinct Python types found among the unique 'name' values
name_types = set(map(type, business_df['name'].unique()))
print(name_types)

{<class 'str'>}


In [11]:
# Distinct Python types found among the unique 'city' values
city_types = set(map(type, business_df['city'].unique()))
print(city_types)

{<class 'str'>}


In [12]:
# Distinct Python types found among the unique 'latitude' values
latitude_types = set(map(type, business_df['latitude'].unique()))
print(latitude_types)

{<class 'numpy.float64'>}


In [13]:
# Distinct Python types found among the unique 'longitude' values
longitude_types = set(map(type, business_df['longitude'].unique()))
print(longitude_types)

{<class 'numpy.float64'>}


In [14]:
# Distinct Python types found among the unique 'categories' values
category_types = set(map(type, business_df['categories'].unique()))
print(category_types)

{<class 'float'>, <class 'str'>}


In [15]:
ambience_values = business_df['attributes.Ambience'].unique()

# Print the float entries among the unique values to see what the non-string cases are
for value in ambience_values:
    if isinstance(value, float):
        print(value)

# Distinct Python types, then the first row's raw value as a sample
print(set(map(type, ambience_values)))
print(business_df['attributes.Ambience'][0])

nan
{<class 'float'>, <class 'str'>}
{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': True}


In [16]:
# Distinct Python types found among the unique 'name' values
business_name_types = set(map(type, business_df['name'].unique()))
print(business_name_types)

{<class 'str'>}


In [17]:
# Distinct 'RestaurantsPriceRange2' values, followed by the type of the last one
price_ranges = business_df['attributes.RestaurantsPriceRange2'].unique()
print(price_ranges)
print(type(price_ranges[-1])) # float or str

# note: not every business has a price range!

['2' '1' nan '3' '4' 'None' 1.0 2.0 3.0 4.0]
<class 'float'>


## Create instances

In [18]:
# Loop through the prepared dataset and create one ontology individual per business.

# Debug cap on how many rows are turned into individuals (a full run takes a long time).
# Set to None to process every row of `df`.
DEBUG_ROW_LIMIT = 1001

# 'hours.<Day>' column -> prefix of the matching <prefix>OpenTime / <prefix>CloseTime property.
DAY_PROPERTY_PREFIXES = {
    'hours.Monday': 'mon',
    'hours.Tuesday': 'tues',
    'hours.Wednesday': 'wed',
    'hours.Thursday': 'thurs',
    'hours.Friday': 'fri',
    'hours.Saturday': 'sat',
    'hours.Sunday': 'sun',
}

# Category label -> name of the ontology class added to the individual's types.
CATEGORY_CLASSES = {
    # American restaurants
    'American (Traditional)': 'TraditionalAmericanRestaurant',
    'American (New)': 'NewAmericanRestaurant',
    'Cajun/Creole': 'CajunRestaurant',
    'Tex-Mex': 'TexMexRestaurant',
    'Southern': 'SouthernRestaurant',
    'Hawaiian': 'HawaiianRestaurant',
    # Asian restaurants
    'Pan Asian': 'PanAsianRestaurant',
    'Taiwanese': 'TaiwaneseRestaurant',
    'Hakka': 'HakkaRestaurant',
    'Singaporean': 'SingaporeanRestaurant',
    'Korean': 'KoreanRestaurant',
    'Japanese': 'JapaneseRestaurant',
    'Chinese': 'ChineseRestaurant',
    'Shanghainese': 'ShanghaineseRestaurant',
    # The unspaced label is the ontology class name; the spaced label is accepted too,
    # since category text is split on ', ' and may contain spaces within a label.
    # NOTE(review): confirm the exact label used by the dataset.
    'HongKongStyleCafe': 'HongKongStyleCafe',
    'Hong Kong Style Cafe': 'HongKongStyleCafe',
    'Cantonese': 'CantoneseRestaurant',
    'Asian Fusion': 'AsianFusionRestaurant',
}

# Category label -> name of the ontology entity appended to `specializesIn`.
CATEGORY_SPECIALIZATIONS = {
    'Dumplings': 'Dumplings',
    'Dim Sum': 'Dimsum',
}

# Ambience flag column -> name of the ontology entity appended to `hasAmbience`.
AMBIENCE_CLASSES = {
    'casual': 'CasualAmbience',
    'classy': 'ClassyAmbience',
    'divey': 'DiveyAmbience',
    'hipster': 'HipsterAmbience',
    'intimate': 'IntimateAmbience',
    'romantic': 'RomanticAmbience',
    'touristy': 'TouristyAmbience',
    'trendy': 'TrendyAmbience',
    'upscale': 'UpscaleAmbience',
}


def _clock_to_decimal(clock):
    """Convert an 'H:M' string to the hour.minute number stored in the ontology (hour + minute * 0.01)."""
    hour, minute = [int(part) for part in clock.split(':')]
    return hour + (minute * 0.01)


def _append_once(values, item):
    """Append `item` to `values` unless it is already present."""
    if item not in values:
        values.append(item)


rows = df if DEBUG_ROW_LIMIT is None else df.head(DEBUG_ROW_LIMIT)
for _, row in rows.iterrows():
    individual = onto.Business(row['business_id'], businessName=row['name'])

    # fill 'characteristic' data properties
    individual.businessName = row['name']
    individual.stars = row['stars']
    individual.reviewCount = row['review_count']

    # fill 'location' data properties
    individual.city = row['city']
    individual.latitude = row['latitude']
    individual.longitude = row['longitude']

    # fill in 'operations' data properties: opening / closing time for each weekday
    for hours_column, day_prefix in DAY_PROPERTY_PREFIXES.items():
        hours = row[hours_column]
        # Only string entries ('open-close') are parsed; anything else
        # (presumably NaN for missing hours) is skipped.
        if isinstance(hours, str):
            open_text, close_text = hours.split('-')
            open_value = _clock_to_decimal(open_text)
            close_value = _clock_to_decimal(close_text)
            setattr(individual, day_prefix + 'OpenTime', open_value)
            setattr(individual, day_prefix + 'CloseTime', close_value)

    # make multi-class individual (assign relevant parent classes)
    ## categories + dietary restriction (specialization & restaurant type)
    categories = row['categories']
    if isinstance(categories, str):
        categories = categories.split(', ')

        # restaurant types
        for label, class_name in CATEGORY_CLASSES.items():
            if label in categories:
                _append_once(individual.is_a, getattr(onto, class_name))

        # specializations
        for label, entity_name in CATEGORY_SPECIALIZATIONS.items():
            if label in categories:
                _append_once(individual.specializesIn, getattr(onto, entity_name))

        if 'Vegetarian' in categories or row['vegetarian'] == 'True':
            _append_once(individual.specializesIn, onto.Vegetarian)
        if 'Vegan' in categories or row['vegan'] == 'True':
            # Use the ontology's Vegan entity when it defines one; otherwise fall back
            # to Vegetarian, which is what vegan businesses were tagged with before.
            vegan_entity = onto.Vegan
            if vegan_entity is None:
                vegan_entity = onto.Vegetarian
            _append_once(individual.specializesIn, vegan_entity)

    ## ambience (flag columns hold the strings 'True' / 'False')
    for flag_column, entity_name in AMBIENCE_CLASSES.items():
        if row[flag_column] == 'True':
            _append_once(individual.hasAmbience, getattr(onto, entity_name))

## Save

In [21]:
# onto.save(file = '../ontology/businessWithInstances.owl', format = 'rdfxml')

## Querying
Goal: input -> best restaurants
Generate [SPARQL queries](https://owlready2.readthedocs.io/en/latest/sparql.html)

Input format:

{

    'latitude': double,
    'longitude': double,
    'city': str,
    'day': str (Monday-Sunday),
    'time': double (format: hour.minute),
    'categories': [str],
    'ambience': {
        'casual': True/False,
        'classy': True/False,
        'divey': True/False,
        'hipster': True/False,
        'intimate': True/False,
        'romantic': True/False,
        'touristy': True/False,
        'trendy': True/False,
        'upscale': True/False,
    },
    'minStars': int,
    'minReviewCount': int
}

In [32]:
# e.g. Get all Taiwanese restaurants
# Matches every ?x whose rdf:type is TaiwaneseRestaurant or a class reachable from it
# through zero or more rdfs:subClassOf steps.
# Each triple pattern is terminated with '.', the separator the SPARQL grammar
# requires between patterns in a group.
list(default_world.sparql("""
    SELECT ?x
    WHERE {
        ?x rdf:type ?type .
        ?type rdfs:subClassOf* business:TaiwaneseRestaurant .
    }
"""))

[[business.2SbmgX5eHK4EMaIJmO1qbw],
 [business.s7k9cZiNLmA9NGYJcCnMbQ],
 [business.H1TOSeQyuK-edXGWIRgG5g],
 [business.P6u9VBwU20tkfEKIlmrOTA],
 [business.3Womhy8g-3J-VREWO31d9Q],
 [business.tVNzsMS5WsEZZ0AN7r9utw],
 [business.RPnSUIZMRS5T8KsSI1dwMA],
 [business.5hbOXgrvCYsD6m3HDdX4Ug],
 [business.O0dujTET71iNknXA9Kbwfw]]

## Debugging / Scratch Work

In [None]:
# Scratch: look up one individual by name and show its `specializesIn` values.
onto['6iYb2HFDywm3zjuRg0shjw'].specializesIn

In [None]:
# Scratch: show the Sunday opening time stored on the same individual.
onto['6iYb2HFDywm3zjuRg0shjw'].sunOpenTime

In [None]:
# Scratch: show each business id next to its raw Monday hours entry.
df[['business_id', 'hours.Monday']]

In [None]:
# Scratch: print every raw 'hours.Monday' entry, one per line
for _, monday_hours in df['hours.Monday'].items():
    print(monday_hours)

In [None]:
# check inconsistent classes: list what the default world reports as inconsistent
# NOTE(review): presumably only meaningful after a reasoner has been run, which this notebook never does — confirm.
list(default_world.inconsistent_classes())

In [None]:
# TODO: create instances

# TODO: figure out how to create queries (i.e. given input, create necessary query to get results)

# TODO: develop specific algorithms / strategies for PSM (e.g. what happens when the user wants to find more results or there are no results at all)