In [123]:
from pydantic import BaseModel
import pandas as pd
import numpy as np
import yake
import openai

from typing import List
import csv

In [24]:
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')
openai.api_key = openai_api_key

In [13]:
class keyword(BaseModel):
    keyword: str
    relevant: int

In [15]:
df = pd.read_csv("./item_with_ratings.csv")

In [16]:
df.columns

Index(['title', 'rating', 'number_of_reviews', 'address', 'review',
       'hasDelivery', 'hasTakeaway', 'hasDineIn', 'category', 'status', 'url',
       'long', 'lat', 'country'],
      dtype='object')

In [44]:
df.head(5)

Unnamed: 0,title,rating,number_of_reviews,address,review,hasDelivery,hasTakeaway,hasDineIn,category,status,url,long,lat,country
0,Natter Coffee & Gelato,6,111,"124 Tg Pagar Rd, Singapore 088533",Good option for ice cream and coffee. Decor ki...,False,True,True,Coffee shop,Open,https://www.google.com/maps/place/Natter+Coffe...,1.277715,103.84349,Singapore
1,Kinki Restaurant + Bar,5,1220,"70, #02-02 Collyer Quay, Customs House, 049323",3/5 - You pay for the view but the food isnt a...,False,True,True,Japanese restaurant,Open,https://www.google.com/maps/place/Kinki+Restau...,1.282491,103.853475,Singapore
2,Pastaria Abate,5,1563,"86 Neil Rd, Singapore 088846","3/5 - Only good for the miles and even then, n...",True,True,True,Restaurant,Open,https://www.google.com/maps/place/Pastaria+Aba...,1.279604,103.842173,Singapore
3,Underdog Inn,7,42,"115 Amoy Street, Gemmill Ln, #01-03 Along, 069935",4/5 - Nice environment chill vibes. Not sure o...,True,True,True,Restaurant,Open,https://www.google.com/maps/place/Underdog+Inn...,1.282287,103.847485,Singapore
4,Sonny's Pizza,5,173,"17 Circular Rd, Singapore 049373",3/5 - No frills bodega style pizza with some c...,True,True,True,Pizza restaurant,Open,https://www.google.com/maps/place/Sonny's+Pizz...,1.28744,103.848941,Singapore


In [46]:
class Location(BaseModel):
    title:str
    rating:float
    number_of_reviews:int
    user_review:str
    categories:list[str]

In [50]:
locations = []

for row in df.itertuples():
    location = Location(
        title = row.title,
        rating = row.rating/2,
        number_of_reviews = row.number_of_reviews,
        user_review=row.review,
        categories = [row.category]
    )
    locations.append(location)

In [51]:
len(locations)

80

In [17]:
kw_extractor = yake.KeywordExtractor()
keywords = set([])

for row in df.itertuples():
    formatted_string = f"title: {row.title}, review: {row.review}, category: {row.category}"
    new_keywords = kw_extractor.extract_keywords(formatted_string)
    extracted_keywords = [x[0] for x in new_keywords]
    for extracted_keyword in extracted_keywords:
      keywords.add(extracted_keyword)

In [91]:
class Tags(BaseModel):
    categories: List[str]
        
def generate_categories(keywords):
    keywords_with_new_lines = '\n'.join(keywords)

    prompt = f"""
    Invent categories for some restaurants. You are about to be provided with a brief description of a restrautn from google maps.
    
    Here are some categories that we have. Only consider english categories.
    {keywords_with_new_lines}
    
    Create 30 short categories that semantically group these keywords.
    
    Think step by step
    """
    
    response = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo-16k",
        messages = [
            {'role':'user','content':prompt}
        ],
        functions = [
            {
                'name': 'output_categories',
                'description': 'The final list of categories',
                'parameters':Tags.schema()
            }
        ],
        function_call={
            'name':'output_categories'
        }
    )
    
    parsed_json = response.choices[0]["message"]["function_call"]["arguments"]
    categories = json.loads(parsed_json)["categories"]
    return categories

res = generate_categories(list(keywords))

In [102]:
exclude = []
categories = [i for i in res if i not in exclude]
categories

['Dessert shop',
 'Drinks',
 'Mexican restaurant',
 'Chinese restaurant',
 'Roast Meat',
 'Coffee shop',
 'Bakery',
 'Japanese restaurant',
 'Ice cream shop',
 'Falafel restaurant',
 'Italian restaurant',
 'Hawker Stall',
 'Bar',
 'Noodle shop',
 'French restaurant',
 'Café',
 'Seafood restaurant',
 'Pizza restaurant',
 'Korean restaurant',
 'Brunch spot',
 'Burger joint',
 'Vegetarian restaurant',
 'Indian restaurant',
 'Steakhouse',
 'Sushi restaurant',
 'Thai restaurant',
 'Vietnamese restaurant',
 'BBQ restaurant',
 'Chinese noodle restaurant',
 'Tapas restaurant',
 'Food court']

In [115]:
@retry(tries=3, delay=2)
def tag_restaurant(location:Location,categories:list[str]):
    joined_categories = '\n'.join(categories)
    prompt = f"""
    Given a Restaurant title and a candid user review, return a new list of 4 categories for the following restaurant
    
    You can use the following categories
    {joined_categories}
    
    Restaurant Title: {location.title},
    Existing Categories: [{','.join(location.categories)}]
    User Review: {location.user_review}
    
    You MUST only response with each chose category separated by a new line.
    You MUST not say anything after finishing.
    Your response will be used to tag the paper so don't say anything!
    
    The 4 Categories:
    """
    
    response = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo-16k",
        messages = [
            {'role':'user','content':prompt}
        ]
    )
    return response["choices"][0]["message"]["content"].split("\n")

Location(title='Aisyah Restaurant 西北香', rating=4.0, number_of_reviews=375, user_review='Lamb noodles were fantastic . They have a second floor with air conditioning which is the better option', categories=['Chinese restaurant'])

In [119]:
parsed_locations = []

for location in locations:
    new_categories = tag_restaurant(location,categories)
    new_location = location.copy()
    new_location.categories.extend(new_categories)
    
    unique_categories = list(
        set(
            [i.lower().strip() for i in new_location.categories]
        )
    )
    
    new_location.categories = [i.title() for i in unique_categories]
    
    parsed_locations.append(new_location)

In [124]:
def write_locations_to_csv(locations: List[Location], file_name: str):
    fieldnames = list(Location.schema()["properties"].keys())

    with open(file_name, "w", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for location in locations:
            writer.writerow(location.dict())

write_locations_to_csv(parsed_locations, "locations.csv")

In [125]:
df = pd.read_csv("./locations.csv")

In [138]:
df


Unnamed: 0,title,rating,number_of_reviews,user_review,categories
0,Natter Coffee & Gelato,3.0,111,Good option for ice cream and coffee. Decor ki...,"['Dessert Shop', 'Café', 'Coffee Shop', 'Ice C..."
1,Kinki Restaurant + Bar,2.5,1220,3/5 - You pay for the view but the food isnt a...,"['Japanese Restaurant', 'View', 'Bar', 'Price']"
2,Pastaria Abate,2.5,1563,"3/5 - Only good for the miles and even then, n...","['Café', 'Italian Restaurant', 'Bbq Restaurant..."
3,Underdog Inn,3.5,42,4/5 - Nice environment chill vibes. Not sure o...,"['Café', 'Bbq Restaurant', 'Food Court', 'Brun..."
4,Sonny's Pizza,2.5,173,3/5 - No frills bodega style pizza with some c...,"['Craft Beer', 'Casual Dining', 'Pizza Restaur..."
...,...,...,...,...,...
75,Swee Lee Social Club,2.5,250,In a music shop nice picolo,"['Japanese Restaurant', 'Coffee Shop', 'Bakery..."
76,Dewgather,2.5,372,Coffee is at best average but the ambience is ...,"['Café', 'Drinks', 'Coffee Shop', 'Bakery', 'R..."
77,Xin Mei Xiang Zheng Zong Lor Mee 新美香正宗卤面 Holland,3.0,111,,"['Noodle Shop', 'Food Court', 'Chinese Restaur..."
78,Monument Lifestyle at Duxton Road,4.0,227,I enjoyed their Picolo ( Latte with two shots ...,"['Café', 'Coffee Shop', 'Bakery', 'Brunch Spot..."


In [137]:
category_set = set()
parsed_categories = [[j.strip() for j in i[1:-1].replace("'","").split(",")] for i in df["categories"].to_list()]
for cat_list in parsed_categories:
    for cat in cat_list:
        category_set.add(cat)

print(category_set)

{'View', 'Bagel Shop', 'Pizza Restaurant', 'Bistro', 'Chocolate Shop', 'Spicy Food', 'Snail', 'Hawker Stall', 'Roast Meat', 'Coffee Store', 'Dessert Restaurant', 'Banh Mi', 'Interior Design', 'Chill Vibes', 'Drinks', 'Tapas Restaurant', 'Late Night Spot', 'Snack Bar', 'Breakfast Spot', 'Street Food Stall', 'Brewery', 'Average Restaurant', 'Lamb Shoulder', 'Sesame Themed Dessert Shop', 'Bbq Restaurant', 'French Restaurant', 'Outdoor Activities', 'Izakaya Restaurant', 'Italian Restaurant', 'Chill-Out Place', 'Duck Breast', 'Falafel Restaurant', 'Japanese Restaurant', 'Bar', 'Argentinian Restaurant', 'Vietnamese Restaurant', 'Used Motorcycle Dealer', 'Mapo Tofu Dish', 'Work-Friendly Café', 'Tea', 'Ice Cream Shop', 'Bone Marrow', 'Rice Dish', 'Restaurant', 'Value For Money', 'Chill Spot', 'Dessert Shop', 'Price', 'Empanada Restaurant', 'Craft Beer', 'Indonesian Restaurant', 'Korean Restaurant', 'Traditional Cuisine', 'Taiwanese Restaurant', 'Creperie', 'Sushi Restaurant', 'Oily', 'Mexican 