In [1]:
import pandas as pd
import json
import os

# Opt in to pandas' future behaviour so that operations such as
# DataFrame.replace (used in clean_data) do not silently downcast dtypes.
pd.set_option('future.no_silent_downcasting', True)

# Directory the input CSV files are read from (e.g. cafe.csv, restaurant.csv).
INPUT_DIR = '../sg/'
# Directory the filtered per-category CSV files are written to.
OUTPUT_DIR = '../output/'

### Define functions

In [2]:
def clean_data(filename):
    '''Load one CSV file and return its cleaned Singapore-only rows.

    Steps, in order:
      1. read every column as text;
      2. replace U+202F (narrow no-break space) with a plain space and
         U+2013 (en dash) with a hyphen in all cells;
      3. keep the first row for each distinct 'address';
      4. keep rows whose 'complete_address' text contains "country":"SG";
      5. remove the unused columns via drop_columns().
    '''
    raw = pd.read_csv(filename, dtype=str)

    # Normalise the two unicode characters, one pass per character.
    normalized = raw.replace('\u202f', ' ', regex=True)
    normalized = normalized.replace('\u2013', '-', regex=True)

    # First occurrence of every address wins.
    deduped = normalized.drop_duplicates(subset=['address'])

    # The country check is a plain substring test on the address text.
    in_singapore = deduped['complete_address'].astype(str).str.contains('"country":"SG"', na=False)

    return drop_columns(deduped[in_singapore])

def drop_missing_open_hours(df):
    '''Filter out rows where open_hours is not available.

    A row is treated as having no opening hours when its 'open_hours' value is
    missing (NaN/None), blank, or the empty JSON object "{}" (surrounding
    whitespace is ignored).

    Parameters:
        df: dataframe with an 'open_hours' column.

    Returns:
        A new dataframe containing only the rows with opening hours; the
        input dataframe is not modified.

    Raises:
        KeyError: if the 'open_hours' column is absent.
    '''
    hours = df['open_hours']
    # Comparing astype(str) with "{}" alone would keep missing values, because
    # they stringify to "nan"/"None"; check for them explicitly instead.
    normalized = hours.astype(str).str.strip()
    has_hours = hours.notna() & ~normalized.isin(['', '{}'])
    return df[has_hours]

def drop_low_ratings(df, threshold=3):
    '''Filter out rows where ratings are below a certain threshold.

    Parameters:
        df: dataframe with a 'rating' column (text or numeric).
        threshold: minimum rating to keep (inclusive). Defaults to 3.

    Returns:
        A new dataframe with the rows whose rating is >= threshold. Rows whose
        rating is missing or cannot be parsed as a number are dropped.

    Raises:
        KeyError: if the 'rating' column is absent.
    '''
    # errors='coerce' turns blank / non-numeric ratings into NaN instead of
    # raising; NaN >= threshold is False, so those rows are filtered out.
    ratings = pd.to_numeric(df['rating'], errors='coerce')
    return df[ratings >= threshold]

def drop_columns(df):
    '''Drop unused columns from the dataframe.

    Parameters:
        df: dataframe to trim.

    Returns:
        A new dataframe without the unused columns. The input dataframe is
        left untouched, so the function is safe to call on a filtered slice
        and safe to re-run. Columns that are not present are ignored.
    '''
    unused_columns = [
        'input_id',
        'popular_times',
        'plus_code',
        'cid',
        'status',
        'reviews_link',
        'thumbnail',
        'timezone',
        'data_id',
        'reservations',
        'order_online',
        'menu',
        'owner',
        'user_reviews',
        'user_reviews_extended',
        'emails',
    ]
    # 'reviews_per_rating' and 'address' are deliberately not listed:
    # 'address' is the de-duplication key used by the other helpers.
    return df.drop(columns=unused_columns, errors='ignore')

def combine_dataframes(dfs):
    '''Stack several dataframes and keep one row per 'address'.

    The frames are concatenated in the given order with a fresh index, the
    number of repeated addresses is printed, and only the first row for each
    address is returned.
    '''
    stacked = pd.concat(dfs, ignore_index=True)

    # Count the rows that repeat an address already seen earlier in the stack.
    duplicate_count = stacked['address'].duplicated().sum()
    print(f"duplicate rows: {duplicate_count}")

    return stacked.drop_duplicates(subset=['address'])

def print_unique_categories(df, exclude_keyword=None):
    """
    Print unique categories from the dataframe.
    Optionally exclude categories containing one or more keywords.

    Parameters:
        df: dataframe with a 'category' column.
        exclude_keyword: a single keyword string, or an iterable of keyword
            strings. Categories containing any of them (case-insensitive
            substring match) are not printed. None or empty prints everything.
    """
    unique_categories = df['category'].unique()

    if exclude_keyword:
        # A bare string would otherwise be iterated character by character and
        # exclude every category sharing a single letter with the keyword.
        if isinstance(exclude_keyword, str):
            exclude_keyword = [exclude_keyword]
        excluded = [kw.lower() for kw in exclude_keyword]
        unique_categories = [
            category for category in unique_categories
            if not any(kw in str(category).lower() for kw in excluded)
        ]

    print("Unique categories:")
    for category in unique_categories:
        print(f" - {category}")

def to_csv(df, filename):
    '''Write the dataframe to `filename` as CSV without the index column.

    An empty dataframe is not written; a short message naming the file is
    printed instead.
    '''
    if df.empty:
        print(f"No data to save to {os.path.basename(filename)}.")
        return
    df.to_csv(filename, index=False)

### Food and Beverage

In [18]:
# Load each food-related input file in a fixed order. The order matters:
# combine_dataframes keeps the first row seen for every address.
food_sources = [
    'cafe', 'restaurant', 'michelin', 'streetfood', 'hawker',
    'bakery', 'pastry', 'patisserie', 'ice_cream', 'dessert',
]
food_frames = [clean_data(f"{INPUT_DIR}{source}.csv") for source in food_sources]
(cafe, restaurant, michelin, streetfood, hawker,
 bakery, pastry, patisserie, ice_cream, dessert) = food_frames

food = drop_missing_open_hours(combine_dataframes(food_frames))
food_category = food['category']

# Cafe: include "cafe", exclude cafeterias and children's venues.
cafe_keywords = ['cafe']
not_cafe_keywords = ['cafeteria', 'children']
cafe_included = food_category.str.contains('|'.join(cafe_keywords), case=False, na=False)
cafe_excluded = food_category.str.contains('|'.join(not_cafe_keywords), case=False, na=False)
cafe = food[cafe_included & ~cafe_excluded]
to_csv(cafe, f"{OUTPUT_DIR}cafe.csv")

# Takeaway
takeaway_keywords = ['takeaway', 'delivery']
takeaway_included = food_category.str.contains('|'.join(takeaway_keywords), case=False, na=False)
takeaways = food[takeaway_included]
to_csv(takeaways, f"{OUTPUT_DIR}takeaway.csv")

# Bakery: include bakery-like categories, exclude wholesalers and restaurants.
bakery_keywords = ['bakery', 'patisserie', 'pastry']
not_bakery_keywords = ['wholesale', 'restaurant']
bakery_included = food_category.str.contains('|'.join(bakery_keywords), case=False, na=False)
bakery_excluded = food_category.str.contains('|'.join(not_bakery_keywords), case=False, na=False)
bakery = food[bakery_included & ~bakery_excluded]
to_csv(bakery, f"{OUTPUT_DIR}bakery.csv")

# Hawker center
hawker_keywords = ['hawker']
hawker_included = food_category.str.contains('|'.join(hawker_keywords), case=False, na=False)
hawker = food[hawker_included]
to_csv(hawker, f"{OUTPUT_DIR}hawker_center.csv")

# Food court
food_court_keywords = ['food court']
food_court_included = food_category.str.contains('|'.join(food_court_keywords), case=False, na=False)
food_courts = food[food_court_included]
to_csv(food_courts, f"{OUTPUT_DIR}food_court.csv")

# Restaurant: 'deli' also matches "delivery", which the exclusion list removes.
restaurant_keywords = ['restaurant', 'diner', 'bistro', 'deli', 'steak', 'grill', 'crab', 'poke', 'gastropub', 'noodle', 'pasta', 'kebab', 'tapas']
not_restaurant_keywords = ['cafe', 'delivery', 'pancake']
restaurant_included = food_category.str.contains('|'.join(restaurant_keywords), case=False, na=False)
restaurant_excluded = food_category.str.contains('|'.join(not_restaurant_keywords), case=False, na=False)
restaurant = food[restaurant_included & ~restaurant_excluded]
to_csv(restaurant, f"{OUTPUT_DIR}restaurant.csv")

# Dessert: include sweet-food categories, exclude anything labelled restaurant.
dessert_keywords = ['pancake', 'ice cream', 'creperie', 'cake', 'dessert']
dessert_included = food_category.str.contains('|'.join(dessert_keywords), case=False, na=False)
dessert_excluded = food_category.str.contains('restaurant', case=False, na=False)
dessert = food[dessert_included & ~dessert_excluded]
to_csv(dessert, f"{OUTPUT_DIR}dessert.csv")

duplicate rows: 4608


In [19]:
# Coffee
coffee = clean_data(f"{INPUT_DIR}coffee.csv")
is_coffee = coffee['category'].str.contains('coffee', case=False, na=False)
coffee = coffee[is_coffee]
to_csv(coffee, f"{OUTPUT_DIR}coffee.csv")

# Bubble tea
bubble_tea = clean_data(f"{INPUT_DIR}bubble_tea.csv")
is_bubble_tea = bubble_tea['category'].str.contains('bubble tea', case=False, na=False)
bubble_tea = bubble_tea[is_bubble_tea]
to_csv(bubble_tea, f"{OUTPUT_DIR}bubble_tea.csv")

# Tea: include "tea", exclude bubble tea, wholesalers and manufacturers.
# NOTE(review): 'tea' is a substring match, so it also hits words such as
# "steak" - confirm no such categories occur in tea.csv.
tea = clean_data(f"{INPUT_DIR}tea.csv")
tea_category = tea['category']
not_tea_keywords = ['bubble tea', 'wholesale', 'manufacture']
tea_included = tea_category.str.contains('tea', case=False, na=False)
tea_excluded = tea_category.str.contains('|'.join(not_tea_keywords), case=False, na=False)
tea = tea[tea_included & ~tea_excluded]
to_csv(tea, f"{OUTPUT_DIR}tea.csv")

### Nightlife

In [20]:
# Bar
bar = clean_data(f"{INPUT_DIR}bar.csv")
is_bar = bar['category'].str.contains('bar', case=False, na=False)
bar = bar[is_bar]
to_csv(bar, f"{OUTPUT_DIR}bar.csv")

# Pub: gastropubs are excluded here (they are listed under the restaurant keywords).
pub = clean_data(f"{INPUT_DIR}pub.csv")
pub_category = pub['category']
is_pub = pub_category.str.contains('pub', case=False, na=False)
is_gastropub = pub_category.str.contains('gastropub', case=False, na=False)
pub = pub[is_pub & ~is_gastropub]
to_csv(pub, f"{OUTPUT_DIR}pub.csv")

# Club: exclude sauna, social, country and polo clubs.
club = clean_data(f"{INPUT_DIR}club.csv")
club_category = club['category']
not_club_keywords = ['sauna', 'social', 'country', 'polo']
is_club = club_category.str.contains('club', case=False, na=False)
is_other_club = club_category.str.contains('|'.join(not_club_keywords), case=False, na=False)
club = club[is_club & ~is_other_club]
to_csv(club, f"{OUTPUT_DIR}club.csv")

# Brewery
brewery = clean_data(f"{INPUT_DIR}brewery.csv")
brewery_keywords = ['brewery', 'beer garden']
is_brewery = brewery['category'].str.contains('|'.join(brewery_keywords), case=False, na=False)
brewery = brewery[is_brewery]
to_csv(brewery, f"{OUTPUT_DIR}brewery.csv")


### Religious sites

In [21]:
# One input file feeds all six outputs; rows without opening hours are removed first.
place_of_worship = drop_missing_open_hours(clean_data(f'{INPUT_DIR}place_of_worship.csv'))
worship_category = place_of_worship['category']

# Split by the kind of site named in the category text.
church = place_of_worship[worship_category.str.contains('church', case=False, na=False)]
temple = place_of_worship[worship_category.str.contains('temple', case=False, na=False)]
mosque = place_of_worship[worship_category.str.contains('mosque', case=False, na=False)]
cathedral = place_of_worship[worship_category.str.contains('cathedral', case=False, na=False)]
synagogue = place_of_worship[worship_category.str.contains('synagogue', case=False, na=False)]
gurudwara = place_of_worship[worship_category.str.contains('gurudwara', case=False, na=False)]

# Write the outputs in the same order as the filters above.
to_csv(church, f"{OUTPUT_DIR}church.csv")
to_csv(temple, f"{OUTPUT_DIR}temple.csv")
to_csv(mosque, f"{OUTPUT_DIR}mosque.csv")
to_csv(cathedral, f"{OUTPUT_DIR}cathedral.csv")
to_csv(synagogue, f"{OUTPUT_DIR}synagogue.csv")
to_csv(gurudwara, f"{OUTPUT_DIR}gurudwara.csv")

### Family Attractions

In [22]:
# zoo.csv also contains the aquarium rows, so there is no separate aquarium input.
zoo = clean_data(f"{INPUT_DIR}zoo.csv")
theme_park = clean_data(f"{INPUT_DIR}theme_park.csv")
wildlife = clean_data(f"{INPUT_DIR}wildlife.csv")
family_attraction = drop_missing_open_hours(combine_dataframes([zoo, theme_park, wildlife]))
attraction_category = family_attraction['category']

# Zoo
is_zoo = attraction_category.str.contains('zoo', case=False, na=False)
zoo = family_attraction[is_zoo]
to_csv(zoo, f"{OUTPUT_DIR}zoo.csv")

# Aquarium
is_aquarium = attraction_category.str.contains('aquarium', case=False, na=False)
aquarium = family_attraction[is_aquarium]
to_csv(aquarium, f"{OUTPUT_DIR}aquarium.csv")

# Theme park (theme, amusement and water parks)
theme_park_keywords = ['theme', 'amusement', 'water']
is_theme_park = attraction_category.str.contains('|'.join(theme_park_keywords), case=False, na=False)
theme_park = family_attraction[is_theme_park]
to_csv(theme_park, f"{OUTPUT_DIR}theme_park.csv")

# Wildlife park
is_wildlife = attraction_category.str.contains('wildlife', case=False, na=False)
wildlife_park = family_attraction[is_wildlife]
to_csv(wildlife_park, f"{OUTPUT_DIR}wildlife_park.csv")

duplicate rows: 8


### Shopping

In [23]:
# Load the shopping-related input files and merge them into one de-duplicated
# frame, keeping only rows that have opening hours.
mall = clean_data(f'{INPUT_DIR}mall.csv')
market = clean_data(f'{INPUT_DIR}market.csv')
store = clean_data(f'{INPUT_DIR}store.csv')
vintage = clean_data(f'{INPUT_DIR}vintage.csv')
souvenir_shop = clean_data(f'{INPUT_DIR}souvenir_shop.csv')
shopping = combine_dataframes([mall, market, store, vintage, souvenir_shop])
shopping = drop_missing_open_hours(shopping)

# Outputs go through the notebook's to_csv() helper, like every other section,
# so an empty result prints a message instead of writing a header-only file.
mall = shopping[shopping['category'].str.contains('mall', case=False, na=False)]
to_csv(mall, f'{OUTPUT_DIR}mall.csv')

souvenir_keywords = ['souvenir store', 'gift shop']
# NOTE(review): this filters the raw souvenir_shop frame rather than `shopping`,
# so these rows skip drop_missing_open_hours and the cross-file de-duplication.
# Confirm whether that is intended.
souvenir_shop = souvenir_shop[souvenir_shop['category'].str.contains('|'.join(souvenir_keywords), case=False, na=False)]
to_csv(souvenir_shop, f'{OUTPUT_DIR}souvenir_shop.csv')

clothing_store = shopping[shopping['category'].str.contains('clothing store', case=False, na=False)]
to_csv(clothing_store, f'{OUTPUT_DIR}clothing_store.csv')

market = shopping[shopping['category'].str.contains('market', case=False, na=False)]
to_csv(market, f'{OUTPUT_DIR}market.csv')

duplicate rows: 391


### Culture & History

In [24]:
# Load and merge the culture & history input files (no open-hours filter here).
museums = clean_data(f'{INPUT_DIR}museum.csv')
gallery = clean_data(f'{INPUT_DIR}gallery.csv')
historical_landmark = clean_data(f'{INPUT_DIR}historical_landmark.csv')
visitor_center = clean_data(f'{INPUT_DIR}visitor_center.csv')
culture_history = combine_dataframes([museums, gallery, historical_landmark, visitor_center])
culture_category = culture_history['category']

# Historical landmark
is_landmark = culture_category.str.contains('historical landmark', case=False, na=False)
historical_landmark = culture_history[is_landmark]
to_csv(historical_landmark, f'{OUTPUT_DIR}historical_landmark.csv')

# Museum
is_museum = culture_category.str.contains('museum', case=False, na=False)
museum = culture_history[is_museum]
to_csv(museum, f'{OUTPUT_DIR}museum.csv')

# Gallery
is_gallery = culture_category.str.contains('gallery', case=False, na=False)
gallery = culture_history[is_gallery]
to_csv(gallery, f'{OUTPUT_DIR}gallery.csv')

# Visitor center
is_visitor_center = culture_category.str.contains('visitor center', case=False, na=False)
visitor_center = culture_history[is_visitor_center]
to_csv(visitor_center, f'{OUTPUT_DIR}visitor_center.csv')

# Heritage: heritage museums are already covered by museum.csv, so leave them out.
is_heritage = culture_category.str.contains('heritage', case=False, na=False)
heritage = culture_history[is_heritage & ~is_museum]
to_csv(heritage, f'{OUTPUT_DIR}heritage.csv')


duplicate rows: 13


### Nature

In [25]:
# Load and merge the nature input files (no open-hours filter here).
beach = clean_data(f'{INPUT_DIR}beach.csv')
waterfall = clean_data(f'{INPUT_DIR}waterfall.csv')
park = clean_data(f'{INPUT_DIR}park.csv')
garden = clean_data(f'{INPUT_DIR}garden.csv')
nature = combine_dataframes([beach, waterfall, park, garden])
nature_category = nature['category']

# Beach
is_beach = nature_category.str.contains('beach', case=False, na=False)
beach = nature[is_beach]
to_csv(beach, f'{OUTPUT_DIR}beach.csv')

# Waterfall
is_waterfall = nature_category.str.contains('waterfall', case=False, na=False)
waterfall = nature[is_waterfall]
to_csv(waterfall, f'{OUTPUT_DIR}waterfall.csv')

# Park
is_park = nature_category.str.contains('park', case=False, na=False)
park = nature[is_park]
to_csv(park, f'{OUTPUT_DIR}park.csv')

# Garden
is_garden = nature_category.str.contains('garden', case=False, na=False)
garden = nature[is_garden]
to_csv(garden, f'{OUTPUT_DIR}garden.csv')


duplicate rows: 1142
No data to save to waterfall.csv.


### Sightseeing

In [26]:
# Load and merge the two sightseeing input files.
scenic_spot = clean_data(f'{INPUT_DIR}scenic_spot.csv')
tourist_attraction = clean_data(f'{INPUT_DIR}tourist_attraction.csv')
sightseeing = combine_dataframes([scenic_spot, tourist_attraction])

# Scenic spot: a scenic keyword in either the category or the description qualifies.
scenic_keywords = ['scenic', 'viewpoint', 'panorama', 'panoramic', 'observation', 'hilltop']
scenic_pattern = '|'.join(scenic_keywords)
scenic_in_category = sightseeing['category'].str.contains(scenic_pattern, case=False, na=False)
scenic_in_description = sightseeing['descriptions'].str.contains(scenic_pattern, case=False, na=False)
scenic_spot = sightseeing[scenic_in_category | scenic_in_description]
to_csv(scenic_spot, f'{OUTPUT_DIR}scenic_spot.csv')

# Tourist attraction.
# NOTE(review): filtered from the raw tourist_attraction frame, not from the
# merged `sightseeing` frame - confirm that is intended.
is_tourist_attraction = tourist_attraction['category'].str.contains('tourist attraction', case=False, na=False)
tourist_attraction = tourist_attraction[is_tourist_attraction]
to_csv(tourist_attraction, f'{OUTPUT_DIR}tourist_attraction.csv')

duplicate rows: 44


### Art & Craft

In [27]:
# Load the cleaned art & craft rows; they are filtered further below.
workshop = clean_data(f"{INPUT_DIR}art_craft.csv")

def drop_not_on_site_service(df):
    '''Keep only the rows whose 'about' text marks on-site services as enabled.

    The check is a case-insensitive search for the on-site services entry with
    "enabled":true; rows with a missing 'about' value are removed.
    '''
    on_site_marker = '"name":"on-site services","enabled":true'
    offers_on_site = df['about'].str.contains(on_site_marker, case=False, na=False)
    return df[offers_on_site]

# Keep workshops that have opening hours and offer on-site services.
workshop = drop_not_on_site_service(drop_missing_open_hours(workshop))

# Tip: call print_unique_categories(workshop) here to review the category list.
workshop_keywords = ['studio', 'handicraft', 'class', 'store', 'art', 'pottery', 'candle', 'leather']
workshop_pattern = '|'.join(workshop_keywords)
is_workshop = workshop['category'].str.contains(workshop_pattern, case=False, na=False)
workshop = workshop[is_workshop]
to_csv(workshop, f"{OUTPUT_DIR}art_craft.csv")

### Adventure

In [28]:
# Hiking areas: cleaned rows with opening hours whose category says "hiking area".
hiking_area = drop_missing_open_hours(clean_data(f"{INPUT_DIR}hiking_area.csv"))
is_hiking_area = hiking_area['category'].str.contains('hiking area', case=False, na=False)
hiking_area = hiking_area[is_hiking_area]
to_csv(hiking_area, f"{OUTPUT_DIR}hiking_area.csv")

### Relaxation

In [29]:
# Load and merge the spa and sauna input files, keeping rows with opening hours.
spa = clean_data(f"{INPUT_DIR}spa.csv")
sauna = clean_data(f"{INPUT_DIR}sauna.csv")
relaxation = drop_missing_open_hours(combine_dataframes([spa, sauna]))
relaxation_category = relaxation['category']

# Spa
is_spa = relaxation_category.str.contains('spa', case=False, na=False)
spa = relaxation[is_spa]
to_csv(spa, f"{OUTPUT_DIR}spa.csv")

# Sauna
is_sauna = relaxation_category.str.contains('sauna', case=False, na=False)
sauna = relaxation[is_sauna]
to_csv(sauna, f"{OUTPUT_DIR}sauna.csv")

duplicate rows: 31


### Accommodation

In [30]:
# Hotels are exported as cleaned, with no category or open-hours filtering.
hotel_input = f"{INPUT_DIR}hotel.csv"
hotel_output = f"{OUTPUT_DIR}hotel.csv"
hotel = clean_data(hotel_input)
to_csv(hotel, hotel_output)