In [35]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [44]:
def _percentage(count, total):
    """Return ``count`` as a percentage of ``total``, or 0.0 when ``total`` is 0."""
    if total == 0:
        # An empty dataset has nothing to be complete; avoid dividing by zero.
        return 0.0
    return 100 * count / total


def calculate_completeness(data, dataset_name):
    """
    Print a completeness report for the POI attributes of one dataset.

    For each of latitude, longitude, name, place type, address and tags the
    report shows the number of populated rows and that number as a
    percentage of all rows in ``data``.

    Names and addresses equal to the exact string 'Singapore' are counted
    separately and excluded from the completeness figure (presumably because
    such a value carries no POI-specific information -- confirm with the
    data owners).

    A row counts as tagged when at least one column whose name contains
    'properties.tags.' is non-null. When no such column exists the report
    prints 'Completeness of tags: 0 (0%)'.

    An empty ``data`` frame is reported with 0.0% for every attribute
    instead of raising ZeroDivisionError.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'geometry.lat', 'geometry.lng',
        'properties.name', 'properties.place_type' and 'properties.address'.
    dataset_name : str
        Label printed in the report header.

    Returns
    -------
    None
        The report is written to stdout.
    """
    total = len(data)

    def report(label, count):
        # Single place that defines the 'count (percent%)' line format.
        print('Completeness of {}: {} ({}%)'.format(label, count,
                                                    _percentage(count, total)))

    print('Dataset: {}'.format(dataset_name))
    print('Number of POIs: {}'.format(total))
    report('lat', data['geometry.lat'].count())
    report('lng', data['geometry.lng'].count())

    num_name_sg = len(data[data['properties.name'] == 'Singapore'])
    print('Number of name with Singapore: {}'.format(num_name_sg))
    report('name', data['properties.name'].count() - num_name_sg)

    report('place type', data['properties.place_type'].count())

    num_address_sg = len(data[data['properties.address'] == 'Singapore'])
    print('Number of address with Singapore: {}'.format(num_address_sg))
    report('address', data['properties.address'].count() - num_address_sg)

    tag_cols = [col for col in data.columns if 'properties.tags.' in col]
    if len(tag_cols) == 0:
        print('Completeness of tags: 0 (0%)')
    else:
        # Keep rows that have at least one non-null tag column.
        non_nan_tags = data[tag_cols].dropna(axis='index', how='all')
        report('tags', len(non_nan_tags))
    print()
    
def within_bound(lat, lng, country_shp):
    """
    Check if a POI with latitude and longitude pair falls within the study area.

    The point is tested against every geometry in ``country_shp`` rather than
    a fixed number of rows, so the study area may consist of any number of
    polygons and its index labels do not matter.

    Parameters
    ----------
    lat, lng : float
        Coordinates of the POI. They must be in the same coordinate
        reference system as ``country_shp``.
    country_shp : geopandas.GeoDataFrame
        Study area; its 'geometry' column holds the shapes to test against.

    Returns
    -------
    bool
        True if the point lies within at least one geometry, False otherwise
        (including when ``country_shp`` has no rows).
    """
    # shapely's Point takes (x, y), so longitude is passed first.
    poi = Point(lng, lat)
    return any(geom is not None and poi.within(geom)
               for geom in country_shp['geometry'])


def remove_outofbound(data, country_shp):
    """
    Return the POIs of ``data`` that fall within the study area.

    Rows are visited by position, so the result is correct for any index on
    ``data`` (not only the default 0..n-1 index).

    Parameters
    ----------
    data : pandas.DataFrame
        POIs with 'geometry.lat' and 'geometry.lng' columns.
    country_shp : geopandas.GeoDataFrame
        Study area passed through to ``within_bound``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the in-bound rows, in their original order,
        with a fresh 0..n-1 index. ``data`` itself is not modified.
    """
    coords = zip(data['geometry.lat'], data['geometry.lng'])
    retained_positions = [pos for pos, (lat, lng) in enumerate(coords)
                          if within_bound(lat, lng, country_shp)]
    # Positions (not labels) were collected, so select with iloc.
    return data.iloc[retained_positions].reset_index(drop=True)
    

In [45]:
# load study area shapefile, reproject it to EPSG:4326 (lat/lng degrees)
# and keep only the Tampines planning area
country_shp = gpd.read_file('../data/master-plan-2014-planning-area-boundary-web/master-plan-2014-planning-area-boundary-web-shp/MP14_PLNG_AREA_WEB_PL.shp')
country_shp = country_shp.to_crs(epsg=4326)
country_shp = country_shp[country_shp['PLN_AREA_N'] == 'TAMPINES'].reset_index(drop=True)


def load_pois(path, study_area):
    """
    Load one POI spreadsheet and prepare it for the completeness report.

    Reads the Excel file at ``path``, removes POIs outside ``study_area``
    and then drops rows with a duplicated 'id', keeping the first
    occurrence. Returns a new DataFrame with a fresh 0..n-1 index.
    """
    pois = pd.read_excel(path)
    pois = remove_outofbound(pois, study_area)
    return pois.drop_duplicates(subset=['id'], ignore_index=True)


# load data from different sources, remove out of bound POIs and duplicates
conflated_data = load_pois('../data/conflated_data/conflated_data_tampines.xlsx', country_shp)
google_data = load_pois('../data/googlemap/google_data_tampines.xlsx', country_shp)
here_data = load_pois('../data/heremap/here_data_tampines.xlsx', country_shp)
onemap_data = load_pois('../data/onemap/onemap_data_tampines.xlsx', country_shp)
osm_data = load_pois('../data/osm/osm_data_tampines.xlsx', country_shp)
sla_data = load_pois('../data/sla/sla_data_tampines.xlsx', country_shp)

# calculate completeness
for dataset_name, pois in [('Conflated Data', conflated_data),
                           ('Google Map', google_data),
                           ('Here Map', here_data),
                           ('OneMap', onemap_data),
                           ('OSM', osm_data),
                           ('SLA', sla_data)]:
    calculate_completeness(pois, dataset_name)

Dataset: Conflated Data
Number of POIs: 8699
Completeness of lat: 8699 (100.0%)
Completeness of lng: 8699 (100.0%)
Number of name with Singapore: 1
Completeness of name: 8698 (99.98850442579607%)
Completeness of place type: 8699 (100.0%)
Number of address with Singapore: 533
Completeness of address: 8093 (93.03368203241752%)
Completeness of tags: 1353 (15.553511897919302%)

Dataset: Google Map
Number of POIs: 7835
Completeness of lat: 7835 (100.0%)
Completeness of lng: 7835 (100.0%)
Number of name with Singapore: 1
Completeness of name: 7834 (99.98723675813656%)
Completeness of place type: 7835 (100.0%)
Number of address with Singapore: 319
Completeness of address: 7425 (94.76707083599234%)
Completeness of tags: 0 (0%)

Dataset: Here Map
Number of POIs: 2187
Completeness of lat: 2187 (100.0%)
Completeness of lng: 2187 (100.0%)
Number of name with Singapore: 0
Completeness of name: 2187 (100.0%)
Completeness of place type: 2187 (100.0%)
Number of address with Singapore: 24
Completeness 