# OneMap.gov.sg Rest API Geo Data Scraper
Introduction
- This is a multiprocessing thread-pool approach to scrape geo data (block number, road name, building name, address, postal code, latitude, longitude) from OneMap.gov.sg using user-defined search values.
- This scraper is designed to issue up to 250 searches per minute, the upper limit of the OneMap REST API.

In [None]:
# Import Packages
import os
import requests
import json
import pandas as pd
import math
import time
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

In [None]:
# MultiThread Process for collection of Geo Data via OneMap API

def geoget(url):
    """Fetch one OneMap search URL and return the first result's fields.

    Parameters
    ----------
    url : str
        Fully built search URL (see ``getLinks``-style URLs in this file).

    Returns
    -------
    tuple
        An 11-tuple ``(SEARCHVAL, BLK_NO, ROAD_NAME, BUILDING, ADDRESS,
        POSTAL, X, Y, LATITUDE, LONGITUDE, LONGTITUDE)`` taken from
        ``response['results'][0]``. If the request fails, times out, returns
        no results, or lacks any of the keys, all 11 entries are ``''`` so
        the caller can still line the row up with its input.
    """
    # NOTE(review): 'LONGTITUDE' is read as its own key in addition to
    # 'LONGITUDE'; presumably a legacy misspelled duplicate in the API
    # response -- confirm against the OneMap documentation.
    fields = (
        'SEARCHVAL', 'BLK_NO', 'ROAD_NAME', 'BUILDING', 'ADDRESS',
        'POSTAL', 'X', 'Y', 'LATITUDE', 'LONGITUDE', 'LONGTITUDE',
    )
    try:
        # A timeout keeps one stalled connection from blocking a worker
        # thread (and therefore the whole batch) indefinitely.
        first_hit = requests.get(url, timeout=30).json()['results'][0]
        return tuple(first_hit[name] for name in fields)
    except Exception:
        # Best-effort scrape: any per-URL failure becomes an empty row.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate so the run can still be interrupted.
        return ('',) * len(fields)

def getDir_index(dirPath):
    """Return the numeric segment indices of the files found in ``dirPath``.

    Segment files are named ``<filename>_<index>.<ext>``. The index is the
    integer after the last underscore of the name with its extension removed,
    so prefixes that themselves contain ``_`` or ``.`` are handled.

    Entries that do not follow this pattern (no underscore, non-numeric
    suffix, e.g. ``.DS_Store`` or ``notes.txt``) are ignored instead of
    raising.

    Parameters
    ----------
    dirPath : str
        Directory to scan (must exist).

    Returns
    -------
    list of int
        One index per matching entry, in ``os.listdir`` order.
    """
    dir_index = []
    for entry in os.listdir(dirPath):
        stem = os.path.splitext(entry)[0]
        _, separator, suffix = stem.rpartition('_')
        if not separator:
            continue  # no underscore at all: not a segment file
        try:
            dir_index.append(int(suffix))
        except ValueError:
            continue  # trailing part is not a number: not a segment file
    return dir_index

def getLinks(df, column1, column2=None, column3=None):
    """Build one OneMap search URL per row of ``df``.

    The search value for a row is the string form of the selected columns'
    values joined by single spaces, in the order column1, column2, column3.
    Columns passed as ``None`` are left out.

    Parameters
    ----------
    df : mapping of column label -> iterable (e.g. a pandas DataFrame)
        Source of the search values.
    column1 : hashable
        Label of the first (required) column.
    column2, column3 : hashable, optional
        Labels of further columns to append to the search value.

    Returns
    -------
    list of str
        One URL per row, in row order.

    Notes
    -----
    The search value is inserted into the URL as-is; it is not
    percent-encoded here.
    """
    prefix = 'https://developers.onemap.sg/commonapi/search?searchVal='
    suffix = '&returnGeom=Y&getAddrDetails=Y&pageNum=1'

    selected = [label for label in (column1, column2, column3) if label is not None]

    links = []
    for row_values in zip(*(df[label] for label in selected)):
        # Convert every value on its own before joining so that numeric
        # columns (e.g. integer block numbers) do not raise TypeError.
        search_value = ' '.join(str(value) for value in row_values)
        links.append(prefix + search_value + suffix)
    return links


def generator(df, links, dirPath, filename):
    """Scrape ``links`` in batches of 250 and write one CSV per batch.

    Batch ``i`` (1-based) covers rows ``(i-1)*250`` to ``i*250`` of ``df`` and
    of ``links``. A batch whose index is already reported by
    ``getDir_index(dirPath)`` is skipped, so an interrupted run can resume.
    Every processed batch is padded with a sleep so that it takes at least
    about 60.5 seconds, keeping the request rate at or below 250 per minute.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; each output CSV holds a slice of it plus the geo columns.
    links : list of str
        One search URL per row of ``df``, in the same order.
    dirPath : str
        Existing directory that receives the segment files.
    filename : str
        Prefix of the segment files, written as ``<filename>_<NNNN>.csv``.

    Returns
    -------
    None
    """
    batch_size = 250
    min_batch_seconds = 60.5
    geo_columns = ['SEARCHVAL', 'BLK_NO', 'ROAD_NAME', 'BUILDING', 'ADDRESS',
                   'POSTAL', 'X', 'Y', 'LATITUDE', 'LONGITUDE', 'LONGTITUDE']

    already_done = set(getDir_index(dirPath))
    total_batches = math.ceil(df.shape[0] / batch_size)

    for i in range(1, total_batches + 1):
        if i in already_done:
            continue

        start = time.perf_counter()
        lowerlimit = (i - 1) * batch_size
        upperlimit = lowerlimit + batch_size

        limited_links = links[lowerlimit:upperlimit]
        limited_df = df[lowerlimit:upperlimit].reset_index(drop=True)

        # Fetch the batch's URLs on 4 worker threads; always release the
        # pool, even if mapping raises.
        pool = ThreadPool(4)
        try:
            results = pool.map(geoget, limited_links)
        finally:
            pool.close()
            pool.join()

        # ``results`` is a list of 11-tuples in the same order as the links.
        data = pd.DataFrame(results, columns=geo_columns)
        limited_df = pd.concat([limited_df, data], axis=1)

        # os.path.join keeps the file inside dirPath on every platform; a
        # hard-coded backslash would create a stray file on POSIX systems.
        out_path = os.path.join(dirPath, '{}_{:04d}.csv'.format(filename, i))
        limited_df.to_csv(out_path, index=False)

        # Pad the batch to the minimum duration to respect the rate limit.
        elapsed = time.perf_counter() - start
        time.sleep(max(0, round(min_batch_seconds - elapsed, 0)))

        finish = time.perf_counter()
        print(f'Finished data range {i} for rows {lowerlimit} to {upperlimit} in {round(finish-start,2)} second(s)')
            
def main(df, dirPath, filename, column1, column2=None, column3=None):
    """Validate the inputs, then scrape geo data for ``df`` into ``dirPath``.

    The work is split into segments of 250 rows; segment ``i`` is considered
    complete when ``getDir_index(dirPath)`` reports index ``i``. Scraping is
    repeated until every segment ``1..ceil(rows/250)`` is present, or until a
    pass fails to reduce the number of missing segments (which would
    otherwise loop forever).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to look up.
    dirPath : str
        Output directory; created if it does not exist.
    filename : str
        Prefix for the segment CSV files.
    column1 : hashable
        Label of the first column used to build the search value.
    column2, column3 : hashable, optional
        Labels of additional columns for the search value.

    Returns
    -------
    None
        Problems are reported with ``print`` and an early return.
    """
    if not isinstance(df, pd.DataFrame):
        print('Input is not DataFrame')
        return

    # column2/column3 may both be None (unused); that is not a duplicate.
    if (column1 == column2
            or column1 == column3
            or (column3 is not None and column2 == column3)):
        print('Unique column is needed, cannot use duplicated column')
        return

    if not os.path.isdir(dirPath):
        # Reject a path that already exists as a non-directory, or that
        # looks like a file name because it carries an extension.
        if os.path.exists(dirPath) or os.path.splitext(dirPath)[1] != '':
            print('Given directory is a file and not a folder, please key in new directory')
            return
        os.makedirs(dirPath)

    links = getLinks(df, column1, column2, column3)

    tar_count = math.ceil(df.shape[0] / 250)
    required = set(range(1, tar_count + 1))

    # Compare by required indices rather than by file count, so leftover
    # numbered files from another run cannot make completion unreachable.
    missing = required - set(getDir_index(dirPath))
    while missing:
        generator(df, links, dirPath, filename)

        still_missing = required - set(getDir_index(dirPath))
        if len(still_missing) >= len(missing):
            # No progress in a full pass: stop instead of retrying forever.
            print(f'Stopped: {len(still_missing)} of {tar_count} geoData file segments are still missing in {dirPath}')
            return
        missing = still_missing

    print(f'All {tar_count} geoData file segments had being scraped successfully')

def getListOfFiles(dirName):
    """Return the full paths of all entries in ``dirName``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order. Sorting makes the
    result deterministic, so zero-padded segment files (``..._0001.csv``,
    ``..._0002.csv``, ...) come back in segment order.

    Parameters
    ----------
    dirName : str
        Directory to list (must exist).

    Returns
    -------
    list of str
        ``os.path.join(dirName, entry)`` for every entry in the directory.
    """
    return [os.path.join(dirName, entry) for entry in sorted(os.listdir(dirName))]

In [None]:
from pathlib import Path

# Base folder for the output: the "Documents" folder under the user's home.
userhome = os.path.join(Path.home(),'Documents')

# The sample data used would be from https://data.gov.sg/dataset/resale-flat-prices 

# Import data into pandas dataframe
# (the CSV is read from the current working directory, at module level)
df_flats = pd.read_csv('resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv')
# Keep only the first 1000 rows as the working sample.
df_flats = df_flats[0:1000].copy()

# Run main() to collect geo data from the OneMap.gov.sg REST API
if __name__ == "__main__":
    df=df_flats
    dirPath=os.path.join(userhome,'OneMap_API_GeoData')
    filename='GeoData'
    # The search value for each row is built from 'block' and 'street_name'.
    column1='block'
    column2='street_name'
    column3=None
    main(df,dirPath,filename,column1,column2,column3)
    
    # Loading up the collected GeoData and re-merge them into a single dataframe
    # NOTE: the loop below rebinds `filename` and `df` (set above) to the
    # path and contents of each file read; every entry of dirPath is read.
    list_geo=[]
    list_of_geo_files = getListOfFiles(dirPath)
    for filename in list_of_geo_files:
        df = pd.read_csv(filename,index_col=None,header=0)
        list_geo.append(df)
    # Stack all segments row-wise into one frame with a fresh 0..n-1 index.
    df_flats = pd.concat(list_geo,axis=0,ignore_index=True)