This notebook mines the points of interest (POIs) of census tracts in the City of Boston using the Google Places API.

In [1]:
from collections import Counter as counter
import matplotlib.pyplot as plt
import csv
import numpy as np
import os
import scipy
import matplotlib.path as mplPath
import random
from matplotlib.ticker import NullFormatter, MaxNLocator
from numpy import linspace
import urllib2
import json
import unidecode
import re
import math
import sys
from six.moves import cPickle as pickle

CONSTANTS

In [2]:
# Ground distance covered by one degree of longitude / latitude.
# NOTE(review): the longitude factor presumably applies at Boston's
# latitude only -- confirm before reusing it for another city.
LONG_TO_METERS = 82850.73
LAT_TO_METERS = 111073.25
# Feet variants use the same factors scaled by 3.28 ft per meter.
LONG_TO_FEET = LONG_TO_METERS*3.28
LAT_TO_FEET = LAT_TO_METERS*3.28

# Google Places API access. The key is left empty in this file; supply one
# before running the mining cell.
API_KEY = ''
# Request prefixes: the query point / place ID and the remaining parameters
# are appended by the mining cell.
NEARBY_SEARCH_URL = ('https://maps.googleapis.com/maps/api/place/'
                     'nearbysearch/json?location=')
DETAILS_SEARCH_URL = ('https://maps.googleapis.com/maps/api/place/'
                      'details/json?placeid=')
# Value sent as the 'radius' parameter of every nearby search and embedded
# in the name of the output file.
RADIUS = 50

## 3.2 Building classes,
# residentialUse = ['A', 'R1', 'R2', 'R3', 'R4', 'RL', 'CM', 'EA', 'RC']
# workAndOtherUse = ['C', 'E']
# industrialUse = ['I']
# noUse = ['CL', 'CP']

# categoryCodes = residentialUse + industrialUse + workAndOtherUse + noUse

In [3]:
##  4. DATA
# Tracts: IDs of the census tracts to process
tracts = ['25025010702']
# Maps tract ID -> list of [x, y] boundary vertices read from the
# <coordinates> element of the tract's row.
# NOTE(review): the pairs are presumably [longitude, latitude] (KML order);
# confirm against the data file.
tractCoordinates = {}
with open('../Data/demoTract.csv', 'r') as f:
    f.readline()  # Skip the header row
    for line in f:
        tractId = line.split(',')[0]
        # Filter first: rows of tracts we do not need (including blank or
        # malformed rows) must not be able to abort the load.
        if tractId not in tracts:
            continue
        coordinates = line.split('<coordinates>')[1]
        coordinates = coordinates.split('</coordinates>')[0]
        # split() without an argument ignores repeated or trailing
        # whitespace, which split(' ') would turn into empty tokens and a
        # float('') error below.
        coordinates = [x.split(',') for x in coordinates.split()]
        # Only the first two components of every vertex are kept
        coordinates = [[float(x[0]), float(x[1])] for x in coordinates]
        tractCoordinates[tractId] = coordinates
            
            

# BUILDING DATA FOR EACH TRACT
# tractBuildings maps tract ID -> list of building records, each laid out as
# [buildingId, longs, lats, category, use, latitude, longitude, height]
tractBuildings = {}
for t in tracts:
    tractBuildings[t] = records = []
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted step of this pipeline.
    with open('../PickledData/TractBuildings/'+t+'.pickle', 'rb') as f:
        save = pickle.load(f)
    for buildingId, details in save.items():
        outline = details['geometry']
        records.append([
            buildingId,
            [vertex[0] for vertex in outline],  # first component of each vertex
            [vertex[1] for vertex in outline],  # second component of each vertex
            details['category'],
            details['use'],
            details['latitude'],
            details['longitude'],
            details['height'],
        ])


# Non-residential buildings
# For every tract keep only the building records whose category field
# (index 3 of the record) is 'wo' or 'i'.
nonResidentialTractBuildings = {}
for t in tracts:
    selected = []
    for building in tractBuildings[t]:
        if building[3] in ('wo', 'i'):
            selected.append(building)
    nonResidentialTractBuildings[t] = selected

Building-based queries for POIs

In [4]:
# Notebook sanity check: number of tracts that will be processed
len(tracts)

1

In [5]:
'''
    This script is used to mine POIs using the Google Places API
    The output is a markup file with the following characteristics
    (all need not be available for each POI)
    1. Building ID (for building-centric search)
    2. Place ID
    (place characteristics)
    3. Name
    4. Address
    5. Location (lat, lng)
    6. POI categories
    7. Opening hours

    The scan is performed at 2 levels-
    1. Individual buildings (non-residential)
    2. Tract grid (all over)

    Querying is a 2-step process:
    1. Determination of place IDs in proximity of the point of query
    2. Querying for the details of individual place IDs

    The results of the building-based scan below are written to
    ../Data/POIs_Tracts/<tractId>_pois_<RADIUS>.txt

    Plots for the grid generated for each tract are also stored
    in the ../Plots/ folder (the grid scan is not part of this cell)
'''

# POI MINING- POINTS CLOSE TO BUILDINGS
# This segment uses the Google Places API to determine POIs within
# 50 meters of the representative coordinates of non-residential buildings.
# These might be overlapping for multiple buildings and a more exhaustive
# determination/allocation is done in the steps to follow


def _fetchJson(url):
    # Download `url` and return the decoded JSON body. The response is closed
    # even when decoding fails so that connections are not leaked over the
    # many requests issued below.
    response = urllib2.urlopen(url)
    try:
        return json.load(response)
    finally:
        response.close()


def _writePlaceDetails(f, placeInfo):
    # Write the <name>, <address>, <location>, <types> and (when present)
    # <opening_hours> elements of one details response to the open file `f`.
    # Raises (e.g. KeyError / IndexError) as soon as an expected field is
    # missing; everything written up to that point stays in the file.
    result = placeInfo['result']
    name = unidecode.unidecode(result['name'])
    f.write('<name>'+name+'</name>\n')
    address = unidecode.unidecode(result['formatted_address'])
    f.write('<address>'+address+'</address>\n')
    coords = result['geometry']['location']
    # Stored as [lat, lng]
    location = [round(coords['lat'], 7), round(coords['lng'], 7)]
    f.write('<location>'+str(location)+'</location>\n')
    f.write('<types>\n')
    for placeType in result['types']:
        f.write('<type>'+unidecode.unidecode(placeType)+'</type>\n')
    f.write('</types>\n')
    if 'opening_hours' in result:
        f.write('<opening_hours>\n')
        # One text entry per day of the week is expected
        for day in range(7):
            f.write('<day_hours>'+unidecode.unidecode(
                    result['opening_hours']['weekday_text'][day])+'</day_hours>\n')
        f.write('</opening_hours>\n')


totalPlaces_buildings = 0    # Number of distinct place IDs seen
allPlacesIds_buildings = []  # The distinct place IDs, in order of discovery
notFound = []                # Details responses that could not be written fully
print('Building based mining')
for tractIndex, tractId in enumerate(tracts):
    print('Processing Tract  %s %s' % (tractIndex, tractId))
    with open('../Data/POIs_Tracts/'+tractId+'_pois_'+str(RADIUS)+'.txt', 'w') as f:
        for nrb in nonResidentialTractBuildings[tractId]:
            buildingId = nrb[0]
            latitude = nrb[5]
            longitude = nrb[6]
            # Proximity-based search
            data = _fetchJson(NEARBY_SEARCH_URL+str(latitude)+','+str(longitude)+
                              '&radius='+str(RADIUS)+'&key='+API_KEY)
            f.write('<building>\n<id>')
            f.write(buildingId+'</id>\n')

            # Iterate over places and perform a place-ID based search.
            # A response without a 'results' key is treated as "no places".
            for nearbyPlace in data.get('results', []):
                placeId = str(nearbyPlace['place_id'])
                if placeId not in allPlacesIds_buildings:
                    totalPlaces_buildings += 1
                    allPlacesIds_buildings.append(placeId)
                f.write('<place>\n<placeid>\n')
                f.write(placeId+'</placeid>')
                placeInfo = _fetchJson(DETAILS_SEARCH_URL+placeId+'&key='+API_KEY)
                try:
                    _writePlaceDetails(f, placeInfo)
                except Exception:
                    # Best effort: remember the incomplete response, but still
                    # close the element. The leading newline guarantees that
                    # '</place>' sits on a line of its own even when nothing
                    # was written after '</placeid>'; an unclosed <place>
                    # would otherwise swallow the next record when the file
                    # is parsed.
                    notFound.append(placeInfo)
                    f.write('\n</place>\n')
                else:
                    f.write('</place>\n')
            f.write('</building>\n')

print('Total Places =  %s' % totalPlaces_buildings)
print('Instances of incomplete information =  %s' % len(notFound))

Building based mining
Processing Tract  0 25025010702
Total Places =  0
Instances of incomplete information =  0


In [6]:
# The following set of functions are used to parse the timestamps
# obtained from calls to Google Places

# Function to convert Unicode format time to military time
def meridianAdditive (meridian, hour):
    """Return the hours to add to a 12-hour clock reading to make it 24-hour.

    meridian -- 'AM' or 'PM'
    hour     -- the hour part of the reading. Callers pass the first two
                characters of the time string (e.g. '12', '10' or '9:'),
                so a trailing ':' is ignored; an int is accepted as well.

    Returns 12 for 12 AM (so that it becomes 24), 0 for every other AM hour,
    0 for 12 PM and 12 for every other PM hour. For any other meridian a
    message is printed and None is returned.
    """
    # Normalise first: the hour arrives as text, so comparing it with the
    # integer 12 would never match and 12 PM would wrongly become 24.
    isTwelve = str(hour).rstrip(':').strip() == '12'
    if meridian=='AM':
        # Since simulation is for 24 hours, additive for other AM times
        # should be 0 (if the simulation was longer, then for opening times
        # like 10AM-2AM, the additive for the closing time would be 24)
        return 12 if isTwelve else 0
    elif meridian=='PM':
        # 12PM remains 12; all other PM times have 12 added to them
        return 0 if isTwelve else 12
    else:
        # Never occurs
        print('Unidentified Meridian :  %s' % (meridian,))

# Convert HH:MM to decimal
def timeToDecimal(timeString):
    """Return the 'H:MM' / 'HH:MM' string as a float number of hours.

    Raises ValueError when the string is not two numbers joined by ':'.
    """
    hourText, minuteText = timeString.split(':')
    wholeHours = float(hourText)
    fractionOfHour = float(minuteText)/60
    return wholeHours+fractionOfHour

# The getHours function is used to determine hours of operation from the
# query string returned from Google Places API
# the first AM/PM might be skipped if they are the same for opening/closing
# Examples of hours:
# Eg. '2:30 - 11:30 PM' # len 15
# Eg. '2:30 - 4:30 PM' # len 14
# Eg. 'Open 24 hours' # len 13
# Eg. 'Closed' # len 6
# Eg. '10:30 AM - 11:30 PM' # len 19
# Eg. '9:30 AM - 11:30 PM' # len 18
# Eg. '2:30 PM - 5:30 PM' # len 17
# Eg. '6:30 AM - 2:30 PM, 5:30 - 10:30 PM' # len 34
# Eg. '6:30 AM - 2:30 PM, 5:30 PM - 10:30 PM' # len 37
# Eg. '6:30 AM - 2:30 PM, 10:30 PM - 11:30 PM' # len 38
# Eg. '10:30 AM - 2:30 PM, 10:30 PM - 11:30 PM' # len 39
# Eg. '10:30 AM - 11:30 PM, 10:30 PM - 11:30 PM' # len 40
# Eg. '6:30 AM - 2:30 PM, 5:30 - 8:30 PM' # len 33

# Distribution of lengths (length, frequency):
# (33, 8)
# (34, 7)
# (37, 1)
# (6, 402)
# (13, 70)
# (14, 6)
# (15, 81)
# (17, 1050)
# (18, 1065)
# (19, 75)

def _toInterval(additives, clockHours):
    # Combine the meridian additives with the decimal clock readings into one
    # [opening, closing] interval and repair intervals that wrap around
    # midnight.
    interval = [offset+reading for offset, reading in zip(additives, clockHours)]
    if interval[0]>interval[1]:
        if interval[0]>=24:
            # Opening at 12:xx AM is encoded as 24.xx; fold it back to the
            # start of the day (previously only exactly 24 was handled)
            interval[0] -= 24
        elif interval[1]<5:
            # Closing in the small hours belongs to the next day
            interval[1] += 24
    return interval


def getHours(hoursString):
    """Parse one day's opening-hours text into a list of intervals.

    hoursString -- text such as '9:30 AM - 11:30 PM', '2:30 - 4:30 PM',
                   'Open 24 hours', 'Closed' or several shifts joined by ', '.
                   The format is recognised by the length of the string.

    Returns a list of [opening, closing] pairs in decimal hours; the list is
    empty for 'Closed' and for unrecognised strings (which are printed).
    """
    hours = []  # An array of arrays with each element giving an interval
    # if string length is 17, 18 or 19 it is plain timestamps with AM/PM mentioned
    if len(hoursString) in [17, 18, 19]:
        parts = hoursString.split(' - ')
        meridians = [part[-2:] for part in parts]
        additives = [meridianAdditive(meridians[i], parts[i][:2]) for i in range(2)]
        clockHours = [timeToDecimal(part[:-3]) for part in parts]
        hours.append(_toInterval(additives, clockHours))
    elif len(hoursString)==6:
        # indicates 'Closed'- no hours
        pass
    elif len(hoursString) in [14, 15, 16]:
        # Only the closing time carries AM/PM; it applies to both readings
        meridian = hoursString[-2:]
        parts = hoursString.split(' - ')
        additives = [meridianAdditive(meridian, parts[i][:2]) for i in range(2)]
        clockHours = [timeToDecimal(parts[0]), timeToDecimal(parts[1][:-3])]
        hours.append(_toInterval(additives, clockHours))
    elif len(hoursString)==13:
        if hoursString=='Open 24 hours':
            hours.append([0,24])
        else:
            print(hoursString)
    elif len(hoursString)>=30:
        segments = hoursString.split(', ')
        if len(segments)<2:
            # Without a ', ' separator the recursion below would be called
            # with the same string again and never terminate
            print('Unidentified hoursString:'+hoursString+'-'+str(len(hoursString)))
        else:
            # Recursive calls, one per shift. Every shift is kept (not just
            # the first two) and a shift that yields no interval is skipped
            # instead of raising IndexError.
            for segment in segments:
                hours.extend(getHours(segment))
    else:
        print('Unidentified hoursString:'+hoursString+'-'+str(len(hoursString)))
    return hours  # This is a list of lists

In [7]:
# Day names accepted as keys of a place's 'opening_hours' dict
_DAY_NAMES = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
              'Friday', 'Saturday', 'Sunday')


def _takeTag(text, tag):
    # Return (content, remainder) for the first <tag>...</tag> element in
    # `text`, or (None, text) when the element is absent. Looking for the
    # actual tags (rather than the bare word) keeps a field value that merely
    # contains e.g. 'name' from being mistaken for the element itself.
    openTag = '<'+tag+'>'
    closeTag = '</'+tag+'>'
    start = text.find(openTag)
    if start==-1:
        return None, text
    end = text.find(closeTag, start)
    if end==-1:
        return None, text
    return text[start+len(openTag):end], text[end+len(closeTag):]


def _parseOpeningHours(hoursMarkup):
    # Turn the concatenated <day_hours>Day: hours</day_hours> elements into
    # {day name: getHours(hours)}. Each day is handled on its own so that one
    # unparseable day does not discard the others.
    parsed = {}
    for entry in hoursMarkup.split('</day_hours>'):
        entry = entry.replace('<day_hours>', '').strip()
        day, separator, hoursText = entry.partition(': ')
        if not separator or day not in _DAY_NAMES:
            continue
        try:
            parsed[day] = getHours(hoursText.strip())
        except Exception:
            # Best effort, as before: skip hours that cannot be interpreted
            continue
    return parsed


def _parsePlaceDetails(poiInfo):
    # Parse the markup that follows </placeid> in one place record into a
    # dict with whichever of 'name', 'address', 'location', 'types' and
    # 'opening_hours' are present. Fields are consumed in the order in which
    # they are written to the file.
    details = {}
    name, poiInfo = _takeTag(poiInfo, 'name')
    if name is not None:
        details['name'] = name
    address, poiInfo = _takeTag(poiInfo, 'address')
    if address is not None:
        details['address'] = address
    locationText, poiInfo = _takeTag(poiInfo, 'location')
    if locationText is not None:
        # The file holds '[lat, lng]'; the stored value is [lng, lat]
        location = [float(x) for x in locationText[1:-1].split(',')]
        details['location'] = [location[1], location[0]]
    typesMarkup, poiInfo = _takeTag(poiInfo, 'types')
    if typesMarkup is not None:
        typesText = typesMarkup.replace('<type>', '').replace('</type>', ',').rstrip(',')
        # An empty <types> element gives an empty list instead of an error
        details['types'] = typesText.split(',') if typesText else []
    hoursMarkup, poiInfo = _takeTag(poiInfo, 'opening_hours')
    if hoursMarkup is not None:
        details['opening_hours'] = _parseOpeningHours(hoursMarkup)
    return details


def _storePlace(poiInfo, seenPlaceIds, tractPois):
    # Parse one accumulated place record and store it in `tractPois` under
    # its place ID. Records without a place ID and place IDs that were
    # already stored from this file are ignored (first occurrence wins).
    placeId, remainder = _takeTag(poiInfo, 'placeid')
    if placeId is None or placeId in seenPlaceIds:
        return
    seenPlaceIds.add(placeId)
    tractPois[placeId] = _parsePlaceDetails(remainder)


# TRACT_POIS maps tract ID -> {place ID -> dict of parsed place details}
TRACT_POIS = {}

for ANALYSIS_INDEX, TRACT in enumerate(tracts):
    # Same naming scheme as the mining step that wrote the file
    filenames = [TRACT+'_pois_'+str(RADIUS)]
    TRACT_POIS[TRACT] = {}
    print(filenames)

    for filename in filenames:
        with open('../Data/POIs_Tracts/'+filename+'.txt', 'r') as f:
            # poiInfo is stored from the string '<place>'
            # to the string '</place>'. Then when '</place>' occurs
            # the poiInfo is parsed and stored in a dict
            poiInfo = ''
            seenPlaceIds = set()
            writeStatus = 0
            for line in f:
                # strip gets rid of any whitespace
                line = line.strip()
                if line=='<place>':
                    if writeStatus and poiInfo:
                        # The previous record was never closed (its details
                        # could not be written); store what there is rather
                        # than letting it bleed into this record
                        _storePlace(poiInfo, seenPlaceIds, TRACT_POIS[TRACT])
                    poiInfo = ''
                    writeStatus = 1
                elif line=='</place>':
                    _storePlace(poiInfo, seenPlaceIds, TRACT_POIS[TRACT])
                    poiInfo = ''
                    writeStatus = 0
                elif writeStatus:
                    poiInfo += line
            if writeStatus and poiInfo:
                # Unterminated record at the end of the file
                _storePlace(poiInfo, seenPlaceIds, TRACT_POIS[TRACT])

['25025010702_pois_50']


In [8]:
# Persist the parsed POIs of every tract to its own pickle file.
for TRACT in TRACT_POIS:
    # The with-block closes the file even if pickling raises
    with open('../PickledData/TractPOIs/'+TRACT+'.pickle', 'wb') as f:
        pickle.dump(TRACT_POIS[TRACT], f, pickle.HIGHEST_PROTOCOL)