# README

In [1]:
README = '''
Julian Smoller ~ 2017.04.24

# Intro:
The purpose of this notebook is to parse the GPS coordinates 
and other data from a KML file, which is the type of file you 
get when you export a Google map, e.g. my map of NYC bars.

To generate a map, e.g. a map of Citibikes, I first created 
a Google Map with icons and polygons in different locations. 
For example, I created a Google Map with a polygon representing 
the perimeter of Manhattan. Then I exported that map to a KML file. 
Then I used the code below to parse the KML data into a dataframe 
and save it to a pickle file. Then the code in the other notebook 
will load the dataframe and draw polygons to generate a map. 

'''
print(README)


Julian Smoller ~ 2017.04.24

# Intro:
The purpose of this notebook is to parse the GPS coordinates 
and other data from a KML file, which is the type of file you 
get when you export a Google map, e.g. my map of NYC bars.

To generate a map, e.g. a map of Citibikes, I first created 
a Google Map with icons and polygons in different locations. 
For example, I created a Google Map with a polygon representing 
the perimeter of Manhattan. Then I exported that map to a KML file. 
Then I used the code below to parse the KML data into a dataframe 
and save it to a pickle file. Then the code in the other notebook 
will load the dataframe and draw polygons to generate a map. 




# Import libraries + change display settings

In [2]:
########################################################################
# Import libraries:
from PIL import Image, ImageDraw
from matplotlib.pyplot import imshow
import numpy as np
import pickle
import pandas as pd

########################################################################
# Change display settings:
pd.options.display.max_rows = 500
pd.options.display.max_columns = 100
%matplotlib inline

# Read text from file

In [4]:
def read_file(path):
    '''Given the path to a file, open and read the file, 
    and return the contents as a string.
    '''
    f = open(path,'rb')
    text = f.read()
    text = str(text)
    f.close()
    return text

path_in = 'data/input/manhattan_parks.kml'
text = read_file(path_in)
print(len(text))
print(text[:100])

202761
b"<?xml version='1.0' encoding='UTF-8'?>\n<kml xmlns='http://www.opengis.net/kml/2.2'>\n\t<Document>


# Parse places from KML file

In [5]:
def parse_elements(text, element):
    '''
    Given the text of a KML file, parse the text beteween the start and end
    tags of a specific element, e.g. Placemark; returns a list for each instance of 
    given element, e.g. each Placemark
    '''
    start_tag = '<'+element
    end_tag = '</'+element
    # Split the text wherever a start tag occurs
    elements = text.split(start_tag)
    # If we didn't find any start tags, return an empty list
    if len(elements)<1:
        return elements
    else:
        # Chop off the first item, which contains text before the first start tag
        elements = elements[1:]
        # For every element... 
        for i, e in enumerate(elements):
            # Trim text after end tag
            end_pos = e.find(end_tag)
            if end_pos>=0:
                elements[i] = e[:end_pos]
        return elements
    
def parse_coordinates(text):
    '''
    Given the text for a KML "coordinates" element, parse the longitude and latitude
    Input: a single string containing coordinate text parsed from a "Placemark" object
    Output: a dictionary of longitude and latitude arrays
    '''
    gps = {'longitude':[],'latitude':[]}
    # Split the text into chunks wherever there is a space
    chunks = text.split(' ')
    for chunk in chunks:
        # Split each chunk into its constituent numbers wherever there is a comma
        nums = chunk.split(',')
        # Add the relevant numbers to the appropriate arrays for longitude and lattitude
        gps['longitude'].append(round(float(nums[0]),6))
        gps['latitude'].append(round(float(nums[1]),6))
    return gps
    
def parse_polygon_places_from_kml_file(path):
    '''
    Given the path to a KML file, parse each placemeark along with GPS coordinates. 
    Each placemark is assumed to be a polygon, with multiple GPS coordinates. 
    Input: path to KML file containing placemark elements
    Output: dataframe with one row per placemark, with columns for longitude and latitude
    Note: placemarks could also be lines, in which case you will want to concat, e.g. perimeter
    '''
    # Get text from KML file
    text = read_file(path)
    # Get a list of each placemark's text
    places = parse_elements(text, 'Placemark')
    # Prepare a dictionary of lists for parsed values
    d = {'name':[], 'coordinates':[], 'latitude':[], 'longitude':[]}
    # For each placemark, parse relevant values
    for p in places:
        # Parse the text for name and coordinates (as a string)
        d['name'].append(parse_elements(p,'name')[0][1:])
        coordinate_text = parse_elements(p, 'coordinates')[0][1:]
        d['coordinates'].append(coordinate_text)
        # Parse the longitude and latitude arrays
        gps = parse_coordinates(coordinate_text)
        d['latitude'].append(gps['latitude'])
        d['longitude'].append(gps['longitude'])
    df_places = pd.DataFrame(d)
    return df_places

path_in = 'data/input/manhattan_parks.kml'
places = parse_polygon_places_from_kml_file(path_in)
print(len(places))
places.head()

313


Unnamed: 0,coordinates,latitude,longitude,name
0,"-74.0188086,40.7081817,0.0 -74.018873,40.70800...","[40.708182, 40.708003, 40.7078, 40.7071, 40.70...","[-74.018809, -74.018873, -74.018219, -74.01854...",Polygon 1
1,"-74.0133905,40.7056849,0.0 -74.0138251,40.7051...","[40.705685, 40.705156, 40.70468, 40.70468, 40....","[-74.01339, -74.013825, -74.014083, -74.013342...",Polygon 2
2,"-74.0101773,40.7032571,0.0 -74.010365,40.70321...","[40.703257, 40.703212, 40.702167, 40.702464, 4...","[-74.010177, -74.010365, -74.01, -74.009335, -...",Polygon 3
3,"-74.0089488,40.70493249999999,0.0 -74.0096515,...","[40.704932, 40.704758, 40.704583, 40.70488, 40...","[-74.008949, -74.009652, -74.009453, -74.00892...",Polygon 4
4,"-74.0072,40.7052457,0.0 -74.0073341,40.7051643...","[40.705246, 40.705164, 40.704302, 40.704432, 4...","[-74.0072, -74.007334, -74.00631, -74.006127, ...",Polygon 5


# Class: Place

In [6]:
class Place:
    '''
    Create a Place instance from the text of a Placemark parsed from a KML file.
    Input: the text of a Placemark object, parsed from KML file (using parse_elements)
    Dependencies: parse_elements and parse_coordinates
    Output: a Place instance with attributes like name, icon info, coordinates, etc.
    '''
    def __init__(self, text):
        self.text = text
        self.name = self.parse_name()
        self.description = self.parse_description()
        self.icon_string = self.parse_icon_string()
        self.icon_shape = self.parse_icon_shape()
        self.icon_color = self.parse_icon_color()
        self.coordinate_string = self.parse_coordinate_string()
        self.coordinates = parse_coordinates(self.coordinate_string)
    def parse_name(self):
        names = parse_elements(self.text, 'name')
        name = '' if len(names)<1 else names[0][1:]
        return name
    def parse_description(self):
        descriptions = parse_elements(self.text, 'description')
        description = '' if len(descriptions)<1 else descriptions[0][1:]
        if description[:9] == '<![CDATA[':
            description = description[9:]
        if description[-3:] == ']]>':
            description = description[:-3]
        return description
    def parse_icon_string(self):
        icon_strings = parse_elements(self.text, 'styleUrl')
        icon_string = '' if len(icon_strings)<1 else icon_strings[0][1:]
        return icon_string
    def parse_icon_shape(self):
        try:
            icon_shape = self.icon_string.split('-')[1]
        except:
            icon_shape = ''
        finally:
            return icon_shape
    def parse_icon_color(self):
        try:
            icon_color = self.icon_string.split('-')[2]
        except:
            icon_color = ''
        finally:
            return icon_color
    def parse_coordinate_string(self):
        coordinate_strings = parse_elements(self.text, 'coordinates')
        coordinate_string = '' if len(coordinate_strings)<1 else coordinate_strings[0][1:]
        return coordinate_string

# Parse point places from KML file

In [7]:
def parse_point_places_from_kml_file(path):
    '''
    Create a list of Places by parsing Placemarks from KML file.
    This function is similar to the prior function, but in this case we are only recording 
    one GPS coordinate (longitude,latitude) per place, as if the place were a single point.
    Input: path to a KML file
    Output: a dataframe of places, with columns for longitude, latitude, name, description, icon info, etc.
    '''
    # Read KML file
    text = read_file(path)
    # For every chunk of text containing a Placemark, create a Place instance
    places = []
    for p in parse_elements(text, 'Placemark'):
        places.append(Place(p))
    # Compile all of the Places into a dataframe
    d = {'name':[], 'description':[], 'icon_shape':[], 'icon_color':[], 
         'longitude':[], 'latitude':[]}
    for p in places:
        d['name'].append(p.name)
        d['description'].append(p.description)
        d['icon_shape'].append(p.icon_shape)
        d['icon_color'].append(p.icon_color)
        d['longitude'].append(p.coordinates['longitude'][0])
        d['latitude'].append(p.coordinates['latitude'][0])
    df = pd.DataFrame(d)
    df = df[['longitude', 'latitude', 'name', 'icon_shape', 'icon_color', 'description']]
    return df

path_in = 'data/input/bars.kml'
places = parse_point_places_from_kml_file(path_in)
print(len(places))
places.head()

709


Unnamed: 0,longitude,latitude,name,icon_shape,icon_color,description
0,-73.999488,40.7146,Le Baron NYC,962,FFFFFF,"Exclusive 3-level nightclub with a sultry, red..."
1,-74.008367,40.741029,Le Bain,503,DB4436,"There\'s a disco, bar, (seasonal) plunge pool ..."
2,-74.003636,40.742083,Electric Room,962,000000,The Electric Room lurks deep beneath the Dream...
3,-74.010638,40.739838,The Raven,962,FFFFFF,"Basement club with DJs, drinks & Edgar Allan P..."
4,-74.005886,40.739912,Provocateur,962,000000,Swanky Gansevoort Hotel nightclub lined with l...


# Parse + Save data

In [9]:
# Export GPS data for later use
# Citibikes
path_in = 'data/input/citibikes.kml'
citibikes = parse_point_places_from_kml_file(path_in)
path_out = 'data/output/citibikes.p'
pickle.dump(citibikes, open(path_out, 'wb'))

In [10]:
# Parks
path_in = 'data/input/manhattan_parks.kml'
parks = parse_polygon_places_from_kml_file(path_in)
path_out = 'data/output/parks.p'
pickle.dump(parks, open(path_out, 'wb'))

In [16]:
# Perimeter
path_in = 'data/input/manhattan_perimeter.kml'
perimeter_lines = parse_polygon_places_from_kml_file(path_in)
latitude = []
longitude = []
for i in perimeter_lines.index:
    latitude += perimeter_lines.ix[i,'latitude']
    longitude += perimeter_lines.ix[i,'longitude']
perimeter = pd.DataFrame({'longitude':longitude,'latitude':latitude})
path_out = 'data/output/perimeter.p'
pickle.dump(perimeter, open(path_out, 'wb'))