In [1]:

import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

filename = 'osm/new-york_new-york.osm'


In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Get a list of all the tags and their count
"""

def count_tags(filename):
    osm_file = open(filename, "r")
    print ('opened_file')
    tagsdict = defaultdict (lambda: 0)
    #for event, elem in ET.iterparse(osm_file, events=('start', )):
    #    tagsdict[elem.tag]=tagsdict[elem.tag]+1
    
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end':
            tagsdict[elem.tag]=tagsdict[elem.tag]+1
            root.clear()
    return dict(tagsdict)

tags = count_tags(filename)
pprint.pprint(tags)    


opened_file
{'bounds': 1,
 'member': 100531,
 'nd': 12155810,
 'node': 9412590,
 'osm': 1,
 'relation': 8351,
 'tag': 8709038,
 'way': 1538707}


In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
check that the tag keys are valid chars
"""
import re
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}


osm_file = open(filename, "r")
context = iter(ET.iterparse(osm_file, events=('start', 'end')))
_, root = next(context)
for event, element in context:
    if event == 'end':
        if element.tag == "tag":
            key = element.attrib['k']
            whichCase = "other"
            if (lower.match(key)): whichCase = "lower"
            elif (lower_colon.match(key)): whichCase = "lower_colon"
            elif (problemchars.search(key)): whichCase = "problemchars"
            #print key, whichCase
            keys [whichCase] = keys [whichCase] + 1
    root.clear()
    
pprint.pprint(keys)

{'lower': 3458235,
 'lower_colon': 5121360,
 'other': 104450,
 'problemchars': 24993}


In [4]:
"""
Audit Street Types
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
first_word_re = re.compile(r'^\w+', re.IGNORECASE)
old_expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Terrace", "Loop", "Highway", "Course", "Circle", "Way", "Crescent", "Walk",
           "Turnpike", "Bridge", "Causeway", "Gate", "Cove", "Alley", "Thruway", "Hill", "Piers", "Quadrangle",
            "Mews", "Path", "Run", "Expressway", "Freeway",
           "Broadway", "Bowery", ""]
expected_first = ["Avenue", "Route"]
extra_suffix = ["North", "East", "West", "South", "north", "EAST", ]


"""
Add street name to a list if the street type appears invalid
"""
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            #print 'Found', street_type
            if street_type_first(street_name): return
            else: street_types[street_type].add(street_name)

"""
Function to check street valid, where the street type is first, like 'Avenue B'
"""            
def street_type_first(street_name):
    #print 'Check first', street_name
    m = first_word_re.search(street_name)
    if m:
        if m.group() in expected_first:
            #print 'Found Street Type first for', street_name
            return True
    return False
            

In [5]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


"""
Review the file for invalid street types
"""
def audit_street_types_in_file(osmfile):
    street_types = defaultdict(set)
    osm_file = open(filename, "r")
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end':

            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        #print tag.attrib['k'], tag.attrib['v']
                        audit_street_type(street_types, tag.attrib['v'])
        root.clear()
    osm_file.close()
    return street_types


"""
Handle Scenarios where street type is followed by a suffix, such as South
"""    
def process_suffix (name, mapping):
    split_name = name.split(' ')
    if len(split_name) > 2 and split_name[-1] in extra_suffix:
        #print "Has a valid suffix ", name
        return update_name(name.rsplit(' ', 1)[0], mapping) + ' ' + split_name[-1] 
    return None

In [6]:
"""
Run the functions to process the osm xml file and print out any improved addresses
"""
unexpected_st_types = audit_street_types_in_file(filename)

In [7]:
pprint.pprint(len(dict(unexpected_st_types)))

220


In [8]:
mapping = { "Rd" : "Road",
    "Steet" : "Street",
    "STREET" : "Street",
    "avenue" : "Avenue",
    "ROAD" : "Road",
    "Pkwy" : "Parkway",
    "Cir" : "Circle",
    "AVenue" : "Avenue",
    "Tunrpike" : "Turnpike",
    "Broadwat" : "Broadway",
    "CIRCLE" : "Circle",
    "Turnlike" : "Turnpike",
    "street" : "Street",
    "Dr." : "Drive",
    "Aveneu" : "Avenue",
    "Ave." : "Avenue",
    "Avenue," : "Avenue",
    "Streeet" : "Street",
    "Pky" : "Parkway",
    "PLACE" : "Place",
    "CT)" : "Court",
    "road" : "Road",
    "St." : "Street",
    "Tirnpike" : "Turnpike",
    "LANE" : "Lane",
    "boulevard" : "Boulevard",
    "Ave," : "Avenue",
    "Avene" : "Avenue",
    "AVE." : "Avenue",
    "Tpke" : "Turnpike",
    "PARKWAY" : "Parkway",
    "Hwy" : "Highway",
    "Ave" : "Avenue",
    "Blvd." : "Boulevard",
    "Ct" : "Court",
    "Ln" : "Lane",
    "DRIVE" : "Drive",
    "ave" : "Avenue",
    "blvd" : "Boulevard",
    "Ct." : "Court",
    "AVENUE" : "Avenue",
    "Blv." : "Boulevard",
    "Tunpike" : "Turnpike",
    "Dr" : "Drive",
    "Expy" : "Expressway",
    "St" : "Street",
    "lane" : "Lane",
    "ST" : "Street",
    "drive" : "Drive",
    "st" : "Street",
    "Blvd" : "Boulevard"
            }

In [9]:
"""
Get an improved street name by replacing invalid street type where possible
"""
def update_name(name, mapping):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        
        #print "Checking ", name
        if street_type in expected: pass
        elif street_type in mapping: 
            #print "Fixing name ", name
            name = street_type_re.sub(mapping[street_type], name)
            #print "Fixed name ", name
        else:
            processed_suffix = process_suffix(name, mapping)
            if processed_suffix != None: return processed_suffix
        
        return name     


for st_type, ways in unexpected_st_types.iteritems():
    first = True
    for name in ways:
        better_name = update_name(name, mapping)
        if name != better_name and first: 
            print name, "=>", better_name
            first = False
        

Valley Rd => Valley Road
GEORGE STREET => GEORGE Street
Bedford avenue => Bedford Avenue
NEW ENGLAND AVE. => NEW ENGLAND Avenue
SUTPHEN ROAD => SUTPHEN Road
Granite Pkwy => Granite Parkway
Barron Cir => Barron Circle
Wantagh AVenue => Wantagh Avenue
South Broadwat => South Broadway
GIBBONS CIRCLE => GIBBONS Circle
Hempstead Turnlike => Hempstead Turnpike
LIPMAN DRIVE => LIPMAN Drive
Plaza St West => Plaza Street West
Hudson street => Hudson Street
N 9th ST => N 9th Street
Ridge Dr. => Ridge Drive
Marginal St East => Marginal Street East
St. Nicholas Aveneu => St. Nicholas Avenue
Franklin Ave. => Franklin Avenue
70th Avenue, => 70th Avenue
Franklin Ave, => Franklin Avenue
Anderson Ave => Anderson Avenue
Johnson Streeet => Johnson Street
Ellis Pky => Ellis Parkway
TITSWORTH PLACE => TITSWORTH Place
raider road => raider Road
Lafayette St. => Lafayette Street
Hempstead Tirnpike => Hempstead Turnpike
MARVIN LANE => MARVIN Lane
sutphin boulevard => sutphin Boulevard
BIEL ROAD EAST => BIEL R

In [10]:

"""
This section is the start of the code for shaping the OSM XML to JSON
"""
import codecs
import json

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

"""
Constants for organizing the elements
"""
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
replace_tags = {'addr':'address', 'type':'type_as_specified'}
#has_value_and_children = ['hgv', 'name', 'building', 'railway', 'lanes', 'maxspeed', 'source', 'is_in', 'internet_access', ]
has_value_and_children = []

In [11]:
"""
Process the top level attributes
"""
def process_attributes(element):
    attributes = {}
    created = {}
    pos = [None,None]

    for attribute in element.attrib:
        value = element.attrib[attribute]

        #organize elements specified in CREATED to a created element
        if attribute in CREATED:
            created[attribute] = value

        #organize the latitude and longitude to a pos [lat,long] element
        elif attribute == 'lat': 
            try:
                pos [0] = float(value)
            except ValueError:
                print "Not a float"                
        elif attribute == 'lon':
            try:
                pos [1] = float(value)
            except ValueError:
                print "Not a float"
                
        #default assignment 
        else: attributes[attribute] = value    

    #include the organized elements to the dict
    attributes['created'] = created
    #Note: Iteration 2: this check for None was introduced after initial nyny was analyzed
    if None not in pos: attributes['pos'] = pos
        
    return attributes    

def process_refs(element):
    node_refs = []
    for ref in element.iter("nd"):
        node_refs.append(ref.attrib['ref'])
    if len(node_refs) > 0 : return {"node_refs":node_refs}
    return {}

"""
Process the list the tag k, v attributes
"""
def process_tags(element, found_complex_tag):
    tags = {}
    for tag in element.iter("tag"):
        process_tag(tag.attrib['k'],tag.attrib['v'], tags, found_complex_tag)
    return tags

In [12]:
"""
Translate one tag k, v attribute
"""
def process_tag (name, value, tags, found_complex_tag):
    #tags[name]=value
    #if (name.startswith('hgv')): print name, value
    try:
        if (name!= None and len(name)>0 and name.strip()!=':' and 
            name.strip()!='' and not problemchars.search(name)):
            
            levels = name.split(':')
            
            #extract the top level key and convert it as needed
            top = levels[0]
            top = replace_tags.get(top,top)
            
            #assign value if it's a top level attribute
            if len(levels) == 1: 
                if top not in tags: tags[top] = value
                else: 
                    print top, value, ' other value already found as ', tags[top]
                
            #add to address dict if there are two levels
            elif top=='address' and len(levels) == 2:
                if 'address' not in tags: tags['address']={}                    
                if not isinstance(tags['address'],dict): 
                    #print 'address:', name, value, ' overwriting simple value already found as ', tags['address']
                    tags['address']={}
                process_tag (levels[1],value,tags['address'], found_complex_tag)
                
            #add to a dict if there are multiple levels
            elif top!='address' and len(levels) > 1:
                if top in has_value_and_children: top = top + "_data"
                elif top in tags and not isinstance(tags[top],dict): 
                    #a root value was already found for this tag.
                    #print name, value, ' unexpected - simple value already found as ', top, tags[top]
                    found_complex_tag.add(top)
                    return
                if top not in tags: tags[top]={}                    
                process_tag (name.split(':', 1)[1], value, tags[top], found_complex_tag)
                
    except Exception, e:
        print "Failed: %s" % e
        print 'Exception', name, value
        pprint.pprint(tags)
        return

In [13]:
"""
Modify the given tags dict and fix street name in address
"""
street_tags = ['street']
def fix_street(tags):
    if 'address' in tags:
        addr = tags['address']
        for key in addr:
            if key in street_tags:
                #print 'checking', addr[key]
                addr[key] = update_name(addr[key], mapping)

In [14]:
"""
Translation method for convering one OSM XML entry to a dict as per specs
"""
def shape_element(element, found_complex_tag):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        node['type']=element.tag        
        refs = process_refs(element)
        node.update(refs)
        attrs = process_attributes(element)    
        node.update(attrs)
        tags = process_tags(element, found_complex_tag)
        fix_street(tags)
        node.update(tags)
              
        #pprint.pprint( node )
        return node
    else:
        return None

In [15]:
"""
Parse an OSM XML file and call shape_element for each entry,
the shaped element is written to a file in json format
"""
def check_has_value_and_children(file_in, pretty = False):
    #data = []
    found_complex_tag = set()
    osm_file = open(filename, "r")
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end':
            el = shape_element(elem, found_complex_tag)
        root.clear()
    return found_complex_tag
has_value_and_children = check_has_value_and_children(filename, False)
print has_value_and_children

set(['inscription', 'maxspeed', 'bicycle', 'proposed', 'bridge', 'social_facility', 'community', 'alt_name', 'parking', 'operator', 'is_in', 'toilets', 'scale', 'capacity', 'aerialway', 'hgv', 'opening_hours', 'destination', 'wikipedia', 'feature', 'handrail', 'cycleway', 'access', 'source', 'toll', 'ref', 'highway', 'FIXME', 'animal_keeping', 'lanes', 'description', 'short_name', 'heritage', 'electrified', 'denomination', 'historic', 'old_name', 'smoking', 'sidewalk', 'population', 'building', 'odbl', 'deelectrified', 'name', 'tunnel', 'wheelchair', 'ramp', 'oneway', 'railway', 'disused', 'internet_access'])


In [16]:
"""
Parse an OSM XML file and call shape_element for each entry,
the shaped element is written to a file in json format
"""
def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    #data = []
    with codecs.open(file_out, "w") as fo:
        osm_file = open(filename, "r")
        context = iter(ET.iterparse(osm_file, events=('start', 'end')))
        _, root = next(context)
        for event, elem in context:
            if event == 'end':
                el = shape_element(elem, set())
                if el:
                    #data.append(el)
                    if pretty:
                        fo.write(json.dumps(el, indent=2)+"\n")
                    else:
                        fo.write(json.dumps(el) + "\n")
            root.clear()
    #return data

In [17]:
"""
******************  GENERATING THE JSON FILE ****************************
"""
process_map(filename, False)

In [None]:
"""
******************  ADDITIONAL AUDITS USING THE JSON FILE ****************************
"""

import json

"""
Extract the tags of a particular entry and add it to a set for tracking locations of a tag name
"""
def extract_tags(node,parent,tags):
    for key in node:
        if isinstance (node[key],dict): 
            extract_tags(node[key] , parent + key + ".", tags)
        else:
            if key not in tags: tags[key] = set()
            tags[key].add( parent+key )

"""
Iterate through a file with json entries and capture each tag name's location within documents
"""
def check_for_unique_tags(jsonFile):
    f = open(jsonFile)
    tags = defaultdict(set)
    for line in iter(f):
        #print line
        node = json.loads(line)
        extract_tags(node, '', tags)
    f.close()
    #pprint.pprint (dict(tags))
    multis = {k: v for k, v in (dict(tags)).items() if len(tags[k])>1}
    pprint.pprint (multis)
    

In [None]:
check_for_unique_tags(filename + '.json')

In [None]:
"""
These functions are for checking the data type.  It's helpful for seeing the quality of the data.
For example, zip code resolving to integer meant they were all proper digits
And if there are fields with both array and single values, then this would help identify, but 
since I formatted these json records, I didn't have a problem with array/single value types.
"""

def isfloat(value):
  try:
    float(value)
    return True
  except:
    return False

def isint(value):
  try:
    int(value)
    return True
  except:
    return False


def get_data_type(value):
    if value == "" or value == "NULL":
        return type(None)
    elif type(value) is list:
        return type([])
    elif isint(value):
        return type(int())
    elif isfloat(value):
        return type(float())
    else:
        return type(str())


def extract_data_types(node,parent,tags):
    for key in node:
        location = parent + key
        if isinstance (node[key],dict): 
            extract_data_types(node[key] , location + ".", tags)
        else:
            if location not in tags: tags[location] = set()
            tags[location].add( get_data_type(node[key]) )

def audit_data_types_in_file(jsonFile):
    f = open(jsonFile)
    tags = defaultdict(set)
    for line in iter(f):
        node = json.loads(line)
        extract_data_types(node, '', tags)
    f.close()
    return dict(tags)
   

In [None]:
import json 
data_types = audit_data_types_in_file(filename + '.json')

In [None]:
pprint.pprint(data_types['address.postcode'])

In [1]:
def get_db():
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    # 'examples' here is the database name. It will be created if it does not exist.
    db = client.osm
    return db

db = get_db()
db.nyny.find_one()


{u'_id': ObjectId('58292f4103fb1722a45a65fb'),
 u'created': {u'changeset': u'41015803',
  u'timestamp': u'2016-07-25T17:17:46Z',
  u'uid': u'326503',
  u'user': u'wambag',
  u'version': u'4'},
 u'id': u'26769789',
 u'pos': [40.6995927, -74.1868914],
 u'type': u'node'}

In [3]:
db.nyny.find_one({"address.postcode" : { "$exists" : True } })

{u'_id': ObjectId('58292f4203fb1722a45a7ea9'),
 u'address': {u'city': u'New York',
  u'housenumber': u'244',
  u'postcode': u'10001',
  u'street': u'5th Avenue'},
 u'contact': {u'email': u'ny@infrontweb.com',
  u'facebook': u'https://www.facebook.com/infrontweb',
  u'google_plus': u'https://plus.google.com/+Infrontweb/posts',
  u'phone': u'(646) 395-9884',
  u'twitter': u'https://twitter.com/infrontweb',
  u'website': u'http://infrontweb.com/'},
 u'created': {u'changeset': u'36034386',
  u'timestamp': u'2015-12-18T19:38:30Z',
  u'uid': u'251543',
  u'user': u'ChrissW-R1',
  u'version': u'9'},
 u'description': u'infrontweb is a full service marketing agency offering ecommerce and custom web design to businesses in New York, the continental United States and internationally since 1997. For the past 15 year',
 u'id': u'42465563',
 u'name': u'Infront Web',
 u'pos': [40.6286801, -74.0229613],
 u'service': u'Seo New York,Web design New York,website design NY,Iphone App Developer New York,Iph

In [20]:
size = db.nyny.count()
print size

10951297


In [21]:
"""
Check Zip code
"""
def make_zip_pipeline():
    
    pipeline = [ 
        { "$match": { "address.postcode" : { "$exists" : True } } },
        { "$group" : { "_id": "$address.postcode", "count" : {"$sum" : 1 }}},
        { "$sort": {"_id":1 }} ]
        #{ "$sort": {"address.postcode":-1 }},
        #{ "$limit" : 1 } ]
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.nyny.aggregate(pipeline)]


#db = get_db()
pipeline = make_zip_pipeline()
result = aggregate(db, pipeline)
#import pprint
#pprint.pprint(result)
print result

[{u'count': 1, u'_id': u'(718) 778-0140'}, {u'count': 25, u'_id': u'06807'}, {u'count': 17, u'_id': u'06820'}, {u'count': 22, u'_id': u'06830'}, {u'count': 8, u'_id': u'06831'}, {u'count': 2, u'_id': u'06853'}, {u'count': 3, u'_id': u'06855'}, {u'count': 2, u'_id': u'06870'}, {u'count': 6, u'_id': u'06878'}, {u'count': 1, u'_id': u'06880'}, {u'count': 6, u'_id': u'06901'}, {u'count': 44, u'_id': u'06902'}, {u'count': 1, u'_id': u'06904'}, {u'count': 9, u'_id': u'06905'}, {u'count': 10, u'_id': u'06906'}, {u'count': 1, u'_id': u'06907'}, {u'count': 1, u'_id': u'07001'}, {u'count': 2, u'_id': u'07002'}, {u'count': 17, u'_id': u'07003'}, {u'count': 2, u'_id': u'07004'}, {u'count': 2, u'_id': u'07006'}, {u'count': 1, u'_id': u'07007'}, {u'count': 4, u'_id': u'07008'}, {u'count': 2, u'_id': u'07010'}, {u'count': 1, u'_id': u'07011'}, {u'count': 3, u'_id': u'07012'}, {u'count': 2, u'_id': u'07013'}, {u'count': 6, u'_id': u'07016'}, {u'count': 7, u'_id': u'07020'}, {u'count': 43, u'_id': u'07

In [23]:
"""
Inspect records with bad zip codes
"""
wrong_length = [ rec['_id'] for rec in result if len(rec['_id']) != 5]
print wrong_length

check_zips = ['08816', '08854']

for zip in (check_zips):
    res = db.nyny.find_one({"address.postcode" : zip})
    pprint.pprint(res)

[u'(718) 778-0140', u'07030-5774', u'07052 ', u'07054-1396', u'07302-4522', u'07305-9997', u'07506-1566', u'07834-2558', u'07834-2716', u'07940-2747', u'08854-3929', u'08854-5024', u'08854-5602', u'08854-5603', u'08854-5610', u'08854-5622', u'08854-5627', u'08854-5659', u'08854-5695', u'08854-8000', u'08854-8001', u'08854-8002', u'08854-8003', u'08854-8004', u'08854-8006', u'08854-8007', u'08854-8008', u'08854-8009', u'08854-8010', u'08854-8012', u'08854-8014', u'08854-8018', u'08854-8019', u'08854-8020', u'08854-8022', u'08854-8023', u'08854-8025', u'08854-8029', u'08854-8030', u'08854-8031', u'08854-8032', u'08854-8033', u'08854-8035', u'08854-8036', u'08854-8037', u'08854-8038', u'08854-8039', u'08854-8040', u'08854-8041', u'08854-8042', u'08854-8043', u'08854-8045', u'08854-8047', u'08854-8048', u'08854-8049', u'08854-8050', u'08854-8051', u'08854-8052', u'08854-8053', u'08854-8054', u'08854-8055', u'08854-8058', u'08854-8059', u'08854-8060', u'08854-8062', u'08854-8063', u'08854-8

In [5]:
"""
Functions to get distinct values for a field and the count of each
"""

def make_distinct_with_count_pipeline( by_field ):    
    pipeline = [ 
        { "$match": { by_field : { "$exists" : True } } },
        { "$group" : { "_id": "$" + by_field , "count" : {"$sum" : 1 }}},
        { "$sort" : { "count": -1 }},
        { "$limit" : 20 } ]
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.nyny.aggregate(pipeline)]

def distinct_with_count(field_name):
    #db = get_db()
    pipeline = make_distinct_with_count_pipeline(field_name)
    result = aggregate(db, pipeline)
    return result


In [25]:
result = distinct_with_count('type')
print result

[{u'count': 1538707, u'_id': u'way'}, {u'count': 9412590, u'_id': u'node'}]


In [7]:
import pprint
get_distincts = ["address.state", "source", "cuisine", "leisure", "office", "service", "shop", "sport"]

for field_name in get_distincts:
    print "Distinct", field_name
    pprint.pprint (distinct_with_count(field_name))
    print " "


Distinct address.state
[{u'_id': u'NY', u'count': 7514},
 {u'_id': u'NJ', u'count': 2206},
 {u'_id': u'Nj', u'count': 534},
 {u'_id': u'CT', u'count': 137},
 {u'_id': u'New York', u'count': 37},
 {u'_id': u'ny', u'count': 35},
 {u'_id': u'New Jersey', u'count': 9},
 {u'_id': u'nj', u'count': 9},
 {u'_id': u'Ny', u'count': 6},
 {u'_id': u'ct', u'count': 4},
 {u'_id': u'Nj;NJ', u'count': 4},
 {u'_id': u'TX', u'count': 2},
 {u'_id': u'ON', u'count': 1},
 {u'_id': u'New York State', u'count': 1},
 {u'_id': u'VA', u'count': 1},
 {u'_id': u'BY', u'count': 1},
 {u'_id': u'CA', u'count': 1},
 {u'_id': u'NJ - New Jersey', u'count': 1},
 {u'_id': u'10009', u'count': 1}]
 
Distinct source
[{u'_id': u'NJ2002LULC', u'count': 7618},
 {u'_id': u'Bing', u'count': 5581},
 {u'_id': u'USGS Geonames', u'count': 2021},
 {u'_id': u'Yahoo', u'count': 1733},
 {u'_id': u'TIGER/Line\xae 2008 Place Shapefiles (http://www.census.gov/geo/www/tiger/)',
  u'count': 1713},
 {u'_id': u'BingSat and Mapillary.com', u'co

In [10]:
"""
Functions to count the distinct values for a field
"""

z
    return pipeline

def aggregate(db, pipeline):
    return [doc for doc in db.nyny.aggregate(pipeline)]

def count_distinct(field_name):
    #db = get_db()
    pipeline = make_countdistinct_pipeline(field_name)
    result = aggregate(db, pipeline)
    return result[0]


IndentationError: unexpected indent (<ipython-input-10-24fa40409ff7>, line 6)

In [9]:
print "Distinct", 'amenity'
pprint.pprint (distinct_with_count('amenity'))
    
def print_count_distincts(fields):
    for field_name in fields:
        #pprint.pprint(get_top(field_name))
        count_dist = count_distinct(field_name)
        print "Distinct ", field_name, count_dist['_id'], ' - count', count_dist['count']

get_count_distincts_fields = ['created.user', 'address.postcode', 'address.city', 'source']
print_count_distincts(get_count_distincts_fields)

Distinct amenity
[{u'_id': u'parking', u'count': 7061},
 {u'_id': u'bicycle_parking', u'count': 4874},
 {u'_id': u'school', u'count': 4703},
 {u'_id': u'place_of_worship', u'count': 4694},
 {u'_id': u'restaurant', u'count': 3313},
 {u'_id': u'fast_food', u'count': 1077},
 {u'_id': u'cafe', u'count': 990},
 {u'_id': u'bank', u'count': 800},
 {u'_id': u'fire_station', u'count': 690},
 {u'_id': u'bench', u'count': 592},
 {u'_id': u'fuel', u'count': 549},
 {u'_id': u'library', u'count': 496},
 {u'_id': u'pharmacy', u'count': 450},
 {u'_id': u'toilets', u'count': 450},
 {u'_id': u'hospital', u'count': 435},
 {u'_id': u'bar', u'count': 416},
 {u'_id': u'post_office', u'count': 403},
 {u'_id': u'bicycle_rental', u'count': 400},
 {u'_id': u'grave_yard', u'count': 349},
 {u'_id': u'parking_space', u'count': 331}]
Distinct  created.user created.user  - count 3852
Distinct  address.postcode address.postcode  - count 765
Distinct  address.city address.city  - count 430
Distinct  source source  - c

In [11]:
"""
Functions to get top value for a field and the count of each
"""

def make_get_top_pipeline( by_field ):    
    pipeline = [ 
        { "$match": { by_field : { "$exists" : True } } },
        { "$group" : { "_id": "$" + by_field , "count" : {"$sum" : 1 }}},
        { "$sort": { "count" :-1 }},
        { "$limit" : 1 } ]
    return pipeline

def get_top(field_name):
    #db = get_db()
    pipeline = make_get_top_pipeline(field_name)
    result = aggregate(db, pipeline)
    return result[0]

def print_tops(fields):
    for field_name in fields:
        #pprint.pprint(get_top(field_name))
        top = get_top(field_name)
        print "Top", field_name, top['_id'], ' - count', top['count']

#import pprint
#pprint.pprint(result)
get_top_fields = ['created.user', 'address.postcode', 'address.city', 'amenity']
print_tops(get_top_fields)

Top created.user Rub21_nycbuildings  - count 4889620
Top address.postcode 10314  - count 23065
Top address.city New York  - count 5185
Top amenity parking  - count 7061
