**Code to determine the size of the source map file:**

In [81]:
import os

b = os.path.getsize("vegas")
print("The size of the OSM source file is", b, "bytes.")

The size of the OSM source file is 62711991 bytes.


**Process to determine the number of unique users that contributed to this map selection:**

In [82]:
import xml.etree.ElementTree as ET
import pprint
import re

def get_user(element):
    uid = ''
    if element.tag == "node" or element.tag == "way" or element.tag == "relation":
        uid = element.get('uid')

    return uid


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if get_user(element):
            users.add(get_user(element))
            users.discard('')
        pass

    return users


def test():

    num_users = process_map('vegas')
    print("There are", (len(num_users)), "unique users.")


if __name__ == "__main__":
    test()

There are 676 unique users.


In [83]:
import xml.etree.ElementTree as ET
import pprint
import re

def get_userid(element):
    return element.attrib['uid']

def get_username(element):
    return element.attrib['user']

def process_map(filename):
    userids = set()
    usernames = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib.keys():
            userid = get_userid(element)
            userids.add(userid)
        if 'user' in element.attrib.keys():
            username = get_username(element)
            usernames.add(username)

    return userids, usernames

with open('vegas', 'rb') as mapfile:
    userids, usernames = process_map(mapfile)
    print("There are %d unique user IDs." % len(userids))
    print("There are %d unique usernames." % len(usernames))

There are 676 unique user IDs.
There are 676 unique usernames.


**Used alternative coding to make sure number the number of 'user' and 'uid' selections was both accurate symmetrical.  No problems found.**

In [84]:
import xml.etree.ElementTree as ET
import pprint

def count_tags(filename):
    tags = {}
    for event, elem in ET.iterparse(filename, events=("start",)):
        if elem.tag in tags.keys():
            tags[elem.tag] += 1
        else:
            tags[elem.tag] = 1
            print(tags)
    return tags

def test():
    
    tags = count_tags('vegas')
    print("\nUnique tabs:\n")
    pprint.pprint(tags)

if __name__ == "__main__":
    test()

{'osm': 1}
{'osm': 1, 'note': 1}
{'osm': 1, 'note': 1, 'meta': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1, 'node': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1, 'node': 1, 'tag': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1, 'node': 290183, 'tag': 15471, 'way': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1, 'node': 290183, 'tag': 15471, 'way': 1, 'nd': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1, 'node': 290183, 'tag': 90392, 'way': 26918, 'nd': 330033, 'relation': 1}
{'osm': 1, 'note': 1, 'meta': 1, 'bounds': 1, 'node': 290183, 'tag': 90392, 'way': 26918, 'nd': 330033, 'relation': 1, 'member': 1}

Unique tabs:

{'bounds': 1,
 'member': 6104,
 'meta': 1,
 'nd': 330033,
 'node': 290183,
 'note': 1,
 'osm': 1,
 'relation': 294,
 'tag': 91369,
 'way': 26918}


**Used iterative parsing to process the source file and find how many unique tabs exist in the data:**

**Next, I explored lower level tags to identify name and count of each occurance.**

In [85]:
import xml.etree.ElementTree as ET
from pprint import pprint
import operator

OSMFILE = 'vegas'

def count_tags(filename):
    element_count = {}
    k_attributes = {}

    for event, element in ET.iterparse(filename, events=("start",)):
        element_count[element.tag] = element_count.get(element.tag, 0) + 1

        if element.tag == 'tag' and 'k' in element.attrib:
            k_attributes[element.get("k")] = k_attributes.get(element.get("k"), 0) + 1

    # sort the dictionary by counts in decending order
    k_attributes = sorted(k_attributes.items(), key=operator.itemgetter(1))[::-1]
    element_count = sorted(element_count.items(), key=operator.itemgetter(1))[::-1]

    return element_count, k_attributes

def main():
    """ main function """
    element_count, k_attributes = count_tags(OSMFILE)
    print(element_count)
    print(k_attributes)
    return element_count, k_attributes

if __name__ == "__main__":
    main()

[('nd', 330033), ('node', 290183), ('tag', 91369), ('way', 26918), ('member', 6104), ('relation', 294), ('bounds', 1), ('meta', 1), ('note', 1), ('osm', 1)]
[('highway', 22937), ('name', 9273), ('building', 5172), ('tiger:county', 3554), ('tiger:name_base', 3544), ('tiger:cfcc', 3544), ('tiger:name_type', 3403), ('oneway', 3189), ('tiger:reviewed', 3008), ('surface', 2777), ('service', 2508), ('noexit', 2032), ('access', 1877), ('source', 1660), ('lanes', 1531), ('landuse', 1317), ('natural', 1210), ('golf', 998), ('addr:housenumber', 890), ('tiger:name_direction_prefix', 875), ('road_marking', 812), ('crossing', 804), ('amenity', 764), ('barrier', 688), ('power', 623), ('addr:street', 566), ('addr:postcode', 529), ('leisure', 438), ('created_by', 388), ('bicycle', 335), ('phone', 326), ('website', 315), ('maxspeed', 311), ('tiger:name_base_1', 296), ('type', 295), ('addr:city', 293), ('opening_hours', 290), ('name_1', 289), ('addr:state', 277), ('brand', 256), ('brand:wikidata', 254),

**List of all of the different tags in the area and the number of occurances of each.  Will use to identify further areas of exploration.  Looking for tags with multiple instances in order to gain more in-depth understanding.**

In [86]:
import xml.etree.cElementTree as ET
import pprint
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def key_type(element, keys):
    if element.tag == "tag":
        if lower.match(element.attrib['k']):
            keys["lower"] += 1
        elif lower_colon.search(element.attrib['k']):
            keys["lower_colon"] += 1
        elif problemchars.search(element.attrib['k']):
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
        pass
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

def test2():
    
    keys = process_map('vegas')
    pprint.pprint(keys)

if __name__ == "__main__":
    test2()

{'lower': 67760, 'lower_colon': 22444, 'other': 1165, 'problemchars': 0}


**Process to find tag types with problem characters.  There were none.**

**Also wanted to find the values of k tags to further evaluate data and identify specific areas to explore in the data:**

In [95]:
import xml.etree.ElementTree as ET
import pprint
import re

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
lower_colon_vals = {}

def key_type(element, keys):
    if element.tag == "tag":
        kval = element.attrib['k']
        if re.search(lower, kval):
            keys['lower'] += 1
        elif re.search(lower_colon, kval):
            keys['lower_colon'] += 1
            colvals = kval.split(':')
            if colvals[0] not in lower_colon_vals.keys():
                lower_colon_vals[colvals[0]] = set()
            lower_colon_vals[colvals[0]].add(colvals[1])
        elif re.search(problemchars, kval):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1
        
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

with open('vegas', 'rb') as mapfile:
    keys = process_map(mapfile)
    print("Types of k-values and their counts:\n")
    pprint.pprint(keys)
    print("\nTypes of colon-separated k-values:\n")
    pprint.pprint(lower_colon_vals)

Types of k-values and their counts:

{'lower': 67760, 'lower_colon': 22444, 'other': 1165, 'problemchars': 0}

Types of colon-separated k-values:

{'addr': {'city',
          'country',
          'county',
          'full',
          'housename',
          'housenumber',
          'postcode',
          'state',
          'street',
          'unit'},
 'brand': {'wikipedia', 'wikidata'},
 'building': {'levels', 'part'},
 'capacity': {'disabled'},
 'change': {'lanes'},
 'communication': {'mobile_phone'},
 'contact': {'facebook'},
 'crossing': {'barrier', 'island'},
 'cycleway': {'right'},
 'destination': {'ref', 'street', 'lanes'},
 'diet': {'vegan'},
 'disused': {'leisure'},
 'flag': {'type', 'name', 'wikidata'},
 'fuel': {'diesel', 'regular'},
 'generator': {'type', 'method', 'source'},
 'gnis': {'county_id',
          'county_name',
          'created',
          'edited',
          'feature_id',
          'feature_type',
          'id',
          'state_id'},
 'golf': {'designer', 'pa

**Further audit of 'k' values in map data selection.  Upon review, there does not seem to be any invalid 'k' values.  All tags appear to be legitimate based on knowledge of area.**

**Will explore the following k tags to gain a better understanding of the selected map:
* 'brand' tags - Which brands occur with the most frequency?
* 'city' tags - What cities are represented as labels?
* 'leisure' tags - What leisure facilities are available in the area?
* 'golf' tags - What golf related labels exist in the map selection?  
* 'healthcare' tags - What categories of healthcare facilities are located here?

In [88]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint
import re

osm_file = open("West Vegas Valley.osm", "rb", buffering=0)

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(set)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Circle", "Road", "Lane", "Road", "Trail", "Parkway", "Commons"]

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s:s.lower())
    for k in keys:
        v = d[k]
        print("%s: %d" % (k, v))
        
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def audit():
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    print("\n")
    pprint.pprint(dict(street_types))
    
if __name__ == '__main__':
    audit()
                    



{'1': {'Spanish Ridge Ave., Suite 1'},
 '705': {'West Ali Baba Lane #705'},
 'Ave': {'W Washington Ave'},
 'Ave.': {'6601 W. Twain Ave.'},
 'Blv': {'South Rampart Blv'},
 'Blvd': {'Carmen Blvd',
          'Thomas Ryan Blvd',
          'W Charleston Blvd',
          'West Charleston Blvd'},
 'Blvd.': {'West Lake Mead Blvd.'},
 'Buckskin': {'W Buckskin'},
 'Dr,': {'S Pavilion Center Dr,'},
 'Rd': {'Hillpointe Rd', 'El Camino Rd', 'W Sunset Rd', 'S Fort Apache Rd'},
 'Robindale': {'West Robindale'},
 'S': {'Silverado Ranch Boulevard;Las Vegas Boulevard S'},
 'St': {'6770 S Edmond St', 'S Edmond St', 'Alerion St'},
 'Way': {'Brandywine Way',
         'Crystal Water Way',
         'North Michael Way',
         'North Tenaya Way',
         'South Hualapai Way'},
 'drive': {'Vegas drive'}}


## Notes on audited street names (unexpected street types):

* '1' and '705' - Valid apartment/suite numbers.  No changes needed.
* 'Ave' and 'Ave.' - Abbreviations of Avenue.  Will update in subsequent code.
* 'Blv' , 'Blvd.', and 'Blvd' - Abbreviations of Boulevard.  Will update in subsequent code. 
* 'Buckskin' - Valid street name.  Google search reveals not missing street label.  Will add to 'expected' list. 
* 'Dr,' - Abbreviation of Drive with an added character (,).  Will update in subsequent code.
* 'Rd' - Abbreviation of Road.  Will update in subsequent code.
* 'Robindale' - Valid street name.  Will add to 'expected' list in subsequent code.
* 'S' - Valid abbreviation of directional.  Will update to full word in subsequent code.
* 'St' - Valid abbreviation of Street.  Will update in subsequent code.
* 'Way' - Valid street name.  Will add to 'expected' list in subsequent code.
* 'drive' - Valid lowercase version of Drive.  Not sure why the IGNORECASE code did not catch it so will update in subsequent code. 

**Conclusions:  Overall pretty pleased with the low number of 'problem' street name abnormalities, especially considering the size of the source data set.** 

In [89]:
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict

OSMFILE = "vegas"

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
expected = expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Circle", "Road", "Lane", "Road", "Trail", "Parkway", "Commons",
                          "Buckskin", "Robindale", "Way"]

# Updated dictionary 'mapping' reflects changes needed in SW Vegas file
mapping = { "E": "East", "E.": "East", "W.":"West", "W": "West", "N.": "North", "N": "North", "S": "South", "Rd": "Road",
            "Rd.": "Road", "ln": "Lane", "ln.": "Lane", "Ln": "Lane", "Ln.": "Lane", "Ct": "Court", "dr": "Drive",
            "dr.": "Drive", "Dr": "Drive", "Dr.": "Drive", "drive" : "Drive", "st": "Street", "St": "Street", "St.": "Street", "Ste": "Suite",
            "Ste.": "Suite", "Trl": "Trail", "Cir": "Circle", "cir": "Circle", "Av": "Avenue", "Ave": "Avenue",
            "Ave.": "Avenue", "Pky": "Parkway", "Pky.": "Parkway",
            "Pkwy": "Parkway", "pkwy": "Parkway", "Fwy": "Freeway", "Fwy.": "Freeway", "BLVD": "Boulevard", "Blvd": "Boulevard",
            "Blvd.": "Boulevard", "Blv" : "Boulevard"
            }


def audit_street_type(street_types, street_name):
   
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            if street_type.isdigit():
                try:
                    true_street_type = street_name.split()[-2]
                    if true_street_type not in expected:
                        street_types[true_street_type].add(street_name)
                except IndexError:
                    pass
            else:
                street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    """
    Redoing audit to make sure expected changes and updates have taken place.
    """
    osm_file = open(osmfile, "rb")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update(name, mapping):
  
    words = name.split()
    for w in range(len(words)):
        if words[w] in mapping:
            words[w] = mapping[words[w]]
    name = " ".join(words)
    return name


def example_test():
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))
    for st_type, ways in st_types.items():
        for name in ways:
            better_name = update(name, mapping)
            print(name, "=>", better_name)
            if name == "S Fort Apache Rd":
                assert better_name == "South Fort Apache Road"
            if name == "S Edmond St":
                assert better_name == "South Edmond Street"


if __name__ == '__main__':
    example_test()

{'Ave': {'W Sahara Ave', 'W. Sahara Ave', 'W Cactus Ave'},
 'Blvd': {'S Rainbow Blvd', 'W Charleston Blvd'},
 'Dr': {'S Grand Canyon Dr #106'},
 'Dr,': {'S Pavilion Center Dr,'},
 'Lavender': {'W Lavender'},
 'Ln': {'Frontier Ranch Ln'},
 'Rd': {'S Fort Apache Rd', 'Desert Inn Rd', 'W Flamingo Rd'},
 'Resort': {'Red Rock Resort'},
 'Suite': {'Spanish Ridge Ave., Suite 1'},
 'drive': {'oval drive'}}
W Sahara Ave => West Sahara Avenue
W. Sahara Ave => West Sahara Avenue
W Cactus Ave => West Cactus Avenue
W Lavender => West Lavender
S Rainbow Blvd => South Rainbow Boulevard
W Charleston Blvd => West Charleston Boulevard
S Grand Canyon Dr #106 => South Grand Canyon Drive #106
oval drive => oval Drive
Red Rock Resort => Red Rock Resort
S Fort Apache Rd => South Fort Apache Road
Desert Inn Rd => Desert Inn Road
W Flamingo Rd => West Flamingo Road
Frontier Ranch Ln => Frontier Ranch Lane
Spanish Ridge Ave., Suite 1 => Spanish Ridge Ave., Suite 1
S Pavilion Center Dr, => South Pavilion Center 

In [90]:
import xml.etree.cElementTree as ET
import codecs
import pprint
import json
import re
from collections import defaultdict


OSMFILE = "vegas"

problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

mapping = { "E": "East", "E.": "East", "W.":"West", "W": "West", "N.": "North", "N": "North", "S": "South", "Rd": "Road",
            "Rd.": "Road", "ln": "Lane", "ln.": "Lane", "Ln": "Lane", "Ln.": "Lane", "Ct": "Court", "dr": "Drive",
            "dr.": "Drive", "Dr": "Drive", "Dr.": "Drive", "drive" : "Drive", "st": "Street", "St": "Street", "St.": "Street", "Ste": "Suite",
            "Ste.": "Suite", "Trl": "Trail", "Cir": "Circle", "cir": "Circle", "Av": "Avenue", "Ave": "Avenue",
            "Ave.": "Avenue", "Pky": "Parkway", "Pky.": "Parkway",
            "Pkwy": "Parkway", "pkwy": "Parkway", "Fwy": "Freeway", "Fwy.": "Freeway", "BLVD": "Boulevard", "Blvd": "Boulevard",
            "Blvd.": "Boulevard", "Blv" : "Boulevard"
            }

CREATED = ["version", "changeset", "timestamp", "user", "uid"]

TIGER_NAME_KEYS = ["name_direction_prefix", "name_base", "name_type", "name_direction_suffix"]

def update_postcode(value):
    """
    Replace any postcodes that are not of length 5 with 5-digit string
    
    """
    postcode = ''
    for char in value:
        if char.isdigit():
            postcode += char
        if len(postcode) == 5:
            break
    return postcode
                    
def update_street(name, mapping):
    """
    Scan through each substring and update any matched substring to their values in dictionary 'mapping'
   
    """
    words = name.split()
    for w in range(len(words)):
        if words[w] in mapping:
            words[w] = mapping[words[w]]
    name = " ".join(words)
    return name

def process_address_tiger(element, node, address, tiger):
    """
    Specifically deal with the dictionaries 'address' and 'tiger'
    
    """
    street = {}
    zipcode = set()
    for tag in element.iter("tag"):  
        k = tag.attrib["k"]
        v = tag.attrib["v"]
        problem = problemchars.search(k)

        if problem:
            continue

        # Create dictionary 'address'
        elif k.startswith("addr:"):
            if ":" in k[5:]:
                continue
            else:
                address[k[5:]] = v

        # Create dictionary 'tiger'
        elif k.startswith("tiger:"):
            if k[6:] in TIGER_NAME_KEYS:
                street[k[6:]] = v
            elif k[6:].startswith("zip"):
                if ";" in  v:
                    for z in v.split(";"):
                        zipcode.add(z)
                else:
                    zipcode.add(v)
            elif k[6:] in ["county", "cfcc"]:
                tiger[k[6:]] = v
            else:
                continue

        # Ignore gnis geographical features
        elif k.startswith("gnis:"):
            continue
        # To avoid overwriting the key "type" which records whether the document is a "node" or a "way",
        # Thus I rename the "type" attribute to "location_type"
        elif k == "type":
            node["location_type"] = v
        else:
            node[k] = v

    # Process tiger street name
    if len(street) != 0:
        street_string = " ".join([street[key] for key in TIGER_NAME_KEYS if key in street])
        tiger["street_name"] = update_street(street_string, mapping)

    # Update street in dictionary 'address'
    if "street" in address:
        address["street"] = update_street(address["street"], mapping)
    
    # Process postcodes
    if "postcode" in address:
        address["postcode"] = update_postcode(address["postcode"])
    if len(zipcode) != 0:
        tiger["zipcode"] = [update_postcode(v) for v in list(zipcode)]

    if len(tiger) != 0:
        node["tiger"] = tiger
    if len(address) != 0:
        node["address"] = address




def create_common_attributes_dict(element, common_attr_list, node):
    """
    Args:
        element: element of OSM XML file
        common_attr_list: list of common attributes ("version", "changeset", "timestamp", "user", "uid")
        node: dictionary storing data for each element with tag name "node" or "way"
    Returns:
        a complete dictionary storing common attributes for the nodes
    """
    if "created" not in node:
        node["created"] = {}
    for attr in common_attr_list:
        node["created"][attr] = element.attrib[attr]


def shape_element(element):
    """
    Takes an XML tag as input and returns a cleaned and reshaped dictionary for JSON ouput. 
    If the element contains an abbreviated street name, it returns with an updated full street name.
    If the postcodes/zipcodes do not follow 5-digit format, update them with the correct format.
    """
    node = {}
    address = {}
    tiger = {}
    nd_info = []
    if element.tag == "node" or element.tag == "way":
        node["type"] = element.tag
        node["id"] = element.attrib["id"]
        if "visible" in element.attrib:
            node["visible"] = element.attrib["visible"]
        if "lat" in element.attrib:
            node["pos"] = [float(element.attrib["lat"]), float(element.attrib["lon"])]
        create_common_attributes_dict(element, CREATED, node)
        process_address_tiger(element, node, address, tiger)
 
        for tag in element.iter("nd"):
            nd_info.append(tag.attrib['ref'])
        if nd_info != []:
            node['node_refs'] = nd_info

        return node
    
    else:
        return None


def process_map(file_in, pretty = False):
    """
    Outputs a JSON file with the above structure.
    Returns the data as a list of dictionaries.
    """
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        parser = ET.iterparse(file_in)
        for __, elem in parser:
            el = shape_element(elem)
            if el:
                data.append(el)
                # Output to JSON
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
        del parser
    return data


def main_test():
    data = process_map(OSMFILE, False)
    print('Map processed...\n')
    pprint.pprint(data[:5])
    pprint.pprint(data[-4:])

if __name__ == "__main__":
    main_test()
    


Map processed...

[{'created': {'changeset': '57189326',
              'timestamp': '2018-03-14T20:04:51Z',
              'uid': '548288',
              'user': 'WayneSchlegel',
              'version': '3'},
  'highway': 'turning_circle',
  'id': '137032447',
  'pos': [36.150817, -115.2576616],
  'type': 'node'},
 {'created': {'changeset': '57189326',
              'timestamp': '2018-03-14T20:04:51Z',
              'uid': '548288',
              'user': 'WayneSchlegel',
              'version': '3'},
  'highway': 'turning_circle',
  'id': '137032450',
  'pos': [36.150788, -115.2586],
  'type': 'node'},
 {'created': {'changeset': '3431276',
              'timestamp': '2009-12-23T00:18:41Z',
              'uid': '147510',
              'user': 'woodpeck_fixbot',
              'version': '2'},
  'id': '137032457',
  'pos': [36.127445, -115.303159],
  'type': 'node'},
 {'created': {'changeset': '3172090',
              'timestamp': '2009-11-20T21:49:46Z',
              'uid': '147510',
  

**Included audit of beginning and end of created list.** 

In [96]:
db_name = "vegas"

def get_db(db_name):
    from pymongo import MongoClient
    client = MongoClient("mongodb://localhost:27017/")
    db = client[db_name]
    return db


def top_users():
    # Top 10 contributing users
    group = {"$group" : {'_id' : '$created.user', 'count' : {'$sum' : 1}}}
    sort = {"$sort" : {'count' : -1}}
    limit = {"$limit" : 10}
    pipeline = [group, sort, limit]
    return pipeline

def top_cities():
    # Top 10 mentioned cities
    match = {"$match":{"address.city":{"$exists":1}}}
    group = {"$group":{"_id":"$address.city", "count":{"$sum":1}}}
    sort = {"$sort":{"count":-1}}
    limit = {"$limit":10}
    pipeline = [match, group, sort, limit]
    return pipeline

def top_amenities():
    # Top 10 appearing amenities
    match = {"$match":{"brand":{"$exists":1}}}
    group = {"$group":{"_id":"$brand", "count":{"$sum":1}}}
    sort = {"$sort":{"count":-1}}
    limit = {"$limit":10}
    pipeline = [match, group, sort, limit]
    return pipeline

def top_leisure_facilities():
    # Top 10 mentioned leisure facilities 
    match = {"$match":{"leisure":{"$exists":1}}}
    group = {"$group":{"_id":"$leisure", "count":{"$sum":1}}}
    sort = {"$sort":{"count":-1}}
    limit = {"$limit":10}
    pipeline = [match, group, sort, limit]
    return pipeline

def top_golf_labels():
    # Top 10 mentioned golf course labels
    match = {"$match":{"golf":{"$exists":1}}}
    group = {"$group":{"_id":"$golf", "count":{"$sum":1}}}
    sort = {"$sort":{"count":-1}}
    limit = {"$limit":10}
    pipeline = [match, group, sort, limit]
    return pipeline

def top_healthcare_facilities():
    # Top 10 types of healthcare facilities 
    match = {"$match":{"healthcare":{"$exists":1}}}
    group = {"$group":{"_id":"$healthcare", "count":{"$sum":1}}}
    sort = {"$sort":{"count":-1}}
    limit = {"$limit":10}
    pipeline = [match, group, sort, limit]
    return pipeline

def aggregate(db, pipeline):
    result = db.vegas.aggregate(pipeline)
    return result

def test(pipeline_function):
    db = get_db(db_name)
    pipeline = pipeline_function
    cursor = aggregate(db, pipeline)
    import pprint
    for document in cursor:
        pprint.pprint(document)
        

if __name__ == "__main__":
    print("Top 10 contributing users:\n")
    test(top_users())
    print("\nTop listed cities:\n")
    test(top_cities())
    print("\nTop 10 brands in the area:\n")
    test(top_amenities())
    print("\nTop 10 leisure facilities:\n")
    test(top_leisure_facilities())
    print("\nTop 10 golf course labels:\n")
    test(top_golf_facilities())
    print("\nTop categories of healthcare facilities:\n")
    test(top_healthcare_facilities())

Top 10 contributing users:

{'_id': 'TheDutchMan13', 'count': 23434}
{'_id': 'bmuskaan', 'count': 22518}
{'_id': 'Tom_Holland', 'count': 15987}
{'_id': 'JBradshaw84', 'count': 15402}
{'_id': 'CoasterRoyalty', 'count': 11977}
{'_id': 'vennredd', 'count': 11583}
{'_id': 'rkkasams', 'count': 10671}
{'_id': 'gMitchellD', 'count': 10121}
{'_id': 'Doubletap13', 'count': 9445}
{'_id': 'Trail_Caretaker', 'count': 8646}

Top listed cities:

{'_id': 'Las Vegas', 'count': 283}
{'_id': 'Spring Valley', 'count': 3}
{'_id': 'Las Vegas, NV', 'count': 1}

Top 10 brands in the area:

{'_id': 'Subway', 'count': 9}
{'_id': 'CVS Pharmacy', 'count': 9}
{'_id': 'Starbucks', 'count': 9}
{'_id': 'Chevron', 'count': 9}
{'_id': '7-Eleven', 'count': 8}
{'_id': "McDonald's", 'count': 7}
{'_id': 'Jack in the Box', 'count': 6}
{'_id': 'Walgreens', 'count': 6}
{'_id': 'Taco Bell', 'count': 6}
{'_id': 'Bank of America', 'count': 5}

Top 10 leisure facilities:

{'_id': 'pitch', 'count': 170}
{'_id': 'park', 'count': 7

## Notes and thoughts on pipeline queries:

* Top listed cities presents an accurate and interesting finding.  While this area is mostly located outside the official city limits of Las Vegas and in the township of Spring Valley, almost all of the city labels reflect 'Las Vegas'.  This corresponds with official addresses in the area.

* Top listed brands represent an accurate finding based on personal knowledge of the area.

* Leisure facility labels seem to be accurate based on personal knowledge of the area.  Findings indicate there are plenty of outdoor options available to residents.

* Golf course labels are accurate and seem to represent the number of golf courses in the area.  

* Healthcare facility labels seem to be accurate.  However, my knowledge of the area raises a potential red flag, as there are many more dentists and optometrists in this neighborhood.

In [92]:
print("Total number of documents in source file: ", db.vegas.find().count())

Total number of documents in source file:  317101


  print("Total number of documents in source file: ", db.vegas.find().count())


In [93]:
print("Total number of node tags: ", db.vegas.find({'type':'node'}).count())

Total number of node tags:  290183


  print("Total number of node tags: ", db.vegas.find({'type':'node'}).count())


**Node count value corresponds with initial query of node label above.**

In [94]:
print("Total number of way tags: ", db.vegas.find({'type':'way'}).count())

  print("Total number of way tags: ", db.vegas.find({'type':'way'}).count())


Total number of way tags:  26918


**Way count value corresponds with initial query of way label above.**