#Project 3: Data Wrangling Open Streetmaps Data
##1. Problems encountered in the map
####I decided to go for my hometown of Oslo, Norway, and naturally there was a bit of trouble with the non-English characters found there. I made two translation mappings, one for unicode and one for normal strings, for both small and capital æ,ø,å.

### Non-English characters
#### The non-English characters in the dataset have different encodings, both unicode and standard. This called for two different approaches to translating these to the standard English 
#### Other than this I didn't really find any particular problems, both post codes and house numbers seem to be within an acceptabel range (The post codes are all from the correct area).
##2. Overview of the data

###File sizes
#### oslo_norway.osm .. 1,09 GB
#### oslo_norwa.osm.json .. 3,014 KB

###Some selected data
#### The number of documents in the set is 14,754, 14,077 of which are nodes and the remaining 677 are ways.
db.tranby.find().count()
list(db.tranby.aggregate(nodes_and_ways))

nodes_and_ways = [{"\$group":{"_id":"\$type","count":{"\$sum":1 \}\}\},{"$sort":{"count":-1}}]

#### There are 48 distinct contributors, the most active being 'vibrog', who's commited 5,310 (36%)of the 14,754 documents.
len(db.tranby.distinct('created.user'))
list(db.tranby.aggregate(top_user))

top_user = [{"\$group":{"_id":"\$created.user","count":{"\$sum":1\}\}\},{"\$sort":{"count":-1}},{"\$limit":1}]
                           
#### There are 92 distinct street addresses in the set.
len(db.tranby.distinct('address.street'))

##3. Other ideas about the dataset
###Addresses per postal code
####Though it would be interesting to see the number of addresses per postal code.
####Results were as follows:
####3409: 928
####3408: 647
####3420: 562
####3406: 173
####3403: 20
#### using the aggregation
addresses_per_postal_code = [{"\$match":{"address":{"\$exists": True\}\}\},{"\$group":{"_id":"\$address.postcode","count":{"\$sum":1}}},{"\$sort":{"count":-1}\}\]
##4. Conclusion

In [9]:
import xml.etree.ElementTree as ET
import pprint
import re
import codecs
import json
import time
from pymongo import MongoClient
file_path = 'C:\Users\hakon.tromborg\Data Analyst Nanodegree\Data\\oslo_norway.osm'

In [10]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

#json->MongoDB can't handle the Norwegian letters, so I'm mapping them to their official [a-z] equivalents for both unicode
#and normal strings
letter_map = {u"Å":'Aa',
              u"Ø":'Oe',
              u"Æ":'Ae',
              u"å":"aa",
              u"ø":"oe",
              u"æ":"ae"}
unicode_map = {u"\xc5":'Aa',
              u"\xd8":'Oe',
              u"\xc6":'Ae',
              u"\xe5":"aa",
              u"\xf8":"oe",
              u"\xe6":"ae"}

In [12]:
#From chapter 6
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    #data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                #data.append(el)
                fo.write(json.dumps(el) + "\n")
                element.clear()
        #fo.write(json.dumps(data) + "\n")
    return data

In [13]:
def shape_element(element):
    node = {}
    
    node['created'] = {}
    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag
        if element.tag == "way":
            node['node_refs'] = []
            for it in element.iter("nd"):
                node['node_refs'].append(it.attrib['ref'])

        for a in element.attrib:
            if a in CREATED:
                node['created'][a] = element.attrib[a]
            elif a == 'pos' or 'lat':
                #Ensure 'pos' only shows up in documents with a latitude/longitude
                if 'pos' not in node:
                    node['pos'] = ["0","0"]
                if a == "lat":
                    node['pos'][0]=(float(element.attrib[a]))
                elif a == "lon":
                    node['pos'][1]=(float(element.attrib[a]))
            else:
                node[a] = element.attrib[a]
                

        for it in element.iter("tag"):
            #Skip the iteration and drop the element if problemchars are discovered
            e = problemchars.search(it.attrib['k'])
            if e:
                continue
                
            elif it.attrib['k'][:5] == "addr:":
                if 'address' not in node:
                    node['address'] = {}
                c = it.attrib['k'].split(':')
                
                #If c is 2, there's exactly one : in the string, which is the format we're looking for
                if len(c) == 2:
                    translated_string = translate_string(it.attrib['v'])
                    #Translating Norwegian letters proved more difficult than expected 
                    #because the set contains both unicode and normal strings
                    node['address'][it.attrib['k'][5:]] = translated_string
            else:
                node[it.attrib['k']] = it.attrib['v']
        return node
    else:
        return None
    
def translate_string(str):
    if type(str) is unicode:
        translated_string = translate_unicode(str)
        return translated_string
    else:
        translated_string = translate_utf(str)
        return translated_string
    
def translate_unicode(addr):
    for l in unicode_map:
        if l in list(addr):
            #using if l in letter_map crashes with unicode characters because illegal decodings are attempted
            addr = [c.replace(l, unicode_map[l]) for c in list(addr)]
            addr = ''.join(addr)
    return addr
def translate_utf(addr):
    for l in letter_map:
        if l in addr:
            addr = addr.replace(l,letter_map[l])
    return addr

In [14]:
def process_file():
    data = process_map(file_path, False)

In [None]:
t0 = time.time()
process_file()
print time.time()-t0

In [219]:
def insert_data(data, db):
    db.tranby.insert(data)

In [276]:
client = MongoClient("mongodb://localhost:27017")
db = client.tranby
#Clear the database so it doesn't add multiple records
db.tranby.remove()

with open(file_path + '.json') as f:
        data = json.loads(f.read())
        insert_data(data, db)
nodes_and_ways = [{"$group":{"_id":"$type",
                      "count":{"$sum":1}}},
            {"$sort":{"count":-1}}]

top_user = [{"$group":{"_id":"$created.user",
                            "count":{"$sum":1}}},
                           {"$sort":{"count":-1}},
                           {"$limit":1}]
addresses_per_postal_code = [{"$match":{"address":{"$exists": True}}},
                            {"$group":{"_id":"$address.postcode",
                            "count":{"$sum":1}}},
                           {"$sort":{"count":-1}}]

print "Number of records:"
print db.tranby.find().count()

print "Number of nodes and ways:"
pprint.pprint(list(db.tranby.aggregate(nodes_and_ways)))

print "Number of unique users:"
print len(db.tranby.distinct('created.user'))

print "Top contributing user:"
print list(db.tranby.aggregate(top_user))

print "Distinct street addresses in set:"
print len(db.tranby.distinct('address.street'))

print "Number of addresses per postal code: "
print list(db.tranby.aggregate(addresses_per_postal_code))

Number of records:
14754
Number of nodes and ways:
[{u'_id': u'node', u'count': 14077}, {u'_id': u'way', u'count': 677}]
Number of unique users:
48
Top contributing user:
[{u'count': 5310, u'_id': u'vibrog'}]
Distinct street addresses in set:
92
Number of addresses per postal code: 
[{u'count': 928, u'_id': u'3409'}, {u'count': 647, u'_id': u'3408'}, {u'count': 562, u'_id': u'3420'}, {u'count': 173, u'_id': u'3406'}, {u'count': 20, u'_id': u'3403'}]


In [291]:
import pandas as pd
times = db.tranby.distinct('created.timestamp')
for x,t in enumerate(times):
    times[x] = times[x][:10]
#print sorted(times)
d = pd.DataFrame(sorted(times))
#print d
d.hist(bins = 10)

ValueError: num must be 0 <= num <= 0, not 1