# Data Analysis workbook DAND Part III


This notebook introduces the analysis performed on a selected map fetched from OpenStreetMap via a Python script.

The selected area has been described in the file area.txt.

The notebook is structured in two parts:

* The audit of the dataset
* Test and visualization of data cleaning

Most of the code has been adapted from the different DAND lessons (i.e. Data Auditing and Cleaning).

## Data wrangling

### Data Importation

In [14]:
# Import chunk library + Dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import unicodecsv
import seaborn as sns

# Directory containing the five CSV files loaded below. Change this single
# constant to point the notebook at another copy of the data.
DATA_DIR = "~/PycharmProjects/DAND_OpenStreeMap/subset"

# Each table is read into its own DataFrame; the variable names match the
# file names so later cells can refer to them directly.
ways_nodes = pd.read_csv(DATA_DIR + "/ways_nodes.csv")
ways       = pd.read_csv(DATA_DIR + "/ways.csv")
ways_tags  = pd.read_csv(DATA_DIR + "/ways_tags.csv")
nodes      = pd.read_csv(DATA_DIR + "/nodes.csv")
nodes_tags = pd.read_csv(DATA_DIR + "/nodes_tags.csv")


#### Way Nodes :

In [8]:
# Show the column labels and the row count of the ways_nodes table.
print "columns names:",ways_nodes.columns
print "Number of observation:", len(ways_nodes)

Index([u'id', u'node_id', u'position'], dtype='object')
12242


In [9]:
# Preview the first rows of the ways_nodes table.
ways_nodes.head()

Unnamed: 0,id,node_id,position
0,4803849,34691601,0
1,4803849,1801000290,1
2,4803849,1164867374,2
3,4803849,3397409847,3
4,4803849,1937688909,4


#### Ways :

In [16]:
# Show the column labels and the row count of the ways table, then preview
# its first rows.
print "columns names:",ways.columns
print "Number of observation:", len(ways)

ways.head()

Index([u'id', u'user', u'uid', u'version', u'changeset', u'timestamp'], dtype='object')
1410


#### Way Tags :

In [19]:
# Show the column labels and the row count of the ways_tags table, then
# preview its first rows.
print "columns names:", ways_tags.columns
print "Number of observation:", len(ways_tags)

ways_tags.head()

Index([u'id', u'key', u'value', u'type'], dtype='object')
5745


In [25]:
### Nodes :

# Show the column labels and the row count of the nodes table, then preview
# its first rows.
print "columns names:", nodes.columns
print "Number of observation:", len(nodes)

nodes.head()

columns names: Index([u'id', u'lat', u'lon', u'user', u'uid', u'version', u'changeset',
       u'timestamp'],
      dtype='object')
Number of observation: 7769


Unnamed: 0,id,lat,lon,user,uid,version,changeset,timestamp
0,19388646,47.545806,7.586781,Nzara,481380,7,13437936,2012-10-10T08:49:13Z
1,19388648,47.544706,7.585863,mjessen_mdv,1227175,14,35193074,2015-11-09T14:14:19Z
2,30828152,47.542891,7.59425,MENTZ_TU,2385132,27,34952835,2015-10-29T16:18:00Z
3,34691600,47.549277,7.589421,Mammi71,668573,5,38486898,2016-04-11T22:14:14Z
4,34691601,47.548528,7.591281,Sebra1177,2557264,8,28075319,2015-01-11T22:45:04Z


In [None]:
"""
Auditing of the file 

"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Path of the OSM XML file that the audit reads.
OSMFILE = "example.osm"

# Matches the final run of non-whitespace characters in a string (starting at
# a word boundary, optionally followed by a period); used to pull the street
# type off the end of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are accepted as-is; names ending in one of these are not
# reported by the audit.
expected = [
    "Street",
    "Avenue",
    "Boulevard",
    "Drive",
    "Court",
    "Place",
    "Square",
    "Lane",
    "Road",
    "Trail",
    "Parkway",
    "Commons",
]

# Abbreviated street type -> full spelling, applied when cleaning names.
mapping = {
    "St": "Street",
    "St.": "Street",
    "Ave": "Avenue",
    "Rd.": "Road",
}


def audit_street_type(street_types, street_name):
    """Record a street name under its street type when that type is unexpected.

    The street type is whatever ``street_type_re`` matches in ``street_name``.
    Nothing is recorded when the pattern does not match or when the matched
    type is listed in ``expected``.

    Args:
        street_types: mapping from street type to a set of street names; it
            is updated in place (indexing a missing key must yield a set).
        street_name: the street name to inspect.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return

    suffix = match.group()
    if suffix in expected:
        return

    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether a tag element holds a street address.

    Args:
        elem: an element whose ``attrib`` mapping contains a ``k`` entry.

    Returns:
        True when the ``k`` attribute equals ``"addr:street"``, else False.

    Raises:
        KeyError: if the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"


def audit(osmfile):
    """Collect street names whose street type is not an expected one.

    Parses the OSM XML file at ``osmfile`` and, for every ``node`` and ``way``
    element, hands the ``v`` value of each ``addr:street`` tag found inside it
    to ``audit_street_type``.

    Args:
        osmfile: path of the OSM XML file to read.

    Returns:
        A ``defaultdict(set)`` as filled in by ``audit_street_type``.
    """
    street_types = defaultdict(set)
    # The with-block closes the file even when parsing raises. Binary mode
    # leaves decoding to the XML parser.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: when a "start" event fires, the children of
        # the element are not guaranteed to have been parsed yet, so its tags
        # could be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    """Expand an abbreviated street type at the end of a street name.

    Only the final whitespace-separated word of ``name`` is looked up in
    ``mapping``. Words earlier in the name are never treated as a street type
    and are never rewritten, even if they are keys of ``mapping`` or contain
    the same text as the last word.

    Args:
        name: the street name, e.g. ``"West Lexington St."``.
        mapping: dict from abbreviated street type to its replacement.

    Returns:
        ``name`` with its last word replaced by ``mapping[last word]`` when
        the last word is a key of ``mapping``; otherwise ``name`` unchanged.
    """
    words = name.split()
    if not words:
        # Empty or whitespace-only name: nothing to replace.
        return name

    last_word = words[-1]
    if last_word not in mapping:
        return name

    # Rebuild the string around the last word only, so the rest of the name
    # and any surrounding whitespace stay exactly as they were.
    body = name.rstrip()
    trailing = name[len(body):]
    return body[:-len(last_word)] + mapping[last_word] + trailing
    



def test():
    st_types = audit(OSMFILE)
    assert len(st_types) == 3
    pprint.pprint(dict(st_types))

    for st_type, ways in st_types.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name
            if name == "West Lexington St.":
                assert better_name == "West Lexington Street"
            if name == "Baldwin Rd.":
                assert better_name == "Baldwin Road"


# Run the audit self-check when this code executes as the main module.
if __name__ == '__main__':
    test()