# Project 2 workflow

In [14]:
import xml.etree.cElementTree as ETree
import pprint
import re
import codecs
import json
import sys
import copy

%matplotlib
execfile('osm_to_json.py')
inputfile = 'santiago.xml'

Using matplotlib backend: Qt4Agg


* `structures`: list of sets, which contains associations of 'tags' grouped under a same node or way
* `problems`: list of strings, corresponding to 'tags' that if included or not corrected in the JSON document might produce mongo importing errors.

In [2]:
structures, problems = check_struct(inputfile)

967214 967214


In [3]:
tags, ele_childs = count_tags('santiago.xml')

In [4]:
ele_childs

{'meta': set(),
 'node': {'tag'},
 'note': set(),
 'relation': {'member', 'tag'},
 'way': {'nd', 'tag'}}

* `data`: list of valid JSON documents parsed from the OSM/XML file
* `problems_data`: list of strings, corresponding to 'tags' that if included or not corrected in the JSON document might produce mongo importing errors.


In [5]:
data_or, problems_data, attributes = process_map(inputfile)

967214


In [6]:
len(problems), len(problems_data)

(204, 204)

In [7]:
sorted(attributes.items(), reverse=True, key=lambda e: e[1][1])

[('timestamp', [{datetime.datetime}, 1015506]),
 ('changeset', [{float}, 1015506]),
 ('version', [{float}, 1015506]),
 ('id', [{float}, 1015506]),
 ('user', [{str, unicode}, 1015506]),
 ('uid', [{float}, 1015506]),
 ('lat', [{float}, 814484]),
 ('lon', [{float}, 814484]),
 ('addr:street', [{float, str, unicode}, 214250]),
 ('addr:housenumber', [{float, str, unicode}, 212906]),
 ('addr:interpolation', [{str}, 101575]),
 ('highway', [{str}, 84166]),
 ('name', [{float, str, unicode}, 70325]),
 ('id_origin', [{float, str}, 37359]),
 ('source', [{str, unicode}, 23297]),
 ('oneway', [{float, str}, 13579]),
 ('ref', [{float, str, unicode}, 12182]),
 ('building', [{str}, 11869]),
 ('attribution', [{str, unicode}, 11454]),
 ('ref_name', [{str, unicode}, 10692]),
 ('is_in:city', [{str, unicode}, 10692]),
 ('route_ref', [{float, str}, 10692]),
 ('ts_orientacion', [{str}, 10692]),
 ('ts_calle', [{str, unicode}, 10692]),
 ('ts_codigo', [{str}, 10691]),
 ('ts_hacia', [{float, str, unicode}, 10689]),

What do we want to check, given the problems we found out:

* Street names: we don't have the same street naming system, but some problems can be solved.
* Phone numbers: lately the phone number at Santiago were changed. We detected outdated or plainly wrong elements.
* E-mails: certainly bad emails
* Invalid types, or ambiguous values for keys
* Inconcistency in the use of 'addr:city'
* Untag ways
* Names with multiple values (;)
* addr:street with multiple values
* highway with addr:
* empty tags
* 'lanes' with two values (2;3). It must be number
* property tags (as opening_hours, traffic_calming)
* valid format (opening_hours)
* duplicated ways (same nodes)
* place_of_worship without religion
* unnamed streets
* source:lanes instead of source:name?


# Auditing street names (validity, consistence)
## Abbreviations to full
### 1st part: detection of problems

In [127]:
db = get_db('dataw')
stgo = db.santiago

In [24]:
# get first component of a street name on address fields
cursor = stgo.find(
    {'type': 'node', 'address.street':{'$exists': 1}},
    {'_id':0, 'type':1, 'address.street':1})
address_name_1 = {}
address_name = {}

first_word = re.compile(r'^[A-z|.]+\s')

for c in cursor:
    s = c['address']['street']
    if first_word.search(s):
        w = first_word.search(s).group()
        if w in address_name_1.keys():
            address_name_1[w] += 1
        else:
            address_name_1[w] = 1
    if s not in address_name.keys():
        address_name[s] = 1
    else:
        address_name[s] += 1

In [25]:
sorted(address_name_1.items(), reverse=True, key=lambda e: e[1])

[(u'Pasaje ', 51552),
 (u'Avenida ', 23800),
 (u'Los ', 7657),
 (u'El ', 5105),
 (u'San ', 4519),
 (u'Las ', 4252),
 (u'La ', 2999),
 (u'Santa ', 2453),
 (u'Calle ', 1802),
 (u'Camino ', 1409),
 (u'General ', 1235),
 (u'Manuel ', 1226),
 (u'Rio ', 1147),
 (u'Pedro ', 1110),
 (u'Juan ', 897),
 (u'Luis ', 894),
 (u'Cerro ', 890),
 (u'Jose ', 873),
 (u'Isla ', 862),
 (u'Lo ', 781),
 (u'Carlos ', 769),
 (u'Volcan ', 761),
 (u'Arturo ', 749),
 (u'Lago ', 663),
 (u'Ramon ', 633),
 (u'Psje. ', 632),
 (u'Sta ', 601),
 (u'Padre ', 575),
 (u'Nueva ', 529),
 (u'Francisco ', 521),
 (u'Miguel ', 481),
 (u'Avenida. ', 477),
 (u'Julio ', 462),
 (u'Antonio ', 443),
 (u'Alonso ', 443),
 (u'Maria ', 434),
 (u'Sargento ', 400),
 (u'Simon ', 395),
 (u'Alcalde ', 387),
 (u'Presidente ', 387),
 (u'Jorge ', 380),
 (u'Enrique ', 368),
 (u'Gran ', 346),
 (u'Martin ', 338),
 (u'Del ', 337),
 (u'Valle ', 335),
 (u'Almirante ', 315),
 (u'Salvador ', 311),
 (u'Bahia ', 305),
 (u'Isabel ', 290),
 (u'Victor ', 288),

In [11]:
# get first component of a street name on way elements
cursor = stgo.find(
    {'type': 'way', 'name':{'$exists': 1}, 'highway':{'$exists':1}},
    {'_id':0, 'type':1, 'name':1})

street_1 = {}
street = {}

first_word = re.compile(r'^[A-z|.]+\s')

for c in cursor:
    s = c['name']
    if first_word.search(s):
        w = first_word.search(s).group()
        if w in street_1.keys():
            street_1[w] += 1
        else:
            street_1[w] = 1
    if s not in street.keys():
        street[s] = 1
    else:
        street[s] += 1

In [12]:
sorted(street_1.items(), reverse=True, key=lambda e: e[1])

[(u'Pasaje ', 8449),
 (u'Avenida ', 3443),
 (u'Los ', 2170),
 (u'El ', 1517),
 (u'Las ', 1211),
 (u'Camino ', 1017),
 (u'La ', 802),
 (u'San ', 671),
 (u'Santa ', 629),
 (u'Autopista ', 551),
 (u'Acceso ', 511),
 (u'Calle ', 426),
 (u'Salida ', 371),
 (u'Cerro ', 348),
 (u'Isla ', 226),
 (u'Manuel ', 224),
 (u'Lago ', 224),
 (u'Juan ', 209),
 (u'Pedro ', 188),
 (u'General ', 177),
 (u'Carlos ', 165),
 (u'Padre ', 159),
 (u'Parque ', 153),
 (u'Nueva ', 141),
 (u'Luis ', 139),
 (u'Caletera ', 137),
 (u'Francisco ', 135),
 (u'Lo ', 134),
 (u'Estacionamiento ', 130),
 (u'Puerto ', 124),
 (u'Condominio ', 122),
 (u'Doctor ', 110),
 (u'Corredor ', 107),
 (u'Valle ', 106),
 (u'Mar ', 99),
 (u'Paseo ', 96),
 (u'Presidente ', 90),
 (u'Cruce ', 88),
 (u'Del ', 83),
 (u'Mall ', 83),
 (u'Laguna ', 80),
 (u'Jorge ', 77),
 (u'Sendero ', 77),
 (u'Arturo ', 75),
 (u'Hospital ', 74),
 (u'Almirante ', 70),
 (u'Puente ', 66),
 (u'Diego ', 66),
 (u'Julio ', 66),
 (u'Peatones ', 64),
 (u'Alcalde ', 64),
 (

### 2nd. part: fix

In [15]:
correct = {'Avda': 'Avenida',
           'Av': 'Avenida',
           'Avda.': 'Avenida'
           'Av.': 'Avenida',
           'Pje': 'Pasaje',
           'Pja': 'Pasaje',
           'Pje.': 'Pasaje'}


In [None]:
cursor = stgo.find({'$or': [{'phone' : {'$exists':1}}, {'contact:phone':{'$exists':1}}]}, {'phone':1, 'contact:phone':1})
for 

# Second OSM to JSON, with fixes

In [22]:
execfile('osm_to_json_2.py')
data_two = process_map(inputfile)

In [27]:
pipeline = [{'$match': {'type': 'way', 'highway':{'$exists': 1}}},
            {'$group': {'_id':'$highway', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}]
r = stgo.aggregate(pipeline)

In [39]:
orphan = {}
for a in address_st.keys():
    if a not in name_st.keys():
        if a not in orphan.keys():
            orphan[a] = 1
        else:
            orphan[a] += 1

In [44]:
sorted(orphan.items(), reverse=True, key=lambda e: e[0])

[(u'\xd1uble / Santa Elena', 1),
 (u'\xd1uble / Nataniel Cox', 1),
 (u'\xd1uble / Avda. Viel', 1),
 (u'\xc3\x91uble', 1),
 (u'\xc3\x91ipas', 1),
 (u'\xc3\x91andu', 1),
 (u'\xc3\x81ngel Pimentel', 1),
 (u'vitacura', 1),
 (u'uno sur', 1),
 (u'simon bolivar', 1),
 (u'san ram\xf3n', 1),
 (u'san ramon', 1),
 (u'ricardo lyon', 1),
 (u'pedro de valdivia', 1),
 (u'paseo bulnes', 1),
 (u'pasaje 1', 1),
 (u'octinos', 1),
 (u'merced', 1),
 (u'ismael Vald\xe9s', 1),
 (u'hueneay', 1),
 (u'departamental', 1),
 (u'catamarca', 1),
 (u'calle jose maria caro', 1),
 (u'barcelat2a', 1),
 (u'avenida italia', 1),
 (u'avda Cuatro', 1),
 (u'alberto lo seco', 1),
 (u'Zurich Norte', 1),
 (u'Za\xc3\xb1artu', 1),
 (u'Zapadores', 1),
 (u'Zanjon De La Aguada', 1),
 (u'Yarquen', 1),
 (u'Yapey\xc3\xba', 1),
 (u'Yanez Pinzon', 1),
 (u'Willy Brandt', 1),
 (u'Wenceslao Sanchez', 1),
 (u'Washington Espejo', 1),
 (u'Walter Martinez', 1),
 (u'Walker Martinez', 1),
 (u'Volcan Villarrica', 1),
 (u'Volcan Villarica', 1),
 (u'

In [19]:
sorted(address_st.items(), key=lambda e: e[1], reverse=True)

[(u'San Francisco', 495),
 (u'Arturo Prat', 425),
 (u'Pje 2', 336),
 (u'Avenida Sur', 317),
 (u'Avenida Vitacura', 316),
 (u'Avenida Pedro de Valdivia', 303),
 (u'Avenida Central', 300),
 (u'Pje 1', 300),
 (u'Avenida Independencia', 284),
 (u'Avenida Diego Portales', 279),
 (u'Avenida Francisco Bilbao', 275),
 (u'Simon Bolivar', 275),
 (u'Avenida San Jose de la Estrella', 258),
 (u'Avenida Las Torres', 257),
 (u'Victoria', 257),
 (u'Pje 3', 256),
 (u'Gran Avenida Jos\xc3\xa9 Miguel Carrera', 246),
 (u'Avenida La Serena', 245),
 (u'Calle 1', 242),
 (u'San Ignacio', 241),
 (u'Los Alerces', 236),
 (u'Las Hualtatas', 232),
 (u'Gabriela Mistral', 230),
 (u'Avenida Providencia', 229),
 (u'Avenida Lo Ovalle', 226),
 (u'Nonato Coo', 225),
 (u'Yungay', 219),
 (u'San Pedro', 217),
 (u'Mapocho', 215),
 (u'Avenida Santa Rosa', 214),
 (u'Santo Domingo', 213),
 (u"Avenida Libertador Bernardo O'Higgins", 210),
 (u'Avenida Apoquindo', 207),
 (u'Jose Miguel Carrera', 207),
 (u'Avenida Las Condes', 206)

## Validity: conforms to a schema

Mostly valid, in terms of the precessence of mandatory information for nodes, ways and relations. However, the validy of the 'tags' is under discussion.

* Mandatory fields (empty tags, nodes without names)
* Fields with unique values (:,/,;)
* Valid references to other fields (Reference to nodes not in the data)
* Format validity: phone numbers, postal codes, Email, internet domain


## Accuracy: conforms to gold standard

* Do the phone number, email, et.c really exist? are they all correct?

## Completeness: are all records there?

## Consistency: matches other data
* Cross-field consistency (addr:street with highway:name)

## Uniformity: same units

In [None]:
street_names = []
components = []
for e in data:
    if e['type'] == 'way':
        if 'highway' in e.keys() and 'name' in e.keys():
            street_names.append(e['name'])
            components.append(e['name'].split(' ')[0])

In [None]:
components

In [None]:
import pandas as pd
df_1 = pd.DataFrame(data={'name': street_names, 'num': 1})
df_2 = pd.DataFrame(data={'name': components, 'num': 1})

In [None]:
df_1.groupby('name').count().reset_index().sort('name')

In [None]:
db = get_db('dataw')

In [144]:
phone = []
cursor = stgo.find({'$or': [{'phone': {'$exists': 1}}, {'contact:phone': {'$exists': 1}}]},
                   {'_id':0, 'phone':1, 'contact:phone':1})
for c in cursor:
    phone.append(c.values()[0])


In [145]:
phone

[u'+56 2 22098966',
 u'+56 2 29222690',
 u'+56 2 26313212',
 u'+56 2 22040952',
 u'+56 2 26313058',
 u'+56 2 24262000',
 u'+56 2 27999900',
 u'FIXME',
 u'FIXME',
 u'+56 2 24110600',
 u'+56 2 23421200',
 u'+56 2 23421200',
 u'+56 2 26313034',
 u'+56 2 25981000',
 u'+56 2 22283640',
 u'+56 2 22317308',
 u'+56 2 22342795',
 u'+56 2 22054592',
 u'+56 2 23342607',
 u'+56 2 22344688',
 u'+56 2 22170856',
 u'+56 2 23811654',
 u'+56 2 22286693',
 u'+56 2 22425777',
 u'+56 2 23416213',
 u'+56 2 22312173',
 u'+56 2 22748078',
 u'+56 2 23330795',
 u'+56 2 24707400',
 u'+56 2 22311393',
 u'+56 2 22429711',
 u'+56 2 25834727',
 u'+56 2 22337988',
 u'+56 2 22362144',
 u'+56 2 22363333',
 u'+56 2 22357086',
 u'FIXME',
 u'+56 2 22331897',
 u'FIXME',
 u'+56 2 23371900',
 u'+56 2 22220778',
 u'+56 2 22164833',
 u'+56 2 22180110',
 u'+56 2 22180105',
 u'+56 2 29538331',
 u'+56 2 22180111',
 u'+56 2 22189640',
 u'+56 2 22180116',
 u'+56 2 22180114',
 u'+56 2 22478484',
 u'+56 2 22478130',
 u'+56 2 2218011

In [146]:
correct = {}
nofix = 0
problemc = re.compile(r'\D')
for p in phone:
    ptemp = p.split(';')
    fix =  False
    for i in range(len(ptemp)):
        ptemp[i] = ptemp[i].replace(' ','').replace('(','').replace(')','').replace('-','').replace('+','')
        if problemc.search(ptemp[i]):
            #print p, ptemp, problemc.search(ptemp[i]).group()
            correct[p] = 'FIXME'
            fix = True
            break
        if len(ptemp[i]) > 11:
            #print p, ptemp
            correct[p] = 'FIXME'
            fix = True
            break
        if len(ptemp[i]) == 11  and ptemp[i].startswith('5622'):
            ptemp[i] = "+56 2 2" + ptemp[i][4:]
            nofix += 1
            continue
        if len(ptemp[i]) == 10  and ptemp[i].startswith('562'):
            ptemp[i] = "+56 2 2" + ptemp[i][3:]
            continue
        if len(ptemp[i]) == 11  and ptemp[i].startswith('5602'):
            ptemp[i] = "+56 2 2" + ptemp[i][4:]
            continue
        if len(ptemp[i]) == 12  and ptemp[i].startswith('56022'):
            ptemp[i] = "+56 2 2" + ptemp[i][5:]
            continue
        if len(ptemp[i]) == 11  and ptemp[i].startswith('569'):
            ptemp[i] = "+56 9" + ptemp[i][3] + ' ' + ptemp[i][4:] 
            continue
        if len(ptemp[i]) == 9 and ptemp[i].startswith('02'):
            ptemp[i] = "+56 2 2" + ptemp[i][2:]
            continue
        if len(ptemp[i]) == 10 and ptemp[i].startswith('022'):
            ptemp[i] = "+56 2 2" + ptemp[i][3:]
            continue
        if len(ptemp[i]) == 8 and ptemp[i].startswith('2'):
            ptemp[i] = "+56 2 2" + ptemp[i][1:]
            continue
        if len(ptemp[i]) == 9 and ptemp[i].startswith('22'):
            ptemp[i] = "+56 2 2" + ptemp[i][2:]
            continue
            
        if len(ptemp[i]) == 7:
            ptemp[i] = "+56 2 2" + ptemp[i]
            continue
        
        if len(ptemp[i]) == 10 and (ptemp[i].startswith('800') or ptemp[i].startswith('600')):
            ptemp[i] = ptemp[i][0:3] + ' ' + ptemp[i][3:6] + ' ' + ptemp[i][6:]
            continue
            
        correct[p] = 'FIXME'
        fix = True
        break
        
    if fix:
        continue
    if len(ptemp) == 1:
        ptemp = ptemp[0]

    correct[p] = ptemp

SyntaxError: invalid syntax (<ipython-input-146-d805fc8f2864>, line 33)

In [143]:
phone

[u'+56 2 2098966',
 u'+5629222690',
 u'+56226313212',
 u'+5622040952',
 u'+56 2 631 3058',
 u'+56 2 24262000',
 u'+56 2 27999900',
 u'+56 72 922200',
 u'+56 72 922200',
 u'+5624110600',
 u'3421200',
 u'3421200',
 u'+562631 3034',
 u'+5625981000',
 u'+562 2228 3640',
 u'+56222317308',
 u'+56222342795',
 u'+562 205 4592',
 u'+56 2 23342607',
 u'+56 2 234 4688',
 u'+5622170856',
 u'+5623811654',
 u'+56222286693',
 u'56 2 2425777',
 u'+5623416213',
 u'+56222312173',
 u'56 2 2748078',
 u'+56223330795',
 u'+56 224707400',
 u'+56 2 22311393',
 u'56 2 242 9711',
 u'+5625834727',
 u'+5622337988',
 u'+56 2 2362144',
 u'+5622363333',
 u'+5622357086',
 u'+5699340908',
 u'+56 2233 1897',
 u'0056 2 22073107',
 u'+5623371900',
 u'+56222220778',
 u'56 2 2164833',
 u'+5622180130',
 u'+5622180110',
 u'+5622180105',
 u'+5629538331',
 u'+5622180111',
 u'+5622189640',
 u'+5622180116',
 u'+5622180114',
 u'+5622478484',
 u'+5622478130',
 u'+5622180118',
 u'+5625844438',
 u'+56 2 229 08 100',
 u'+56 2 2437117

In [135]:
stgo.update({'contact:phone': u'+56 2 2386082'}, {'$set': {'contact:phone': u'+56 2 22386082'}})

{u'n': 0, u'nModified': 0, u'ok': 1, 'updatedExisting': False}

In [None]:
stgo.find()

In [93]:
len('222089442')

9

In [136]:
?stgo.update

In [140]:
for k, v in correct.iteritems():
    stgo.update(
        {'contact:phone': k},
        {'$set': {'contact:phone': v}},
        multi=True)
    stgo.update(
        {'phone': k},
        {'$set': {'phone': v}},
        multi=True)

In [None]:
number_in_name = re.compile(r'\b[A-z]+[0-9]')

In [None]:
for n in street_names:
    if number_in_name.search(n):
        print n
            

In [None]:
db = get_db('dataw')

In [None]:
c = db.santiago.find({'name': {'$regex': 'Avd'}})

In [None]:
for r in c:
    print r['name']

In [78]:
len('56998207704')

11

In [103]:
'56228842752'[3:5]

'28'