# OpenStreetMap数据探索

# 1.节点检查

In [1]:
import xml.etree.ElementTree as ET
import pprint
import re

OSM_FILE_NAME = "map.xml"

In [2]:
NODE_TYPES = set()

def get_node_types(file_name="sample.xml"):
    context = ET.iterparse(file_name, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event=='start':
            #print('start----'+elem.tag)
            NODE_TYPES.add(elem.tag)


In [3]:
get_node_types(OSM_FILE_NAME)
print(NODE_TYPES)

{'nd', 'remark', 'tag', 'way', 'member', 'bounds', 'relation', 'node', 'note', 'meta'}


In [2]:
CHECK_NODES = ["relation", "node", "way"]

way_attribs=set()
way_tag_attribs=set()
node_attribs=set()
node_tag_attribs=set()
relation_attribs=set()
relation_tag_attribs=set()
relation_member_attribs=set()

def get_basic_attributes(file_name="sample.xml"):
    context = ET.iterparse(file_name, events=('start',))
    _, root = next(context)
    for event, elem in context:
        if event=='start' and elem.tag=='node':
            for attrib in elem.attrib.keys():
                node_attribs.add(attrib)
            for node_tag in elem.iter("tag"):
                for attrib in node_tag.attrib.keys():
                    node_tag_attribs.add(attrib)
        elif event=='start' and elem.tag=='way':
            for attrib in elem.attrib.keys():
                way_attribs.add(attrib)
            for way_tag in elem.iter("tag"):
                for attrib in way_tag.attrib.keys():
                    way_tag_attribs.add(attrib)
        elif event=='start' and elem.tag=='relation':
            for attrib in elem.attrib.keys():
                relation_attribs.add(attrib)
            for relation_tag in elem.iter("tag"):
                for attrib in relation_tag.attrib.keys():
                    relation_tag_attribs.add(attrib)
            for relation_member in elem.iter("member"):
                for attrib in relation_member.attrib.keys():
                    relation_member_attribs.add(attrib)

            

In [3]:
get_basic_attributes("map.xml")

In [6]:
#输出各类标签的 attribute 类型
print("way_attribs:=" + str(way_attribs))
print("way_tag_attribs:="+ str(way_tag_attribs))
print("node_attribs:="+ str(node_attribs))
print("node_tag_attribs:="+ str(node_tag_attribs))
print("relation_attribs:="+ str(relation_attribs))
print("relation_tag_attribs:="+ str(relation_tag_attribs))
print("relation_member_attribs:="+ str(relation_member_attribs))

way_attribs:={'version', 'timestamp', 'id', 'changeset', 'uid', 'user'}
way_tag_attribs:={'k', 'v'}
node_attribs:={'version', 'lat', 'timestamp', 'id', 'changeset', 'uid', 'lon', 'user'}
node_tag_attribs:={'k', 'v'}
relation_attribs:={'version', 'timestamp', 'id', 'changeset', 'uid', 'user'}
relation_tag_attribs:={'k', 'v'}
relation_member_attribs:={'ref', 'type', 'role'}


## 检查node,way,relation中的TAG标签的K值

In [19]:
WAY_TAG_K=set()
NODE_TAG_K=set()
REL_TAG_K=set()

def get_tag_k(file_name="sample.xml"):
    context = ET.iterparse(file_name, events=('start',))
    _, root = next(context)
    for event, elem in context:
        if event=='start' and elem.tag=='way':
            for way_tag in elem.iter("tag"):
                WAY_TAG_K.add(way_tag.attrib['k'])
                
        elif event=='start' and elem.tag=='node':
            for node_tag in elem.iter("tag"):
                NODE_TAG_K.add(node_tag.attrib['k'])
                
        elif event=='start' and elem.tag=='relation':
            for rel_tag in elem.iter("tag"):
                REL_TAG_K.add(rel_tag.attrib['k'])

In [21]:
get_tag_k("map.xml")

#### 检查每一个K值出现的次数

In [28]:
way_tag_k_counter = {k_name:0 for k_name in WAY_TAG_K}
node_tag_k_counter = {k_name:0 for k_name in NODE_TAG_K}
rel_tag_k_counter = {k_name:0 for k_name in REL_TAG_K}

In [30]:
def count_K_field(file_name="sample.xml"):
    context = ET.iterparse(file_name, events=('start',))
    _, root = next(context)
    for event, elem in context:
        if event=='start' and elem.tag=='way':
            for way_tag in elem.iter("tag"):
                way_tag_k_counter[way_tag.attrib['k']] +=1
                
        elif event=='start' and elem.tag=='node':
            for node_tag in elem.iter("tag"):
                node_tag_k_counter[node_tag.attrib['k']] +=1
                
        elif event=='start' and elem.tag=='relation':
            for rel_tag in elem.iter("tag"):
                rel_tag_k_counter[rel_tag.attrib['k']] +=1

In [31]:
count_K_field("map.xml")

In [34]:
print(node_tag_k_counter, way_tag_k_counter, rel_tag_k_counter)

{'name:lt': 2, 'stars': 6, 'sport': 34, 'tower:type': 18, 'fuel:octane_98': 2, 'seats': 9, 'is_in:country_code': 7, 'toilets:wheelchair': 1, 'highway': 5133, 'diplomatic': 1, 'service:bicycle:repair': 2, 'artist:wikidata': 2, 'artist_name': 5, 'gns:UNI': 12, 'layer': 50, 'wikipedia': 17, 'cuisine': 426, 'place': 222, 'seamark:light:character': 2, 'heritage': 36, 'barrier': 980, 'addr:floor': 2, 'seamark:mooring:category': 1, 'source': 6716, 'gns:DSG': 13, 'ele': 7, 'fee': 51, 'junction': 1, 'is_in:iso_3166_2': 1, 'smoking': 41, 'food': 4, 'internet_access:ssid': 1, 'wifi': 1, 'motorcycle': 13, 'surveillance': 2, 'exit_to:en': 9, 'entrance': 77, 'name:fa': 1, 'healthcare': 2, 'drink': 2, 'ref:vi': 1, 'railway': 1811, 'rooms': 1, 'fuel:diesel': 2, 'height': 2714, 'organic': 2, 'vehicle': 1, 'wpt_symbol': 3, 'notes': 3, 'network': 223, 'crossing_ref': 82, 'name:hr': 1, 'artwork_type': 28, 'trolleybus': 46, 'local_ref': 21, 'floor': 2, 'surface': 1, 'vending': 2, 'source:ref': 10, 'addr:fl

#### 检查TAG中K值是否是其他K值的前缀

In [70]:
def check_one_prefix_of_set(k, src_set, result_list):
    for k_name in src_set:
        regex = re.compile(r'^' + k + ':.*$')
        m = re.search(regex, k_name)
        if m:
            result_list.append(k_name)

def check_prefix(k_set):
    result_counter = {k_name:[] for k_name in k_set}
    
    for (k_name, k_list) in result_counter.items():
        check_one_prefix_of_set(k_name, k_set, k_list)
    
    keys = list(result_counter.keys())
    keys.sort()
    return [(k, result_counter[k]) for k in keys]

In [71]:
#Reletion_tag_k的前缀情况
for (k, k_list) in check_prefix(REL_TAG_K):
    if len(k_list)>=1:
        print(k+"==>" + str(k_list))

addr:street==>['addr:street:en']
building==>['building:part', 'building:min_level', 'building:levels', 'building:colour']
from==>['from:en']
heritage==>['heritage:ref', 'heritage:operator']
is_in==>['is_in:country_code']
name==>['name:zh_pinyin', 'name:zh-traditional', 'name:pl', 'name:zh-classical', 'name:ia', 'name:ja', 'name:zh-hant', 'name:pt', 'name:ru', 'name:cs', 'name:eo', 'name:nl', 'name:ko', 'name:fr', 'name:ca', 'name:de', 'name:vi', 'name:en', 'name:zh-simplified', 'name:hu', 'name:kn', 'name:io', 'name:zh']
old_name==>['old_name:fr', 'old_name:en']
operator==>['operator:en']
public_transport==>['public_transport:version']
ref==>['ref:en']
restriction==>['restriction:motorcycle', 'restriction:bicycle']
short_name==>['short_name:en']
source==>['source:name']
to==>['to:en']


In [72]:
#Node_tag_k的前缀情况
for (k, k_list) in check_prefix(NODE_TAG_K):
    if len(k_list)>=1:
        print(k+"==>" + str(k_list))

addr:street==>['addr:street:alley', 'addr:street:en']
alt_name==>['alt_name:en', 'alt_name:vi']
artist_name==>['artist_name:en', 'artist_name:zh_pinyin']
building==>['building:levels']
entrance==>['entrance:main']
exit_to==>['exit_to:en']
healthcare==>['healthcare:speciality']
heritage==>['heritage:ref', 'heritage:operator']
internet_access==>['internet_access:ssid', 'internet_access:fee']
memorial==>['memorial:type']
motorcycle==>['motorcycle:type']
name==>['name:lt', 'name:fa', 'name:hr', 'name:he', 'name:jbo', 'name:zh_pinyin', 'name:sk', 'name:zh-hant-CN', 'name:pl', 'name:ro', 'name:tr', 'name:ia', 'name:ja', 'name:el', 'name:zh-CN', 'name:af', 'name:ku', 'name:sv', 'name:zh-hant', 'name:pt', 'name:ru', 'name:cs', 'name:th', 'name:eo', 'name:ko', 'name:fr', 'name:ca', 'name:cn', 'name:de', 'name:it', 'name:sr', 'name:es', 'name:vi', 'name:ar', 'name:uk', 'name:en', 'name:hu', 'name:kn', 'name:da', 'name:zh', 'name:bg', 'name:io']
note==>['note:plack', 'note:de', 'note:en']
operato

In [73]:
#Way_tag_k的前缀情况
for (k, k_list) in check_prefix(WAY_TAG_K):
    if len(k_list)>=1:
        print(k+"==>" + str(k_list))

addr:city==>['addr:city:en']
addr:housenumber==>['addr:housenumber:en']
addr:street==>['addr:street:en']
aerodrome==>['aerodrome:type']
alt_name==>['alt_name:zh_pinyin', 'alt_name:zh-hant', 'alt_name:zh', 'alt_name:en']
bicycle==>['bicycle:right']
bridge==>['bridge:structure']
building==>['building:level', 'building:levels', 'building:colour', 'building:min_level', 'building:type', 'building:height', 'building:part', 'building:material', 'building:levels:underground', 'building:use']
building:levels==>['building:levels:underground']
communication==>['communication:radio', 'communication:television']
construction==>['construction:leisure']
cycleway==>['cycleway:left', 'cycleway:right']
destination==>['destination:ref:to', 'destination:street', 'destination:lanes', 'destination:to', 'destination:ref', 'destination:street:to', 'destination:backward']
destination:ref==>['destination:ref:to']
destination:street==>['destination:street:to']
disused==>['disused:aeroway']
handrail==>['handrail:

# 2.审计元素与字符

# 3.探索用户

In [13]:
BASIC_NODES = ["relation", "node", "way"]

def get_user(element):
    return element.get("uid"), element.get("user")

def get_user_of_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag in BASIC_NODES:
            user_id, user_name = get_user(element)
            users.add(user_id + ':' + user_name)
    return users

In [14]:
users_set = get_user_of_map(OSM_FILE_NAME)

In [17]:
print(len(users_set))
print(users_set)

1538
{'5226660:Hamed Azimi', '4038854:Chip packet', '2719768:j_c_schwartz', '831660:JJS123', '5555475:Jonathan Mercier', '92274:adjuva', '3872017:Duc D', '4651232:Stevenhuxin', '3649021:Branos', '2433219:Julien Nguyen', '193291:MapMakinMeyers', '6765615:大乐0', '4190070:Fringson', '4140016:Richy_B', '1803172:zenorm', '2348833:lexrkt', '580542:miklas', '5825765:gsw343', '67507:Wolle98', '84069:AfricaTwinTreiber', '4040769:ZFWZFW', '94660:m_angyal', '345866:hob', '5798659:Tommy0933', '364:Edward', '26726:lonvia', '1976819:forsaken628', '624466:panhoong', '2427054:saic', '1189602:Jack007', '55381:spi010', '1219875:Theodin', '1829683:Luis36995', '7406:jynus', '4902267:Will Rynearson', '1877821:greendyj', '3805176:Gareth Nicholson', '17497:katpatuka', '934310:Deruid', '4590461:姚依辰', '6624298:家在花桥', '852124:CyrilTLS', '6447498:CENTRALHUB', '1967105:louisliang', '4645989:lightxu', '4702432:binjian', '2314966:winsky', '632378:3yoda', '1167947:Beinuo', '6603006:家涞坊酒店公寓', '6853928:Swordie', '15228

# 4.1 完善街道

In [18]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

OSM_FILE_NAME = "map.xml"

In [19]:
#匹配街道名称最后独立字符串（称谓）
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

# UPDATE THIS VARIABLE
mapping = { "St": "Street",
            "St.": "Street",
            "Rd": "Road",
            "Rd.": "Road",
            "Ave": "Avenue"
            }

In [None]:
def audit_street_type(street_types, street_name):
    m = re.search(street_type_re, street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    
    osm_file.close()
    return street_types


def update_name(name, mapping):
    for from_name, to_name in mapping.iteritems():
        #print '(.*)(' + from_name + ')$'
        name_regex = re.compile(r'(.*)(' + from_name + ')$', re.IGNORECASE)
        match_result = re.search(name_regex, name)

        if match_result:
            name = match_result.group(1) + to_name
            break;
        

    return name

# 5.生成CSV文件

In [33]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

OSM_PATH = "map.xml"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"
REL_PATH = "relations.csv"
REL_TAGS_PATH = "rel_tags.csv"
REL_MEMBERS_PATH = "rel_members.csv"

NODE_TAG_COLON = re.compile(r'^(.*?):(.*)')
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')

PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


SCHEMA = schema.schema

In [35]:
# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']

WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

REL_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
REL_TAGS_FIELDS = ['id', 'key', 'value', 'type']
REL_MEMBER_FIELDS = ['id', 'ref', 'type', 'role', 'position']


"""Clean and shape node or way XML element to Python dict"""
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS, rel_attr_fields=REL_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    node_attribs = {}
    
    way_attribs = {}
    way_nodes = []
    
    rel_attribs = {}
    rel_members = []
    
    tags = []  # Handle secondary tags the same way for both node and way elements

    # 检查 node 节点
    if element.tag == 'node':
        for node_attr_field in node_attr_fields:
            node_attribs[node_attr_field] = element.get(node_attr_field)
        for tag in element.iter("tag"):
            key_string = tag.get('k')
            if re.search(PROBLEMCHARS, key_string):
                continue
            if key_string.find(':')<0:
                current_tag = {}
                current_tag['id'] = element.get('id')
                current_tag['value'] = tag.get('v')
                current_tag['key'] = key_string
                current_tag['type'] = "regular"
                tags.append(current_tag)
            else:
                current_tag = {}
                m = re.search(NODE_TAG_COLON, key_string)
                current_tag['id'] = element.get('id')
                current_tag['value'] = tag.get('v')
                current_tag['key'] = m.group(2)
                current_tag['type'] = m.group(1)
                tags.append(current_tag)
        return {'node': node_attribs, 'node_tags': tags}
    #检查 way 节点    
    elif element.tag == 'way':
        for way_attr_field in way_attr_fields:
            way_attribs[way_attr_field] = element.get(way_attr_field)
        for tag in element.iter("tag"):
            key_string = tag.get('k')
            if re.search(PROBLEMCHARS, key_string):
                continue
            if key_string.find(':')<0:
                current_tag = {}
                current_tag['id'] = element.get('id')
                current_tag['value'] = tag.get('v')
                current_tag['key'] = key_string
                current_tag['type'] = "regular"
                tags.append(current_tag)
            else:
                current_tag = {}
                m = re.search(NODE_TAG_COLON, key_string)
                current_tag['id'] = element.get('id')
                current_tag['value'] = tag.get('v')
                current_tag['key'] = m.group(2)
                current_tag['type'] = m.group(1)
                tags.append(current_tag)
        cnt=0
        for node in element.iter("nd"):
            way_node = {}
            way_node['id'] = element.get('id')
            way_node['node_id'] = node.get('ref')
            way_node['position'] = cnt
            cnt+=1
            way_nodes.append(way_node)
            
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
    
    #检查 relation 节点
    elif element.tag == 'relation':
        #relation节点的属性
        for rel_attr_field in rel_attr_fields:
            rel_attribs[rel_attr_field] = element.get(rel_attr_field)
        #迭代tag子节点
        for tag in element.iter('tag'):
            key_string = tag.get('k')
            if re.search(PROBLEMCHARS, key_string):
                continue
            if key_string.find(':')<0:
                current_tag = {}
                current_tag['id'] = element.get('id')
                current_tag['value'] = tag.get('v')
                current_tag['key'] = key_string
                current_tag['type'] = "regular"
                tags.append(current_tag)
            else:
                current_tag = {}
                m = re.search(NODE_TAG_COLON, key_string)
                current_tag['id'] = element.get('id')
                current_tag['value'] = tag.get('v')
                current_tag['key'] = m.group(2)
                current_tag['type'] = m.group(1)
                tags.append(current_tag)
        #迭代member子节点
        cnt=0
        for member in element.iter('member'):
            member_node = {}
            member_node['id'] = element.get('id')
            member_node['type'] = member.get('type')
            member_node['ref'] = member.get('ref')
            member_node['role'] = member.get('role')
            member_node['position'] = cnt
            cnt+=1
            rel_members.append(member_node)
        return {'relation': rel_attribs, 'rel_tags': tags, 'rel_members': rel_members}        

In [36]:
# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        #field, errors = next(validator.errors.items())
        #message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        #error_string = pprint.pformat(errors)
        #raise Exception(message_string.format(field, error_string))
        raise Exception(validator._errors)


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            #k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
            k: (v) for k, v in row.items()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

In [37]:
# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w', encoding='utf-8') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w', encoding='utf-8') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w', encoding='utf-8') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w', encoding='utf-8') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w', encoding='utf-8') as way_tags_file, \
         codecs.open(REL_PATH, 'w', encoding='utf-8') as relations_file, \
         codecs.open(REL_TAGS_PATH, 'w', encoding='utf-8') as rel_tags_file, \
         codecs.open(REL_MEMBERS_PATH, 'w', encoding='utf-8') as rel_members_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
        
        relations_writer = UnicodeDictWriter(relations_file, REL_FIELDS)
        rel_tags_writer = UnicodeDictWriter(rel_tags_file, REL_TAGS_FIELDS)
        rel_members_writer = UnicodeDictWriter(rel_members_file, REL_MEMBER_FIELDS)
        
        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()
        
        relations_writer.writeheader()
        rel_tags_writer.writeheader()
        rel_members_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way', 'relation')):
        #for element in get_element(file_in, tags=('node')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])
                elif element.tag == 'relation':
                    relations_writer.writerow(el['relation'])
                    rel_members_writer.writerows(el['rel_members'])
                    rel_tags_writer.writerows(el['rel_tags'])


In [27]:
process_map("map.xml", validate=True)

# 后续处理函数

In [1]:
import re

#用于替换value字段内的奇怪字符
VALUE_REPLACE_CHARS = '[📮]'
def reject_bad_chars_of_value(src_string, badchars=VALUE_REPLACE_CHARS):
    result = re.sub(badchars, "", src_string)
    return result
#print(reject_bad_chars("邮筒📮"))

#用于去除错误邮编
REGULAR_POSTCODE =re.compile(r'^[0-9]{6}$')
def is_postcode(src_postcode, regular_postcode=REGULAR_POSTCODE):
    postcode = re.search(regular_postcode, src_postcode)
    if postcode:
        return True
    else:
        return False
#print(is_postcode('12345 '))

#用于规范电话的格式
#如果校验通过，返回正确格式的电话，否则返回空字符串
REGULER_PHONE = re.compile(r'^(86)?(021|21)?([0-9]{8}|[0-9]{11}|400[0-9]{7}|[0-9]{5})$')
def audit_phone(src_phone, regular_phone=REGULER_PHONE):
    new_phone = re.sub(r'[+ \-()]', '', src_phone)
    phone = re.search(regular_phone, new_phone)
    if phone:
        phone_num = phone.group(3)
        phone_type = len(phone_num)
        #固话
        if phone_type==8:
            return '+86-021-' + phone_num
        #手机
        elif phone_type==11:
            return '+86-' + phone_num
        #400电话
        elif phone_type==10:
            return '+86-' + phone_num
        #全国通用电话
        elif phone_type==5:
            return '+86-' + phone_num
        else:
            return ''
    else:
        return ''
#test_phone = ['4008123123','862122163900','+86 6361 2898','021-63914848, 021-63522222','+86 21 38809988','2164312091','86-21-50559888','+2147483647','+862164712821','+18 13621675140','02162883030','+86-21-5160-7888','+86 138 1609 3747','(021) 3356-3996','021-63779282','+86 (0)21-68778787']
#for phone in test_phone:
#    print(audit_phone(phone))

# 处理csv文件并生成新文件

In [59]:
import csv

def audit_tag_files(file_name):
    file_data = []
    
    with open(file_name, "r", encoding='utf-8') as f:
        reader= csv.DictReader(f)
        
        for row in reader:
            #去除坏字符
            row['value'] = reject_bad_chars_of_value(row['value'])
            #规范电话格式
            if row['key']=='phone':
                tmp_phone = audit_phone(row['value'])
                if tmp_phone!='':
                    row['value'] = tmp_phone
                    file_data.append(row)
                else:
                    print('错误的电话格式' + str(row))
            #去除错误的邮编
            elif row['key']=='postcode':
                if is_postcode(row['value']):
                    file_data.append(row)
                else:
                    print('错误的邮编格式' + str(row))
            #其他情况默认保留
            else:
                file_data.append(row)
                
    return file_data

In [71]:
FIELD_NAMES=['id','key','value','type']
def write_csv_file(file_name, file_data, field_names=FIELD_NAMES):
    with open(file_name, "w", encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, delimiter=",", fieldnames=field_names)
        writer.writeheader()
        for row in file_data:
            writer.writerow(row)
        print('完成: ' + file_name + "文件写入.")

In [72]:
def check_and_write_new_csv_file(src_file_name):
    file_data = audit_tag_files(src_file_name)
    new_file_name = src_file_name.split('.')[0] + '_new.csv'
    write_csv_file(new_file_name, file_data)

In [75]:
check_and_write_new_csv_file('nodes_tags.csv')
check_and_write_new_csv_file('ways_tags.csv')
check_and_write_new_csv_file('rel_tags.csv')

错误的电话格式{'value': '021-63914848, 021-63522222', 'key': 'phone', 'type': 'regular', 'id': '477661623'}
错误的电话格式{'value': '+18 13621675140', 'key': 'phone', 'type': 'regular', 'id': '2345419578'}
错误的电话格式{'value': '+86 8621 5118 1222', 'key': 'phone', 'type': 'regular', 'id': '3609090494'}
错误的邮编格式{'value': '2000080', 'key': 'postcode', 'type': 'addr', 'id': '4364315493'}
完成: nodes_tags_new.csv文件写入.
错误的邮编格式{'value': '201315 上海', 'key': 'postcode', 'type': 'addr', 'id': '148014167'}
错误的邮编格式{'value': '201315 上海', 'key': 'postcode', 'type': 'addr', 'id': '148014201'}
错误的电话格式{'value': '8008103088', 'key': 'phone', 'type': 'regular', 'id': '159787864'}
错误的邮编格式{'value': '2000080', 'key': 'postcode', 'type': 'addr', 'id': '293504473'}
错误的电话格式{'value': '+86 21 64874095*208', 'key': 'phone', 'type': 'regular', 'id': '293862126'}
错误的邮编格式{'value': '20032', 'key': 'postcode', 'type': 'addr', 'id': '307449542'}
错误的邮编格式{'value': '20032', 'key': 'postcode', 'type': 'addr', 'id': '307455604'}
错误的电话格式{'value

# ERROR SHEET

#### 语句：
super(UnicodeDictWriter, self).writerow({
    k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
})

Error: 'dict' object has no attribute 'iteritems'

原因：pyton3与2的语法区别


解决办法: python3中使用：row.items()

#### 语句：
def writerow(self, row):
    super(UnicodeDictWriter, self).writerow({
        k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.items()         
    }) 

Error: isinstance() arg 2 must be a type or tuple of types

原因：python3语法不同

解决办法：使用 str 代替 unicode

In [None]:
文件打开指定utf-8编码，否则读取带特殊人名（非26字母字符）时会出现gnk编码读取错误