### How to extract last character from names

We need to read in a file of Chinese names and, for each name, extract just the last character.  It's tricky because the file contains UTF-8 encoded byte strings, so each string needs to be decoded to unicode before you can inspect individual characters.

In [32]:
name_data = open('chinese_names.clean', 'rb').readlines()
sample = name_data[0].strip()  # need to remove the \n from each entry'd line-ending
print sample.decode('utf-8')[-1]

市


### Analyze the last characters of the Chinese names

So the question now is, what are the unique last characters in these Chinese names?  These *should* represent the suffixes for things like "city" or "state" etc.

In [33]:
from collections import defaultdict

name_dict = defaultdict(int)

for name in name_data:
    name = name.strip()
    last_char = name.decode('utf-8')[-1]
    name_dict[last_char] += 1
    
for char, count in name_dict.items():
    print char, count

省 23
市 656
划 5
盟 3
旗 52
区 1244
县 1495
州 30


Here are the translations for each one of those:

```
省 23   <-- "province"
市 656  <-- "city"
划 5    <-- "draw" (omit)
盟 3    <-- "league"
旗 52   <-- "flag" (omit)
区 1244 <-- "district", "area", or "prefecture"
县 1495 <-- "county"
州 30   <-- "state"
```

In [34]:
import csv

# remove the 2 entries we can omit
name_dict.pop(u'划')
name_dict.pop(u'旗')

for char, count in name_dict.items():
    print char, count
    


省 23
市 656
盟 3
区 1244
县 1495
州 30


### Look for names in FDB that end with one of these characters

In [35]:

# Collect FDB city names that end in one of the suffix characters.
# A set gives a direct membership test instead of a loop over every character.
test_chars = set(name_dict.keys())

# city id -> full name, for every city whose name ends in a suffix character
check_city = {}

# open FDB source files (the with-block closes the file when we are done)
with open('fdb_city_names.csv', 'rb') as city_data:
    city_reader = csv.reader(city_data, delimiter='|')
    # Skip the first row (presumably a header -- confirm); the default keeps
    # an empty file from raising StopIteration.
    next(city_reader, None)

    for city in city_reader:
        city_id = city[0]
        name = city[1].decode('utf-8')
        # Only names longer than two characters are candidates.  Checking the
        # length first also keeps an empty name from raising IndexError.
        if len(name) > 2 and name[-1] in test_chars:
            check_city[city_id] = name


In [36]:
# Collect county names that end in one of the suffix characters.
# A set gives a direct membership test instead of a loop over every character.
test_chars = set(name_dict.keys())

# county id -> full name, for every county whose name ends in a suffix character
check_county = {}

# Open the county names export.  NOTE(review): this comment used to say
# "pg_dev source files" but the file name says FDB -- confirm the source.
with open('fdb_county_names.csv', 'rb') as county_data:
    county_reader = csv.reader(county_data, delimiter='|')
    # Skip the first row (presumably a header -- confirm); the default keeps
    # an empty file from raising StopIteration.
    next(county_reader, None)

    for county in county_reader:
        county_id = county[0]
        name = county[1].decode('utf-8')
        # Only names longer than two characters are candidates.  Checking the
        # length first also keeps an empty name from raising IndexError.
        if len(name) > 2 and name[-1] in test_chars:
            check_county[county_id] = name

### Check for presence of existing short-form synonyms

Compare "check_city" and "check_county" to the contents of CitySynonyms and CountySynonyms.  Look for whether an entry exists in the names set that matches the "check" name minus the last character.

In [37]:
# open the FDB files
CitySynonyms_data = open('CitySynonyms.csv', 'rb')
CitySynonyms = csv.reader(CitySynonyms_data, delimiter='|')
next(CitySynonyms)

check_CitySynonyms = defaultdict(list)

for city in CitySynonyms:
    [id, name, locale, canonical, map_code] = city
    name = name.decode('utf-8')
    if map_code == '1':
        check_CitySynonyms[id].append(name)

print check_CitySynonyms['456607']


[u'Beijing', u'P\xe9kin', u'\u5317\u4eac', u'\ubca0\uc774\uc9d5', u'\u5317\u4eac', u'Beijing', u'Peking']


In [38]:
# open the FDB files
CountySynonyms_data = open('CountySynonyms.csv', 'rb')
CountySynonyms = csv.reader(CountySynonyms_data, delimiter='|')
next(CountySynonyms)

check_CountySynonyms = defaultdict(list)

for county in CountySynonyms:
    [id, name, locale, canonical, map_code] = county
    name = name.decode('utf-8')
    if map_code == '1':
        check_CountySynonyms[id].append(name)

print check_CountySynonyms['1073884']

[u'Beijing', u'P\xe9kin', u'\u5317\u4eac', u'\ubca0\uc774\uc9d5', u'\u5317\u4eac\u5e02', u'Beijing', u'Peking', u'\u5317\u4eac']


### Remove existing synonym-features from processing list

Since we already have short-form names for these, remove them from the list of features that we need to process.

In [39]:
print check_county['456057']

ok_cities = set()
ok_counties = set()

for id, name in check_city.iteritems():
    last_char = name[-1]
    short_name = name[:-1]
    for known_name in check_CitySynonyms[id]:
        if known_name == short_name:
            ok_cities.add(id)
            
for id, name in check_county.iteritems():
    last_char = name[-1]
    short_name = name[:-1]
    for known_name in check_CountySynonyms[id]:
        if known_name == short_name:
            ok_counties.add(id)            

for id in ok_cities:
    check_city.pop(id)

for id in ok_counties:
    check_county.pop(id)
    


石嘴山市


### What's the damage?

How many features are going to need new synonyms?

In [40]:
print len(check_city.keys())
print len(check_county.keys())

77
352


### Create test files

The next 2 cells were used to generate the FDB test files.  These files were loaded into pg_dev, where parent_ids and state names were then attached to them.

In [41]:
# Flip to True to generate the file.  It stays off by default because the file
# is opened in append mode, so every run would add the same rows again.
WRITE_CITY_ADDS = False

if WRITE_CITY_ADDS:
    with open('./city_adds.csv', 'a') as city_out:
        for (city_id, name) in check_city.iteritems():
            # One "id|short name" row per city; the short name is the full
            # name minus its last character.
            pretty_city = u"%s|%s\n" % (city_id, name[:-1])
            city_out.write(pretty_city.encode('utf8'))

In [42]:
# Flip to True to generate the file.  It stays off by default because the file
# is opened in append mode, so every run would add the same rows again.
WRITE_COUNTY_ADDS = False

if WRITE_COUNTY_ADDS:
    with open('./county_adds.csv', 'a') as county_out:
        for (county_id, name) in check_county.iteritems():
            # One "id|short name" row per county; the short name is the full
            # name minus its last character.
            pretty_county = u"%s|%s\n" % (county_id, name[:-1])
            county_out.write(pretty_county.encode('utf8'))


### Create SQL insert statements

Copied and pasted the output from this cell into a SQL file that I ran against pg_dev afterwards.

In [43]:
def create_name_insert(name_rec, feat_type, tfs_id='694569', map_codes=('1', '2', '3')):
    """Print SQL INSERT statements that add a short-form synonym to `names`.

    One statement is printed per entry in `map_codes`.

    name_rec  -- (feature id, full name) pair; the synonym is the full name
                 minus its last character
    feat_type -- value written to the `class` column
    tfs_id    -- value written to the `tfs_id` column
    map_codes -- map_code values to emit a row for

    Returns None; the statements go to stdout.
    """
    (feat_id, name) = name_rec
    #insert into names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) 
    #VALUES (1093602, 'Tiemenguan City', 10, False, 1, null, 'synonym', 'CHN', '694570');

    # Drop the suffix character, then double any single quote so the value
    # stays a valid SQL string literal.
    short_name = name[:-1].replace("'", "''")
    template = ('INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) '
                'VALUES (%s, \'%s\', %s, False, %s, null, \'synonym\', \'CHN\', \'%s\');')
    for map_code in map_codes:
        print(template % (feat_id, short_name, feat_type, map_code, tfs_id))
    
# Cities are emitted with class '10' first, then counties with class '2'.
for feat_class, features in (('10', check_city), ('2', check_county)):
    for rec in features.iteritems():
        create_name_insert(rec, feat_class)
    

INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) VALUES (1093428, '贺州', 10, False, 1, null, 'synonym', 'CHN', '694569');
INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) VALUES (1093428, '贺州', 10, False, 2, null, 'synonym', 'CHN', '694569');
INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) VALUES (1093428, '贺州', 10, False, 3, null, 'synonym', 'CHN', '694569');
INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) VALUES (1093429, '呼伦贝尔', 10, False, 1, null, 'synonym', 'CHN', '694569');
INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) VALUES (1093429, '呼伦贝尔', 10, False, 2, null, 'synonym', 'CHN', '694569');
INSERT INTO names (id, name, class, "default", map_code, language_code, type, iso3, tfs_id) VALUES (1093429, '呼伦贝尔', 10, False, 3, null, 'synonym', 'CHN', '694569');
INSERT INT