# 2. Data Enrichment and Cleanup

## Enrich the data with geonames lat/long

In [109]:
import time
import json
import pandas as pd
import requests

Read CSV and create a dataframe

In [110]:
# Load the merged cities CSV into the working dataframe used by the rest of the notebook.
df1 = pd.read_csv("cities_data_merged_2019-04-21.csv")

In [111]:
# Preview the first rows to check which columns were loaded.
df1.head()

Unnamed: 0.1,Unnamed: 0,id,name,count
0,0,2643743,London,468
1,1,2988507,Paris,293
2,2,3128760,Barcelona,176
3,3,2759794,Amsterdam,172
4,4,3117735,Madrid,171


Drop the first column (`Unnamed: 0`) and rename the remaining ones

In [112]:
# Remove the 'Unnamed: 0' column that came in with the CSV (mutates df1 in place).
df1.drop(columns=['Unnamed: 0'], inplace=True)

In [113]:
# Assign the final column names positionally; this requires that exactly
# three columns remain at this point.
df1.columns = ['geonamesId', 'name', 'count']
# Preview the renamed dataframe.
df1.head()

Unnamed: 0,geonamesId,name,count
0,2643743,London,468
1,2988507,Paris,293
2,3128760,Barcelona,176
3,2759794,Amsterdam,172
4,3117735,Madrid,171


Query geonames
* http://www.geonames.org/export/web-services.html
* http://www.geonames.org/export/credits.html
> note: hourly limit of 1000 credits

In [134]:
# NOTE(review): the account name is hardcoded; consider reading it from an
# environment variable before sharing this notebook.
geonameuser = "mpasin"
url = "http://api.geonames.org/hierarchyJSON?geonameId=%s&username=%s"
# eg http://api.geonames.org/hierarchyJSON?geonameId=2643743&username=mpasin
def open_geonames(_id, timeout=10):
    """Fetch the GeoNames hierarchy JSON for a place id.

    Parameters
    ----------
    _id : int or str
        GeoNames identifier, interpolated into the request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would block the enrichment loop indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    ValueError
        If the response body is not valid JSON.
    """
    r = requests.get(url % (str(_id), geonameuser), timeout=timeout)
    return r.json()

def geonames_details(_id):
    """Look up coordinates and country for a GeoNames place id.

    Fetches the hierarchy via ``open_geonames`` and scans the returned
    ``geonames`` list for the entry whose ``geonameId`` equals ``_id``.

    Parameters
    ----------
    _id : int or str
        GeoNames identifier; coerced with ``int()`` so string ids (e.g. JSON
        object keys) compare equal to the numeric ids in the response.

    Returns
    -------
    list or None
        ``[lat, lng, countryCode, countryName]`` of the matching entry, or
        ``None`` when the request fails, the response cannot be parsed, or
        no entry matches. ``None`` (rather than a truthy placeholder string)
        lets callers test the result with a plain ``if``.
    """
    _id = int(_id) # make sure it's a number
    data = None
    try:
        # Fetch inside the try so a network failure is reported like any
        # other lookup failure instead of aborting the caller's loop.
        data = open_geonames(_id)
        for x in data['geonames']:
            if x['geonameId'] == _id:
                return [x['lat'], x['lng'], x['countryCode'], x['countryName']]
    except Exception as e:
        print(e)
        print("Error parsing JSON: %s" % str(data))
    return None
    

In [147]:
# Example: look up the details for a single GeoNames id.
geonames_details(3372783)

Goal: retrieve the geonames details and store them in a dict that we can serialize to JSON later on. 
We use a list of dictionaries so as to preserve the original order.

Init the JSON file storing the data from geonames.

In [136]:
# Build one {geonamesId: None} placeholder per place, in dataframe order,
# and write the skeleton to the JSON cache file.
temp = {'data': [{geoname_id: None} for geoname_id in df1['geonamesId']]}
with open('geonames_temp.json', 'w') as cache_file:
    json.dump(temp, cache_file)

Read the data back in for iteration

In [145]:
# Reload the cached lookups; after the JSON round trip every place id key is a string.
with open('geonames_temp.json', 'r') as cache_file:
    tempfiledata = json.load(cache_file)

Iterate and enrich the dict with the geonames details. 

If geonames API fails, the value for a place ID remains null. 

The iteration only processes places whose value is still null, so we can rerun this cell as many times as needed to get data for all places. 

In [146]:
# Fill in every place that has no usable details yet. The cache file is
# written in a `finally` block so results fetched before an interruption
# (network error, Ctrl-C, rate limit) are kept and the cell can be rerun.
counter = 0
try:
    for counter, ddict in enumerate(tempfiledata['data']):
        _id = next(iter(ddict)) # get first element
        current = ddict[_id]
        if isinstance(current, list) and current:
            print("skipping", counter)
            continue
        print(counter, "...")
        res = geonames_details(_id)
        # Only a real [lat, lng, countryCode, countryName] list counts as a
        # success; None or a placeholder string leaves the entry unresolved.
        if isinstance(res, list):
            ddict[_id] = res
        time.sleep(1)  # stay well below the geonames rate limit
finally:
    # now save to file
    with open('geonames_temp.json', 'w') as outfile:
        json.dump(tempfiledata, outfile)
print("DONE: data saved")

skipping 0
skipping 1
skipping 2
skipping 3
skipping 4
skipping 5
skipping 6
skipping 7
skipping 8
skipping 9
skipping 10
skipping 11
skipping 12
skipping 13
skipping 14
skipping 15
skipping 16
skipping 17
skipping 18
skipping 19
skipping 20
skipping 21
skipping 22
skipping 23
skipping 24
skipping 25
skipping 26
skipping 27
skipping 28
skipping 29
skipping 30
skipping 31
skipping 32
skipping 33
skipping 34
skipping 35
skipping 36
skipping 37
skipping 38
skipping 39
skipping 40
skipping 41
skipping 42
skipping 43
skipping 44
skipping 45
skipping 46
skipping 47
skipping 48
skipping 49
skipping 50
skipping 51
skipping 52
skipping 53
skipping 54
skipping 55
skipping 56
skipping 57
skipping 58
skipping 59
skipping 60
skipping 61
skipping 62
skipping 63
skipping 64
skipping 65
skipping 66
skipping 67
skipping 68
skipping 69
skipping 70
skipping 71
skipping 72
skipping 73
skipping 74
skipping 75
skipping 76
skipping 77
skipping 78
skipping 79
skipping 80
skipping 81
skipping 82
skipping 83
sk

Finally, add the geonames data to the original dframe

In [149]:
# Turn the cached lookups into four parallel lists, one value per place.
# Places without usable details get empty strings (not one-element lists),
# so the cells are written to CSV as blanks rather than "['']".
lats, longs, countryCodes, countryNames = [], [], [], []
for x in tempfiledata['data']:
    _id = next(iter(x)) # get first element
    details = x[_id]
    # Anything other than a [lat, lng, countryCode, countryName] list is
    # treated as "no data" for this place.
    if not (isinstance(details, list) and len(details) >= 4):
        details = ["", "", "", ""]
    lats.append(details[0])
    longs.append(details[1])
    countryCodes.append(details[2])
    countryNames.append(details[3])

In [154]:
# Sanity check: each new column list must have exactly one entry per dataframe row.
all(len(values) == len(df1) for values in (lats, longs, countryCodes, countryNames))

True

In [152]:
# Attach the geonames lists as new columns (in this order) and write the
# enriched table to disk.
new_columns = {
    'lat': lats,
    'lng': longs,
    'countryCode': countryCodes,
    'countryName': countryNames,
}
for col_name, col_values in new_columns.items():
    df1[col_name] = col_values
df1.to_csv('cities_data_enriched_2019-04-22.csv')

## Remove non-European cities from table

In [185]:
# Reload the enriched table from disk so this section can be run on its own.
df1 = pd.read_csv("cities_data_enriched_2019-04-22.csv")

# ISO 3166-1 alpha-2 codes of the countries to keep.
# NOTE(review): this is a geographic-Europe list rather than EU membership
# (e.g. CH, NO and RU are included while CY is absent) - confirm the intent.
europe_countries = [
    "AD", "AL", "AT", "AX", "BA", "BE", "BG", "BY", "CH", "CZ", "DE", "DK",
    "EE", "ES", "FI", "FO", "FR", "GB", "GG", "GI", "GR", "HR", "HU", "IE",
    "IM", "IS", "IT", "JE", "LI", "LT", "LU", "LV", "MC", "MD", "ME", "MK",
    "MT", "NL", "NO", "PL", "PT", "RO", "RS", "RU", "SE", "SI", "SJ", "SK",
    "SM", "UA", "VA",
]

# Only the last expression of a cell is displayed, so the discarded
# mid-cell preview was removed; the summary statistics are the cell output.
df1.describe()

Unnamed: 0.1,Unnamed: 0,geonamesId,count
count,1466.0,1466.0,1466.0
mean,732.5,2993313.0,10.591405
std,423.34206,1453465.0,24.304303
min,0.0,101628.0,1.0
25%,366.25,2343778.0,1.0
50%,732.5,2922342.0,3.0
75%,1098.75,3455342.0,9.0
max,1465.0,10630000.0,468.0


In [186]:
# Collect the row positions whose country code is not in the keep-list,
# logging each one, then remove those rows from df1 in place.
to_drop = []

for position, code in enumerate(df1['countryCode']):
    if code in europe_countries:
        continue
    print(position, code)
    to_drop.append(position)

# Positions are translated to index labels before dropping.
df1.drop(index=df1.index[to_drop], inplace=True)
df1.describe()

14 US
23 US
30 AU
35 JP
36 CA
37 US
38 US
43 CN
44 US
47 CA
52 AU
54 KR
60 US
63 BR
66 US
67 US
68 US
71 SG
74 US
75 TW
85 US
90 IL
93 US
95 CA
97 AU
100 US
108 US
112 US
115 US
117 US
124 US
128 MY
129 CA
130 US
132 US
134 JP
138 US
140 US
142 BR
146 HK
151 US
154 US
156 US
158 US
159 IL
160 US
161 US
169 BR
170 US
171 US
172 AU
178 US
179 US
180 AR
181 US
183 JP
184 US
185 TW
186 US
187 CN
189 US
193 US
194 JP
195 TN
197 US
199 US
219 US
220 JP
222 TW
230 US
236 US
238 US
239 US
250 CA
251 US
254 US
257 AU
258 JP
259 US
262 NZ
268 CN
269 AU
270 CN
273 JP
274 US
275 US
280 CL
281 US
282 IR
283 IN
284 ZA
287 JP
292 KR
296 US
298 US
301 CA
302 US
307 US
308 JP
311 JP
312 US
316 JP
317 US
318 CA
320 US
322 CN
333 CN
334 US
338 US
340 US
346 JP
351 LB
352 US
353 JP
354 US
355 US
356 US
358 CN
359 US
360 US
362 BR
364 CA
370 IR
371 CA
375 US
379 US
380 US
381 US
386 CA
388 IL
389 CO
397 IL
398 AU
400 AU
401 KR
404 US
405 CN
409 US
413 US
414 AU
415 GE
416 AU
417 CA
418 US
419 NZ
420 US
422

Unnamed: 0.1,Unnamed: 0,geonamesId,count
count,808.0,808.0,808.0
mean,712.892327,2627860.0,13.493812
std,454.439758,813793.4,30.379793
min,0.0,251833.0,1.0
25%,294.75,2641306.0,1.0
50%,685.5,2825690.0,3.0
75%,1173.25,3031248.0,13.0
max,1464.0,6543862.0,468.0


In [188]:
# Remove the 'Unnamed: 0' column picked up when the enriched CSV was read
# back, then write the final table.
df1.drop(columns=['Unnamed: 0'], inplace=True)
df1.to_csv('cities_data_final_2019-04-22.csv')