In [1]:
# prepare
!pip install geonamescache
!pip install unidecode

import re
import json
import pandas as pd
import numpy as np
from unidecode import unidecode
from geonamescache import GeonamesCache




In [2]:
# read headlines: data/headlines.txt holds one headline per line.
# The with-block guarantees the file is closed even if reading raises.
with open("data/headlines.txt", 'r') as headlines_file:
    # Strip only the trailing newline; any other whitespace is kept as-is.
    headlines = [line.rstrip('\n') for line in headlines_file]
count = len(headlines)  # number of headlines read
print("read %d headlines."%count)
print("sample: ")
print(headlines[0:4])

read 650 headlines.
sample: 
['Zika Outbreak Hits Miami', 'Could Zika Reach New York City?', 'First Case of Zika in Miami Beach', 'Mystery Virus Spreads in Recife, Brazil']


In [3]:
# read cities
# Create the geonames cache and show its type and its attribute names.
geonc = GeonamesCache()
for geonc_info in (type(geonc), dir(geonc)):
    print(geonc_info)

# Fetch the city records and report how many there are, plus the first two
# (key, record) pairs pretty-printed as JSON.
cities = geonc.get_cities()
city_count = len(cities)
print("read %d cities."%city_count)
print("sample: ")
city_sample = list(cities.items())[0:2]
print(json.dumps(city_sample,indent=3))


<class 'geonamescache.GeonamesCache'>
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_load_data', 'cities', 'cities_by_names', 'cities_items', 'continents', 'countries', 'datadir', 'get_cities', 'get_cities_by_name', 'get_continents', 'get_countries', 'get_countries_by_names', 'get_dataset_by_key', 'get_us_counties', 'get_us_states', 'get_us_states_by_names', 'us_counties', 'us_states']
read 24336 cities.
sample: 
[
   [
      "3041563",
      {
         "geonameid": 3041563,
         "name": "Andorra la Vella",
         "latitude": 42.50779,
         "longitude": 1.52109,
         "countrycode": "AD",
         "population": 20430,
         "timezone": "Europe/Andorra",
         "admin1code":

In [4]:
# read countries
# Fetch the country records from the geonames cache created earlier, then
# report how many there are and pretty-print the first two (key, record) pairs.
countries = geonc.get_countries()
country_count = len(countries)
print("read %d countries."%country_count)
print("sample: ")
country_sample = list(countries.items())[0:2]
print(json.dumps(country_sample,indent=3))

read 252 countries.
sample: 
[
   [
      "AD",
      {
         "geonameid": 3041565,
         "name": "Andorra",
         "iso": "AD",
         "iso3": "AND",
         "isonumeric": 20,
         "fips": "AN",
         "continentcode": "EU",
         "capital": "Andorra la Vella",
         "areakm2": 468,
         "population": 84000,
         "tld": ".ad",
         "currencycode": "EUR",
         "currencyname": "Euro",
         "phone": "376",
         "postalcoderegex": "^(?:AD)*(\\d{3})$",
         "languages": "ca",
         "neighbours": "ES,FR"
      }
   ],
   [
      "AE",
      {
         "geonameid": 290557,
         "name": "United Arab Emirates",
         "iso": "AE",
         "iso3": "ARE",
         "isonumeric": 784,
         "fips": "AE",
         "continentcode": "AS",
         "capital": "Abu Dhabi",
         "areakm2": 82880,
         "population": 4975593,
         "tld": ".ae",
         "currencycode": "AED",
         "currencyname": "Dirham",
         "phone": "9

In [5]:
# create some auxiliary lists
# Keys, names and unidecoded (ASCII-transliterated) names are all taken in the
# iteration order of the same mapping, so position i in each list of a group
# refers to the same record.
countryKeys = list(countries.keys())
countryNames = [countryRecord["name"] for countryRecord in countries.values()]
countryNamesUnidecoded = list(map(unidecode, countryNames))

cityKeys = list(cities.keys())
cityNames = [cityRecord["name"] for cityRecord in cities.values()]
cityNamesUnidecoded = list(map(unidecode, cityNames))

# Show the first ten entries of every list as a quick sanity check.
for auxiliaryList in (cityKeys, cityNames, cityNamesUnidecoded,
                      countryKeys, countryNames, countryNamesUnidecoded):
    print(auxiliaryList[0:10])


['3041563', '290594', '291074', '291580', '291696', '292223', '292231', '292239', '292672', '292688']
['Andorra la Vella', 'Umm Al Quwain City', 'Ras Al Khaimah City', 'Zayed City', 'Khawr Fakkān', 'Dubai', 'Dibba Al-Fujairah', 'Dibba Al-Hisn', 'Sharjah', 'Ar Ruways']
['Andorra la Vella', 'Umm Al Quwain City', 'Ras Al Khaimah City', 'Zayed City', 'Khawr Fakkan', 'Dubai', 'Dibba Al-Fujairah', 'Dibba Al-Hisn', 'Sharjah', 'Ar Ruways']
['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR']
['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda', 'Anguilla', 'Albania', 'Armenia', 'Angola', 'Antarctica', 'Argentina']
['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda', 'Anguilla', 'Albania', 'Armenia', 'Angola', 'Antarctica', 'Argentina']


In [6]:
# create data frame
# For every headline, find the first city (in the order of cityNamesUnidecoded)
# whose unidecoded name occurs as a whole word in the headline, and record the
# headline together with that city and the city's country name.
# NOTE(review): the first match wins, so a shorter name listed earlier can
# shadow a longer one (e.g. "Miami" vs "Miami Beach") - confirm this is intended.

# Compile one whole-word pattern per distinct city name, once, up front.
# re.escape keeps characters such as "." in a city name from being treated as
# regex syntax (unescaped, the "." in "St. Louis" would match any character).
regexMap = {}  # unidecoded city name -> compiled whole-word pattern
for cityName in cityNamesUnidecoded:
    if cityName not in regexMap:
        regexMap[cityName] = re.compile(r'\b' + re.escape(cityName) + r'\b')

# cityKeys and cityNamesUnidecoded are parallel lists, so zip pairs each name
# with the key of its record in `cities`.
cityMatchers = [
    (cityKey, cityName, regexMap[cityName])
    for cityKey, cityName in zip(cityKeys, cityNamesUnidecoded)
]

# Collect plain records and build the frame once afterwards; growing a
# DataFrame with pd.concat inside the loop is quadratic, and unnamed rows
# would not land in the declared columns.
headlineRecords = []
for headline in headlines:
    for cityKey, cityName, regexCity in cityMatchers:
        if regexCity.search(headline):
            foundCity = cityName
            foundCountryCode = cities[cityKey]["countrycode"]
            foundCountry = countries[foundCountryCode]["name"]
            headlineRecords.append(
                {"headline": headline, "cities": foundCity, "countries": foundCountry}
            )
            break
    else:
        # The inner loop finished without a break: no city name matched.
        print("found no city for headline %s"%headline)

# Rows are in headline order; headlines without a matching city are omitted.
headlines_df = pd.DataFrame(headlineRecords, columns=["headline", "cities", "countries"])
print(headlines_df.to_string())

found no city for headline Louisiana Zika cases up to 26
found no city for headline Zika infects pregnant woman in Cebu
found no city for headline Spanish Flu Sighted in Antigua
found no city for headline Carnival under threat in Rio De Janeiro due to Zika outbreak
found no city for headline Zika case reported in Oton
found no city for headline Hillsborough uses innovative trap against Zika 20 minutes ago
found no city for headline Maka City Experiences Influenza Outbreak
found no city for headline More Zika patients reported in Mcallen
found no city for headline West Nile Virus Outbreak in Saint Johns
found no city for headline More people in Mclean are infected with Hepatitis A every year
found no city for headline Malaria Exposure in Sussex
found no city for headline Greenwich Establishes Zika Task Force
found no city for headline Will West Nile Virus vaccine help Parsons?
found no city for headline Yulee takes a hit from Spreading Sickness
found no city for headline The Spread of C