In [5]:
import re
import pandas as pd
import geonamescache as gnc
from unidecode import unidecode


gc = gnc.GeonamesCache()
cities = [unidecode(city["name"]) for city in gc.get_cities().values()]
countries = [unidecode(country["name"]) for country in gc.get_countries().values()]

def read_file(filename):
    headlines = []
    with open(filename, 'r') as file:
        for line in file.readlines():
            headlines.append(unidecode(line).rstrip("\n"))
            
    return headlines

def parse_data(headlines):
    rows = {}
    
    # For each city name, search each headline for that name and store results in a dictionary
    for city in cities:
        regexp = re.compile(".*({})( |-|'|\?|,|$)".format(city))
        for line in headlines:
            if re.match(regexp, line):
                rows[line]={"city":city}

    # For each country name, search each headline for that name and store results in a dictionary
    for country in countries:
        regexp = re.compile(".*({})( |-|'|\?|,|$)".format(country))
        for line in headlines:
            if re.match(regexp, line):
                row = rows.get(line)
                if row:
                    row["country"]=country
                else:
                    rows[line]={"country":country}

    # move the contents of the dictionary into a DataFrame and display it
    data = []
    for row in rows.keys():
        data.append([row, rows[row].get("country", "NaN"), rows[row].get("city", "NaN")])
    print(data)
    
if __name__ == '__main__':
    filename = "./data/headlines.txt"
    parse_data(read_file(filename))

[['Authorities are Worried about the Spread of Norovirus in Dubai', 'NaN', 'Dubai'], ['West Nile Virus Symptoms Spread all over Yerevan', 'NaN', 'Yerevan'], ['Tuberculosis Hits Luanda', 'NaN', 'Luanda'], ['Outbreak of Zika in Santa Elena', 'NaN', 'Santa Elena'], ['Will Hepatitis B vaccine help La Paz?', 'NaN', 'La Paz'], ['Authorities are Worried about the Spread of Norovirus in Buenos Aires', 'NaN', 'Buenos Aires'], ['Hepatitis E re-emerges in Santa Rosa', 'NaN', 'Santa Rosa'], ['Bronchitis Symptoms Spread all over Santa Fe', 'NaN', 'Santa Fe'], ['Zika spreads to San Luis Potosi', 'NaN', 'San Luis'], ['San Juan reports 1st U.S. Zika-related death amid outbreak', 'NaN', 'San Juan'], ['Rumors about Hepatitis D Spreading in San Juan Capistrano have been Refuted', 'NaN', 'San Juan Capistrano'], ['Zika Virus Reaches San Francisco', 'NaN', 'San Francisco'], ['Zika alert for Rio Grande Valley residents', 'NaN', 'Rio Grande'], ['More Zika patients reported in Bella Vista', 'NaN', 'Vista'], ['