In [152]:
import pandas as pd
from geonamescache import GeonamesCache
from unidecode import unidecode_expect_ascii

In [153]:
# Load the headline/location table; the literal string "Missing" is read as NaN.
dataset = pd.read_csv(
    "data/news_location_data.csv",
    na_values=["Missing"],
)

In [154]:
# Keep only the headlines for which a city was extracted.
dataset = dataset.dropna(subset=['cities'])

In [155]:
# Number of headlines that still have a city after filtering.
dataset.shape[0]

607

In [156]:
# Shared geonamescache handle used by the city/country lookup cells below.
# NOTE(review): the name `gc` would shadow the standard-library `gc` module
# if that module is ever imported in this notebook.
gc = GeonamesCache()

In [157]:
# Preview the first ten rows.
dataset.head(10)

Unnamed: 0,headline,countries,cities
0,Zika Outbreak Hits Miami,,Miami
1,Could Zika Reach New York City?,,New York City
2,First Case of Zika in Miami Beach,,Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife
4,Dallas man comes down with case of Zika,,Dallas
5,Trinidad confirms first Zika case,,Trinidad
6,Zika Concerns are Spreading in Houston,,Houston
7,Geneve Scientists Battle to Find Cure,,Geneve
8,The CDC in Atlanta is Growing Worried,,Atlanta
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo


In [158]:
# Build, in a single pass over the geonamescache cities:
#   accented_names    - every city name that changes when transliterated to ASCII
#   alternative_names - ASCII spelling -> accented spelling (later entries win
#                       when several accented names share one ASCII form)
accented_names = []
alternative_names = {}

for city_record in gc.get_cities().values():
    original_name = city_record['name']
    ascii_name = unidecode_expect_ascii(original_name)
    if original_name != ascii_name:
        accented_names.append(original_name)
        alternative_names[ascii_name] = original_name

In [159]:
# Resolve every headline's city to a latitude, longitude and country code.
#
# NOTE: this cell calls `match_city_country` and `most_populous_city`, which
# this notebook defines in later cells; those cells must be executed first.


def _lookup_candidates(city):
    """Return the geonamescache records for `city`.

    If the plain spelling yields nothing, retry with the accented spelling
    recorded in `alternative_names` (when one exists). May return an empty list.
    """
    candidates = gc.get_cities_by_name(city)
    if not candidates and city in alternative_names:
        # The headline used an ASCII spelling; look up the accented name instead.
        candidates = gc.get_cities_by_name(alternative_names[city])
    return candidates


def _select_city(candidates, country):
    """Choose a single city info dict from `candidates`.

    Selection rules:
      * no candidates            -> None
      * one candidate            -> that candidate
      * several, country known   -> most populous candidate inside that country;
                                    if none lies in that country, most populous overall
      * several, country missing -> most populous candidate
    """
    if not candidates:
        return None

    if len(candidates) > 1:
        if not pd.isna(country):
            in_country = match_city_country(candidates, country)
            if in_country:
                # Narrow the list itself so the index returned by
                # most_populous_city refers to the list we index into.
                candidates = [candidates[i] for i in in_country]
        chosen = candidates[most_populous_city(candidates)]
    else:
        chosen = candidates[0]

    # Each record is iterated as a {key: info} mapping elsewhere in this
    # notebook; presumably it holds exactly one entry (keyed by geoname id).
    return next(iter(chosen.values()))


geo_data = {"latitude": [], "longitude": [], "countrycode": []}

for city, country in zip(dataset["cities"], dataset["countries"]):
    city_info = _select_city(_lookup_candidates(city), country)
    for field, column in geo_data.items():
        # Unresolvable cities get None so every list stays aligned with the
        # rows of `dataset` (one entry per row).
        column.append(city_info[field] if city_info is not None else None)

In [160]:
# Sanity check: the number of resolved locations should equal len(dataset),
# since these lists are later attached to `dataset` as columns.
len(geo_data["countrycode"])

607

In [161]:
def most_populous_city(city_data):
    """Return the position of the most populous city in `city_data`.

    Parameters
    ----------
    city_data : iterable of dict
        Each element maps a key to a city info dict that may carry a
        "population" entry. Any iterable works, including a generator.

    Returns
    -------
    int
        Index (in iteration order) of the entry with the largest population.
        The first entry wins ties. Returns 0 when the input is empty or when
        no entry has a population above zero.

    A missing or None population is treated as 0 instead of raising a
    TypeError on the comparison.
    """
    highest_population = 0
    highest_population_idx = 0
    for idx, city in enumerate(city_data):
        for values in city.values():
            population = values.get("population") or 0
            # Strict comparison keeps the earliest entry on ties.
            if population > highest_population:
                highest_population = population
                highest_population_idx = idx

    return highest_population_idx

In [162]:
def match_city_country(city_data, country):
    """Return the positions of the entries in `city_data` located in `country`.

    `country` is looked up by name in geonamescache to obtain its ISO code
    (a KeyError propagates if the name is unknown); an entry matches when
    the "countrycode" of its info dict equals that code.
    """
    target_iso = gc.get_countries_by_names()[country]["iso"]
    return [
        position
        for position, record in enumerate(city_data)
        for info in record.values()
        if info.get("countrycode") == target_iso
    ]

In [164]:
# Attach the latitude / longitude / countrycode lists as new columns;
# each list must hold exactly one entry per row of `dataset`.
dataset = dataset.assign(**geo_data)

In [168]:
# Preview the first twenty rows, now including the geo columns.
dataset.head(20)

Unnamed: 0,headline,countries,cities,latitude,longitude,countrycode
0,Zika Outbreak Hits Miami,,Miami,25.77427,-80.19366,US
1,Could Zika Reach New York City?,,New York City,40.71427,-74.00597,US
2,First Case of Zika in Miami Beach,,Miami Beach,25.79065,-80.13005,US
3,"Mystery Virus Spreads in Recife, Brazil",Brazil,Recife,-8.05389,-34.88111,BR
4,Dallas man comes down with case of Zika,,Dallas,32.78306,-96.80667,US
5,Trinidad confirms first Zika case,,Trinidad,-14.83333,-64.9,BO
6,Zika Concerns are Spreading in Houston,,Houston,29.76328,-95.36327,US
7,Geneve Scientists Battle to Find Cure,,Geneve,46.20222,6.14569,CH
8,The CDC in Atlanta is Growing Worried,,Atlanta,33.749,-84.38798,US
9,Zika Infested Monkeys in Sao Paulo,,Sao Paulo,-23.5475,-46.63611,BR
