## Parsing the News Headlines ## 

Find any city and/or country names mentioned in each of the news headlines.

In [1]:
import pandas as pd

In [3]:
# Open the headlines file explicitly as UTF-8. Relying on the platform-default
# codec can mis-decode non-ASCII punctuation in the headlines (an en dash
# decoded as cp1252 shows up as 'â€“').
headline_file = open('data/headlines.txt', 'r', encoding='utf-8')

In [4]:
# Read one headline per line, trimming surrounding whitespace (including the
# trailing newline), then release the file handle so it is not left open for
# the rest of the kernel session.
try:
    headlines = [line.strip() for line in headline_file]
finally:
    headline_file.close()

In [6]:
# Sanity check: report how many headlines were read in.
num_headlines = len(headlines)
loaded_message = f"{num_headlines} headlines have been loaded"
print(loaded_message)

650 headlines have been loaded


In [7]:
# Display the loaded headlines in file order. As the cell's last expression,
# the whole list is echoed as the cell output.
headlines

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil',
 'Dallas man comes down with case of Zika',
 'Trinidad confirms first Zika case',
 'Zika Concerns are Spreading in Houston',
 'Geneve Scientists Battle to Find Cure',
 'The CDC in Atlanta is Growing Worried',
 'Zika Infested Monkeys in Sao Paulo',
 'Brownsville teen contracts Zika virus',
 'Mosquito control efforts in St. Louis take new tactics with Zika threat',
 'San Juan reports 1st U.S. Zika-related death amid outbreak',
 'Flu outbreak in Galveston, Texas',
 'Zika alert â€“ Manila now threatened',
 'Zika afflicts 7 in Iloilo City',
 'New Los Angeles Hairstyle goes Viral',
 'Louisiana Zika cases up to 26',
 'Orlando volunteers aid Zika research',
 'Zika infects pregnant woman in Cebu',
 "Chicago's First Zika Case Confirmed",
 'Tampa Bay Area Zika Case Count Climbs',
 'Bad Water Leads to Sickness in Flint, Michigan',
 'Baltimore plans for 

In [8]:
# Put the headlines into ascending (lexicographic) order.
headlines = sorted(headlines)

In [9]:
# Display the headlines again to inspect their current (sorted) order.
headlines

['18 new Zika Cases in Bogota',
 '19 new Zika Cases in Sengkang',
 'Alameda Residents Recieve Rabies vaccine',
 'Albany Residents Recieve Respiratory Syncytial Virus vaccine',
 'Antipolo under threat from Zika Virus',
 'Arhus is infested with Bronchitis',
 'Arvada is infested with Syphilis',
 'Authorities a Miami',
 'Authorities are Worried about the Spread of Bronchitis in Silver Spring',
 'Authorities are Worried about the Spread of Chickenpox in Hemet',
 'Authorities are Worried about the Spread of Chickenpox in Richmond',
 'Authorities are Worried about the Spread of Dengue in Kingston',
 'Authorities are Worried about the Spread of Gonorrhea in Taoyuan City',
 'Authorities are Worried about the Spread of Hepatitis B in Yiwu',
 'Authorities are Worried about the Spread of Hepatitis D in Akron',
 'Authorities are Worried about the Spread of Hepatitis D in Ganja',
 'Authorities are Worried about the Spread of Hepatitis D in North Bay',
 'Authorities are Worried about the Spread of In

In [17]:
from unidecode import unidecode
import re

def name_to_regex(name):
    """Compile a case-insensitive, whole-word regex matching a location name.

    The name is also transliterated with ``unidecode``. When the
    transliteration differs from the original (e.g. the name has accents),
    the pattern accepts either spelling.

    Both spellings are passed through ``re.escape`` so punctuation inside a
    name ('.', parentheses, ...) is matched literally instead of being read
    as regex syntax. Word boundaries are expressed as ``\\w`` lookarounds
    rather than ``\\b``: for names that start and end with a word character
    the two are equivalent, but the lookarounds also allow names that end
    in punctuation to match at the end of a text.

    Args:
        name: The location name to match.

    Returns:
        A pattern compiled with ``re.IGNORECASE``.
    """
    decoded_name = unidecode(name)
    escaped_name = re.escape(name)
    if name != decoded_name:
        # Accept both the accented and the ASCII-transliterated spelling.
        alternatives = fr'({escaped_name}|{re.escape(decoded_name)})'
    else:
        alternatives = escaped_name
    regex = fr'(?<!\w){alternatives}(?!\w)'
    return re.compile(regex, flags=re.IGNORECASE)

In [18]:
def get_name_in_text(text, dictionary):
    """Return the first name whose regex is found in ``text``.

    Candidates are tried in ascending order of name, so when several
    patterns match, the alphabetically smallest name is returned.

    Args:
        text: The string to search.
        dictionary: Mapping of compiled regex -> name.

    Returns:
        The matching name, or None when no pattern matches.
    """
    ordered_pairs = sorted(dictionary.items(), key=lambda pair: pair[1])
    hits = (label for pattern, label in ordered_pairs if pattern.search(text))
    return next(hits, None)

In [19]:
from geonamescache import GeonamesCache
# Instantiate the geonamescache lookup object.
gc = GeonamesCache()

# Map a compiled regex for every country name back to that name.
countries = [record['name'] for record in gc.get_countries().values()]
country_to_name = dict(zip(map(name_to_regex, countries), countries))

# Build the same regex -> name mapping for city names.
cities = [record['name'] for record in gc.get_cities().values()]
city_to_name = dict(zip(map(name_to_regex, cities), cities))

In [20]:
import pandas as pd

# Look up the matched country and city name for every headline
# (None where no name is found).
matched_countries = [get_name_in_text(title, country_to_name) for title in headlines]
matched_cities = [get_name_in_text(title, city_to_name) for title in headlines]

# Assemble one row per headline alongside its matches.
data = {
    'Headline': headlines,
    'City': matched_cities,
    'Country': matched_countries,
}
df = pd.DataFrame(data)

In [21]:
# Preview only the first ten rows rather than echoing the whole frame.
df.head(10)

Unnamed: 0,Headline,City,Country
0,18 new Zika Cases in Bogota,Bogotá,
1,19 new Zika Cases in Sengkang,Sengkang,
2,Alameda Residents Recieve Rabies vaccine,Alameda,
3,Albany Residents Recieve Respiratory Syncytial...,Albany,
4,Antipolo under threat from Zika Virus,Antipolo,
5,Arhus is infested with Bronchitis,Århus,
6,Arvada is infested with Syphilis,Arvada,
7,Authorities a Miami,Miami,
8,Authorities are Worried about the Spread of Br...,Of,
9,Authorities are Worried about the Spread of Ch...,Hemet,


In [22]:
# Summarise the two match columns: non-null count, number of distinct values,
# most frequent value and how often it occurs.
summary = df.loc[:, ['City', 'Country']].describe()
print(summary)

       City   Country
count   619        15
unique  510        10
top      Of  Malaysia
freq     45         3


In [23]:
# Pull out the rows whose matched city is 'Of' and print the first ten
# headlines so the matches can be inspected.
of_cities = df.loc[df['City'] == 'Of', ['City', 'Headline']]
ten_of_cities = of_cities.head(10)
print(ten_of_cities.to_string(index=False))

City                                           Headline
  Of  Authorities are Worried about the Spread of Br...
  Of  Authorities are Worried about the Spread of Ch...
  Of  Authorities are Worried about the Spread of Go...
  Of  Authorities are Worried about the Spread of He...
  Of  Authorities are Worried about the Spread of In...
  Of  Authorities are Worried about the Spread of Ma...
  Of  Authorities are Worried about the Spread of Ro...
  Of  Authorities are Worried about the Spread of Sy...
  Of             Case of Measles Reported in Springdale
  Of              Case of Measles Reported in Vancouver
