## Parsing the News Headlines ## 

## Objective ## 

Find any city and/or country names mentioned in each of the news headlines.

In [1]:
import pandas as pd

## Workflow #1 ##
Load in the headline data and examine it for any data quality issues.
- Use any library/data structure to read in the headlines.
- Read through some of the headlines and identify potential problems.

In [2]:
headline_file = open('data/headlines.txt', 'r')

In [3]:
headlines = [line.strip()
             for line in headline_file.readlines()]

In [4]:
num_headlines = len(headlines)
print(f"{num_headlines} headlines have been loaded")

650 headlines have been loaded


In [5]:
headlines

['Zika Outbreak Hits Miami',
 'Could Zika Reach New York City?',
 'First Case of Zika in Miami Beach',
 'Mystery Virus Spreads in Recife, Brazil',
 'Dallas man comes down with case of Zika',
 'Trinidad confirms first Zika case',
 'Zika Concerns are Spreading in Houston',
 'Geneve Scientists Battle to Find Cure',
 'The CDC in Atlanta is Growing Worried',
 'Zika Infested Monkeys in Sao Paulo',
 'Brownsville teen contracts Zika virus',
 'Mosquito control efforts in St. Louis take new tactics with Zika threat',
 'San Juan reports 1st U.S. Zika-related death amid outbreak',
 'Flu outbreak in Galveston, Texas',
 'Zika alert â€“ Manila now threatened',
 'Zika afflicts 7 in Iloilo City',
 'New Los Angeles Hairstyle goes Viral',
 'Louisiana Zika cases up to 26',
 'Orlando volunteers aid Zika research',
 'Zika infects pregnant woman in Cebu',
 "Chicago's First Zika Case Confirmed",
 'Tampa Bay Area Zika Case Count Climbs',
 'Bad Water Leads to Sickness in Flint, Michigan',
 'Baltimore plans for 

In [6]:
headlines.sort()

In [7]:
headlines

['18 new Zika Cases in Bogota',
 '19 new Zika Cases in Sengkang',
 'Alameda Residents Recieve Rabies vaccine',
 'Albany Residents Recieve Respiratory Syncytial Virus vaccine',
 'Antipolo under threat from Zika Virus',
 'Arhus is infested with Bronchitis',
 'Arvada is infested with Syphilis',
 'Authorities a Miami',
 'Authorities are Worried about the Spread of Bronchitis in Silver Spring',
 'Authorities are Worried about the Spread of Chickenpox in Hemet',
 'Authorities are Worried about the Spread of Chickenpox in Richmond',
 'Authorities are Worried about the Spread of Dengue in Kingston',
 'Authorities are Worried about the Spread of Gonorrhea in Taoyuan City',
 'Authorities are Worried about the Spread of Hepatitis B in Yiwu',
 'Authorities are Worried about the Spread of Hepatitis D in Akron',
 'Authorities are Worried about the Spread of Hepatitis D in Ganja',
 'Authorities are Worried about the Spread of Hepatitis D in North Bay',
 'Authorities are Worried about the Spread of In

Comments on the cities appearing in each headline:
- Has to rely on external data to figure out city names
- There are city names with multiple words
- City name require case-insensitve matching and accent marks removed

## Workflow #2 ##
Using regular expressions and the cities and countries within the geonamescache library, match any cities/countries within each headline. 
- Make sure to normalize headlines and city/country names by removing accent marks. This can be done with the unidecode library.
- Watch out for multiple cities in a headline and matches on short words! We want the match to be on the entire city—for example San Marino—and not a partial match—San.

In [8]:
from unidecode import unidecode
import re

def name_to_regex(name):
    decoded_name = unidecode(name)
    if name != decoded_name:
        regex = fr'\b({name}|{decoded_name})\b'
    else:
        regex = fr'\b{name}\b'
    return re.compile(regex, flags=re.IGNORECASE)

In [9]:
unidecode('Shibirghān')

'Shibirghan'

In [10]:
def get_name_in_text(text, dictionary):
    for regex, name in sorted(dictionary.items(), key=lambda x: x[1]):
        if regex.search(text):
            return name
    return None

In [11]:
from geonamescache import GeonamesCache
gc = GeonamesCache()

countries = [country['name'] for country in gc.get_countries().values()]
country_to_name = {name_to_regex(name): name for name in countries}

cities = [city['name'] for city in gc.get_cities().values()]
city_to_name = {name_to_regex(name): name for name in cities}

In [12]:
countries

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina',
 'American Samoa',
 'Austria',
 'Australia',
 'Aruba',
 'Aland Islands',
 'Azerbaijan',
 'Bosnia and Herzegovina',
 'Barbados',
 'Bangladesh',
 'Belgium',
 'Burkina Faso',
 'Bulgaria',
 'Bahrain',
 'Burundi',
 'Benin',
 'Saint Barthelemy',
 'Bermuda',
 'Brunei',
 'Bolivia',
 'Bonaire, Saint Eustatius and Saba ',
 'Brazil',
 'Bahamas',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Belarus',
 'Belize',
 'Canada',
 'Cocos Islands',
 'Democratic Republic of the Congo',
 'Central African Republic',
 'Republic of the Congo',
 'Switzerland',
 'Ivory Coast',
 'Cook Islands',
 'Chile',
 'Cameroon',
 'China',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Cabo Verde',
 'Curacao',
 'Christmas Island',
 'Cyprus',
 'Czechia',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominica',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Estonia',
 'Egypt',
 'Western Saha

In [13]:
cities

['Andorra la Vella',
 'Umm Al Quwain City',
 'Ras Al Khaimah City',
 'Zayed City',
 'Khawr Fakkān',
 'Dubai',
 'Dibba Al-Fujairah',
 'Dibba Al-Hisn',
 'Sharjah',
 'Ar Ruways',
 'Al Fujairah City',
 'Al Ain City',
 'Ajman City',
 'Adh Dhayd',
 'Abu Dhabi',
 'Khalifah A City',
 'Bani Yas City',
 'Musaffah',
 'Al Shamkhah City',
 'Reef Al Fujairah City',
 'Zaranj',
 'Taloqan',
 'Shīnḏanḏ',
 'Shibirghān',
 'Shahrak',
 'Sar-e Pul',
 'Sang-e Chārak',
 'Aībak',
 'Rustāq',
 'Qarqīn',
 'Qarāwul',
 'Pul-e Khumrī',
 'Paghmān',
 'Nahrīn',
 'Maymana',
 'Mehtar Lām',
 'Mazār-e Sharīf',
 'Lashkar Gāh',
 'Kushk',
 'Kunduz',
 'Khōst',
 'Khulm',
 'Khāsh',
 'Khanabad',
 'Karukh',
 'Kandahār',
 'Kabul',
 'Jalālābād',
 'Jabal os Saraj',
 'Herāt',
 'Ghormach',
 'Ghazni',
 'Gereshk',
 'Gardez',
 'Fayzabad',
 'Farah',
 'Kafir Qala',
 'Charikar',
 'Baraki Barak',
 'Bāmyān',
 'Balkh',
 'Baghlān',
 'Ārt Khwājah',
 'Āsmār',
 'Asadābād',
 'Andkhōy',
 'Bāzārak',
 'Markaz-e Woluswalī-ye Āchīn',
 'Saint John’s',
 'Th

In [14]:
cities.sort()
print(cities)

["'Ali Sabieh", "'s-Gravenzande", "'s-Hertogenbosch", 'A Coruña', 'A Estrada', 'Aabenraa', 'Aachen', 'Aalborg', 'Aalen', 'Aalsmeer', 'Aalst', 'Aalten', 'Aalter', 'Aarau', 'Aarschot', 'Aba', 'Abadan', 'Abadan', 'Abaetetuba', 'Abaeté', 'Abakaliki', 'Abakan', 'Abancay', 'Abano Terme', 'Abashiri', 'Abasolo', 'Abay', 'Abaza', 'Abbeville', 'Abbiategrasso', 'Abbotsford', 'Abbottabad', 'Abdulino', 'Abengourou', 'Abeokuta', 'Abepura', 'Aberdare', 'Aberdeen', 'Aberdeen', 'Aberdeen', 'Aberdeen', 'Abergele', 'Aberystwyth', 'Abha', 'Abhar', 'Abhayāpuri', 'Abidjan', 'Abiko', 'Abilene', 'Abingdon', 'Abington', 'Abinsk', 'Abnūb', 'Abobo', 'Abohar', 'Aboisso', 'Abomey', 'Abomey-Calavi', 'Abomsa', 'Abony', 'Abou el Hassan', 'Abovyan', 'Abqaiq', 'Abrama', 'Abreu e Lima', 'Abreus', 'Abrīsham', 'Abu Dhabi', 'Abu Jibeha', 'Abucay', 'Abuja', 'Aburi', 'Abuyog', 'Abéché', 'Abū Ghurayb', 'Abū Kabīr', 'Abū Qurqāş', 'Abū Tīj', 'Abū Zabad', 'Abū al Maţāmīr', 'Abū ‘Arīsh', 'Acacías', 'Acajete', 'Acajutla', 'Acapone

In [15]:
city_values = gc.get_cities().values()

city_data = [city['name'] for city in city_values]
print(city_data)

['Andorra la Vella', 'Umm Al Quwain City', 'Ras Al Khaimah City', 'Zayed City', 'Khawr Fakkān', 'Dubai', 'Dibba Al-Fujairah', 'Dibba Al-Hisn', 'Sharjah', 'Ar Ruways', 'Al Fujairah City', 'Al Ain City', 'Ajman City', 'Adh Dhayd', 'Abu Dhabi', 'Khalifah A City', 'Bani Yas City', 'Musaffah', 'Al Shamkhah City', 'Reef Al Fujairah City', 'Zaranj', 'Taloqan', 'Shīnḏanḏ', 'Shibirghān', 'Shahrak', 'Sar-e Pul', 'Sang-e Chārak', 'Aībak', 'Rustāq', 'Qarqīn', 'Qarāwul', 'Pul-e Khumrī', 'Paghmān', 'Nahrīn', 'Maymana', 'Mehtar Lām', 'Mazār-e Sharīf', 'Lashkar Gāh', 'Kushk', 'Kunduz', 'Khōst', 'Khulm', 'Khāsh', 'Khanabad', 'Karukh', 'Kandahār', 'Kabul', 'Jalālābād', 'Jabal os Saraj', 'Herāt', 'Ghormach', 'Ghazni', 'Gereshk', 'Gardez', 'Fayzabad', 'Farah', 'Kafir Qala', 'Charikar', 'Baraki Barak', 'Bāmyān', 'Balkh', 'Baghlān', 'Ārt Khwājah', 'Āsmār', 'Asadābād', 'Andkhōy', 'Bāzārak', 'Markaz-e Woluswalī-ye Āchīn', 'Saint John’s', 'The Valley', 'Sarandë', 'Pogradec', 'Kukës', 'Korçë', 'Gjirokastër', 'E

In [16]:
print(country_to_name)

{re.compile('\\bAndorra\\b', re.IGNORECASE): 'Andorra', re.compile('\\bUnited Arab Emirates\\b', re.IGNORECASE): 'United Arab Emirates', re.compile('\\bAfghanistan\\b', re.IGNORECASE): 'Afghanistan', re.compile('\\bAntigua and Barbuda\\b', re.IGNORECASE): 'Antigua and Barbuda', re.compile('\\bAnguilla\\b', re.IGNORECASE): 'Anguilla', re.compile('\\bAlbania\\b', re.IGNORECASE): 'Albania', re.compile('\\bArmenia\\b', re.IGNORECASE): 'Armenia', re.compile('\\bAngola\\b', re.IGNORECASE): 'Angola', re.compile('\\bAntarctica\\b', re.IGNORECASE): 'Antarctica', re.compile('\\bArgentina\\b', re.IGNORECASE): 'Argentina', re.compile('\\bAmerican Samoa\\b', re.IGNORECASE): 'American Samoa', re.compile('\\bAustria\\b', re.IGNORECASE): 'Austria', re.compile('\\bAustralia\\b', re.IGNORECASE): 'Australia', re.compile('\\bAruba\\b', re.IGNORECASE): 'Aruba', re.compile('\\bAland Islands\\b', re.IGNORECASE): 'Aland Islands', re.compile('\\bAzerbaijan\\b', re.IGNORECASE): 'Azerbaijan', re.compile('\\bBosn

In [17]:
print(city_to_name)

{re.compile('\\bAndorra la Vella\\b', re.IGNORECASE): 'Andorra la Vella', re.compile('\\bUmm Al Quwain City\\b', re.IGNORECASE): 'Umm Al Quwain City', re.compile('\\bRas Al Khaimah City\\b', re.IGNORECASE): 'Ras Al Khaimah City', re.compile('\\bZayed City\\b', re.IGNORECASE): 'Zayed City', re.compile('\\b(Khawr Fakkān|Khawr Fakkan)\\b', re.IGNORECASE): 'Khawr Fakkān', re.compile('\\bDubai\\b', re.IGNORECASE): 'Dubai', re.compile('\\bDibba Al-Fujairah\\b', re.IGNORECASE): 'Dibba Al-Fujairah', re.compile('\\bDibba Al-Hisn\\b', re.IGNORECASE): 'Dibba Al-Hisn', re.compile('\\bSharjah\\b', re.IGNORECASE): 'Sharjah', re.compile('\\bAr Ruways\\b', re.IGNORECASE): 'Ar Ruways', re.compile('\\bAl Fujairah City\\b', re.IGNORECASE): 'Al Fujairah City', re.compile('\\bAl Ain City\\b', re.IGNORECASE): 'Al Ain City', re.compile('\\bAjman City\\b', re.IGNORECASE): 'Ajman City', re.compile('\\bAdh Dhayd\\b', re.IGNORECASE): 'Adh Dhayd', re.compile('\\bAbu Dhabi\\b', re.IGNORECASE): 'Abu Dhabi', re.comp

## Workflow #3 ##
Put the extracted data into a pandas DataFrame with three columns: headline, city, country.

In [18]:
import pandas as pd

matched_countries = [get_name_in_text(headline, country_to_name) for headline in headlines]
matched_cities = [get_name_in_text(headline, city_to_name) for headline in headlines]
data = {'Headline': headlines, 'City': matched_cities, 'Country': matched_countries}
df = pd.DataFrame(data)

In [19]:
matched_countries

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'Belize',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 N

In [20]:
matched_cities

['Bogotá',
 'Sengkang',
 'Alameda',
 'Albany',
 'Antipolo',
 'Århus',
 'Arvada',
 'Miami',
 'Of',
 'Hemet',
 'Of',
 'Kingston',
 'Of',
 'Of',
 'Akron',
 'Ganja',
 'Bay',
 'Of',
 'Edinburgh',
 'Of',
 'Corona',
 'Buenos Aires',
 'Dubai',
 'Of',
 'Of',
 'Abuja',
 'Cantù',
 'Clovis',
 'Missoula',
 'Flint',
 'Baltimore',
 'Barcelona',
 'Barcelona',
 'Barstow',
 'Batangas',
 'Beijing',
 'Belize City',
 'Benton',
 'Biloxi',
 'Birmingham',
 'Bradenton',
 'Bridgeport',
 'Brisbane',
 'Hayward',
 'San',
 None,
 'Santa Fe',
 'Tehran',
 'Brownsville',
 'Vero Beach',
 'Cancún',
 'Caracas',
 'Rio de Janeiro',
 'Gaithersburg',
 'Calgary',
 'Melbourne',
 'Hilden',
 'Aurora',
 'Of',
 'Of',
 'Dayton',
 'Of',
 'Of',
 'Concord',
 'Of',
 'Chicago',
 'Stillwater',
 'Simpsonville',
 'Chula Vista',
 'Toledo',
 'Denver',
 'North Vancouver',
 None,
 'Kobe',
 'Sanaa',
 'University',
 'Dakar',
 'Bonn',
 'New York City',
 'Dallas',
 'Davenport',
 'Deerfield',
 'Kathmandu',
 'Easton',
 'Doral',
 'Duisburg',
 'Duluth

In [21]:
df

Unnamed: 0,Headline,City,Country
0,18 new Zika Cases in Bogota,Bogotá,
1,19 new Zika Cases in Sengkang,Sengkang,
2,Alameda Residents Recieve Rabies vaccine,Alameda,
3,Albany Residents Recieve Respiratory Syncytial...,Albany,
4,Antipolo under threat from Zika Virus,Antipolo,
5,Arhus is infested with Bronchitis,Århus,
6,Arvada is infested with Syphilis,Arvada,
7,Authorities a Miami,Miami,
8,Authorities are Worried about the Spread of Br...,Of,
9,Authorities are Worried about the Spread of Ch...,Hemet,


## Workflow #4 ##
Make sure there were no issues with the extraction by sampling some of the headlines and examining the city and country names. One method for finding problems is to look for the most common names and see if there are any issues. 

In [22]:
summary = df[['City', 'Country']].describe()
print(summary)

       City   Country
count   619        15
unique  510        10
top      Of  Malaysia
freq     45         3


In [23]:
of_cities = df[df.City == 'Of'][['City', 'Headline']]
ten_of_cities = of_cities.head(10)
print(ten_of_cities.to_string(index=False))

City                                           Headline
  Of  Authorities are Worried about the Spread of Br...
  Of  Authorities are Worried about the Spread of Ch...
  Of  Authorities are Worried about the Spread of Go...
  Of  Authorities are Worried about the Spread of He...
  Of  Authorities are Worried about the Spread of In...
  Of  Authorities are Worried about the Spread of Ma...
  Of  Authorities are Worried about the Spread of Ro...
  Of  Authorities are Worried about the Spread of Sy...
  Of             Case of Measles Reported in Springdale
  Of              Case of Measles Reported in Vancouver


In [24]:
def get_cities_in_headline(headline):
    cities_in_headline = set()
    for regex, name in city_to_name.items():
        match = regex.search(headline)
        if match:
            if headline[match.start()].isupper():
                cities_in_headline.add(name)
                
    return list(cities_in_headline)

df['Cities'] = df['Headline'].apply(get_cities_in_headline)
df['Num_cities'] = df['Cities'].apply(len)

In [25]:
df['Num_cities']

0      1
1      1
2      1
3      1
4      1
      ..
645    1
646    1
647    3
648    1
649    1
Name: Num_cities, Length: 650, dtype: int64

In [26]:
df.Num_cities > 1

0      False
1      False
2      False
3      False
4      False
       ...  
645    False
646    False
647     True
648    False
649    False
Name: Num_cities, Length: 650, dtype: bool

In [27]:
df_multiple_cities = df[df.Num_cities > 1]

In [28]:
df_multiple_cities

Unnamed: 0,Headline,City,Country,Cities,Num_cities
8,Authorities are Worried about the Spread of Br...,Of,,"[Spring, Silver Spring]",2
16,Authorities are Worried about the Spread of He...,Bay,,"[Bay, North Bay]",2
34,Batangas Tourism Takes a Hit as Virus Spreads,Batangas,,"[Hīt, Batangas]",2
44,Bronchitis Keeps Spreading in San Mateo,San,,"[San, San Mateo]",2
49,Can Zika make it here to Vero Beach?,Vero Beach,,"[Vero Beach, Çan]",2
68,Chickenpox has Arrived in Chula Vista,Chula Vista,,"[Chula Vista, Vista]",2
71,Chikungunya Keeps Spreading in North Vancouver,North Vancouver,,"[North Vancouver, Vancouver]",2
75,Chlamydia Symptoms Spread all over University ...,University,,"[University City, University]",2
78,Could Zika Reach New York City?,New York City,,"[York, New York City]",2
93,First Case of Zika in Miami Beach,Miami,,"[Miami Beach, Miami]",2


In [29]:
num_rows, _ = df_multiple_cities.shape

In [30]:
print(f"{num_rows} headlines match multiple cities")

68 headlines match multiple cities


In [31]:
ten_cities = df_multiple_cities[['Cities', 'Headline']].head(10)
print(ten_cities.to_string(index=False))

                        Cities                                           Headline
       [Spring, Silver Spring]  Authorities are Worried about the Spread of Br...
              [Bay, North Bay]  Authorities are Worried about the Spread of He...
               [Hīt, Batangas]      Batangas Tourism Takes a Hit as Virus Spreads
              [San, San Mateo]            Bronchitis Keeps Spreading in San Mateo
             [Vero Beach, Çan]               Can Zika make it here to Vero Beach?
          [Chula Vista, Vista]              Chickenpox has Arrived in Chula Vista
  [North Vancouver, Vancouver]     Chikungunya Keeps Spreading in North Vancouver
 [University City, University]  Chlamydia Symptoms Spread all over University ...
         [York, New York City]                    Could Zika Reach New York City?
          [Miami Beach, Miami]                  First Case of Zika in Miami Beach


In [32]:
ten_cities

Unnamed: 0,Cities,Headline
8,"[Spring, Silver Spring]",Authorities are Worried about the Spread of Br...
16,"[Bay, North Bay]",Authorities are Worried about the Spread of He...
34,"[Hīt, Batangas]",Batangas Tourism Takes a Hit as Virus Spreads
44,"[San, San Mateo]",Bronchitis Keeps Spreading in San Mateo
49,"[Vero Beach, Çan]",Can Zika make it here to Vero Beach?
68,"[Chula Vista, Vista]",Chickenpox has Arrived in Chula Vista
71,"[North Vancouver, Vancouver]",Chikungunya Keeps Spreading in North Vancouver
75,"[University City, University]",Chlamydia Symptoms Spread all over University ...
78,"[York, New York City]",Could Zika Reach New York City?
93,"[Miami Beach, Miami]",First Case of Zika in Miami Beach


In [33]:
df['Cities']

0                           [Bogotá]
1                         [Sengkang]
2                          [Alameda]
3                           [Albany]
4                         [Antipolo]
                   ...              
645                    [Westchester]
646                      [Cleveland]
647    [Salvador, San, San Salvador]
648                        [Ardmore]
649                          [Delhi]
Name: Cities, Length: 650, dtype: object

In [34]:
def get_longest_city(cities):
    if cities:
        return max(cities, key=len)
    return None

df['City'] = df['Cities'].apply(get_longest_city)

In [35]:
df['Cities'].apply(get_longest_city)

0            Bogotá
1          Sengkang
2           Alameda
3            Albany
4          Antipolo
           ...     
645     Westchester
646       Cleveland
647    San Salvador
648         Ardmore
649           Delhi
Name: Cities, Length: 650, dtype: object

Confirm that no erroneous short city-name (4 characters or less) is getting assigned to one of our headlines.

In [36]:
short_cities = df[df.City.str.len() <= 4][['City', 'Headline']]
print(short_cities.to_string(index=False))

 City                                           Headline
 Yiwu  Authorities are Worried about the Spread of He...
 Rome  Authorities are Worried about the Spread of Ma...
 Kobe                     Chikungunya re-emerges in Kobe
 Bonn  Contaminated Meat Brings Trouble for Bonn Farmers
 Erie                        Erie County sets Zika traps
 Kent                       Kent is infested with Rabies
 Lima                Lima tries to address Zika Concerns
 Lyon                   Mad Cow Disease Detected in Lyon
 Molo                Molo Cholera Spread Causing Concern
 Waco                More Zika patients reported in Waco
 Nadi  More people in Nadi are infected with HIV ever...
 Pune                     Pune woman diagnosed with Zika
 Baud  Rumors about Tuberculosis Spreading in Baud ha...
 Suva  Suva authorities confirmed the spread of Rotav...
 Reno  The Spread of Gonorrhea in Reno has been Confi...
 Baku    The Spread of Herpes in Baku has been Confirmed
 Jaén                         Z

In [37]:
df_countries = df[df.Country.notnull()][['City', 'Country', 'Headline']]
print(df_countries.to_string(index=False))

             City    Country                                           Headline
      Belize City     Belize                 Belize City under threat from Zika
           Recife     Brazil            Mystery Virus Spreads in Recife, Brazil
    Kota Kinabalu   Malaysia           New Zika Case in Kota Kinabalu, Malaysia
        Hong Kong  Hong Kong                    Norovirus Exposure in Hong Kong
      Panama City     Panama                    Outbreak of Zika in Panama City
           Panamá     Panama           Panama Cityâ€™s first Zika related death
   Guatemala City  Guatemala  Rumors about Meningitis spreading in Guatemala...
         Campinas     Brazil                   Student sick in Campinas, Brazil
          Bangkok   Thailand                     Thailand-Zika Virus in Bangkok
        Singapore  Singapore                  Zika cases in Singapore reach 393
 Ho Chi Minh City    Vietnam     Zika cases in Vietnam's Ho Chi Minh City surge
       Piracicaba     Brazil            

In [38]:
df_countries

Unnamed: 0,City,Country,Headline
36,Belize City,Belize,Belize City under threat from Zika
293,Recife,Brazil,"Mystery Virus Spreads in Recife, Brazil"
301,Kota Kinabalu,Malaysia,"New Zika Case in Kota Kinabalu, Malaysia"
316,Hong Kong,Hong Kong,Norovirus Exposure in Hong Kong
333,Panama City,Panama,Outbreak of Zika in Panama City
339,Panamá,Panama,Panama Cityâ€™s first Zika related death
384,Guatemala City,Guatemala,Rumors about Meningitis spreading in Guatemala...
450,Campinas,Brazil,"Student sick in Campinas, Brazil"
459,Bangkok,Thailand,Thailand-Zika Virus in Bangkok
603,Singapore,Singapore,Zika cases in Singapore reach 393


In [39]:
df[df.Country.isnull()][['City', 'Country', 'Headline']].count()

City        596
Country       0
Headline    635
dtype: int64

In [40]:
df[df.Country.notnull()][['City', 'Country', 'Headline']].count()

City        15
Country     15
Headline    15
dtype: int64

In [41]:
df[df.City.str.len() > 0][['City', 'Headline']].describe()

Unnamed: 0,City,Headline
count,611,611
unique,577,608
top,Monroe,Spanish Flu Outbreak in Lisbon
freq,4,2


In [42]:
df

Unnamed: 0,Headline,City,Country,Cities,Num_cities
0,18 new Zika Cases in Bogota,Bogotá,,[Bogotá],1
1,19 new Zika Cases in Sengkang,Sengkang,,[Sengkang],1
2,Alameda Residents Recieve Rabies vaccine,Alameda,,[Alameda],1
3,Albany Residents Recieve Respiratory Syncytial...,Albany,,[Albany],1
4,Antipolo under threat from Zika Virus,Antipolo,,[Antipolo],1
5,Arhus is infested with Bronchitis,Århus,,[Århus],1
6,Arvada is infested with Syphilis,Arvada,,[Arvada],1
7,Authorities a Miami,Miami,,[Miami],1
8,Authorities are Worried about the Spread of Br...,Silver Spring,,"[Spring, Silver Spring]",2
9,Authorities are Worried about the Spread of Ch...,Hemet,,[Hemet],1


In [43]:
df[['Headline', 'City', 'Country']].describe()

Unnamed: 0,Headline,City,Country
count,650,611,15
unique,647,577,10
top,Spanish Flu Spreading through Madrid,Monroe,Malaysia
freq,2,4,3


In [None]:
df_headline_city_country = df[['Headline', 'City', 'Country']]

In [None]:
print(df_headline_city_country.to_string(index=False))

In [None]:
df_headline_city_country[df_headline_city_country.City.str.contains("of|Of") > 0]

In [None]:
df_headline_city_country

In [None]:
df_unmatched = df_headline_city_country[df_headline_city_country.City.isnull()]
num_unmatched = len(df_unmatched)
print(f"{num_unmatched} headlines contain no city matches.")
print(df_unmatched.head(10)[['Headline']].values)

In [None]:
gc.cities

In [None]:
gc.countries