# Manning Publications: Data Science Bookcamp
    
## Case Study 3: Tracking Disease Outbreaks Using News Headlines

### Part 1: Extracting city and country names from news headlines

                                                 2020 Jamie Shaffer

In [1]:
# Goal: read in headlines.txt
#       extract city and country names using regex and the names in the geonamescache library
#       put them in a pandas DataFrame
#  headline   city    country

# Watch out for:
#  Multiple city names in a headline
#  Matches on short words (San) when it is part of a longer word (San Diego)
#  Accents and diacritical markings (use unidecode)
#  
# PLUS my discoveries: headlines are in English
#  Watch out for common English words that are also city names: 'Of' is a prime example

# TODO
#   A review of the output parsed_headlines.csv indicates that there are 
#   quite a few cities and/or
#   regions that are not in geonamescache. Options: 
#       create my own list of names to cover these
#       help with the geonamescache project
#   About 7 of the headlines have BOTH city and country; if we don't resolve 
#       country names until later, remember to remove the duplicates.

## Imports and setup

In [2]:
import re                         # 2.2.1
import pandas as pd               # 0.25.0
import geonamescache              # 1.1.0
#import unidecode                  # has no __version__
from unidecode import unidecode

# PEP 484 type hints are easier with this
from typing import Tuple

for library in [re,pd,geonamescache]:
    libname = (str(library)).split()[1]
    print(libname," version is ",library.__version__)
    


're'  version is  2.2.1
'pandas'  version is  0.25.0
'geonamescache'  version is  1.1.0


## Globals

In [3]:
## GLOBALS

# For debugging, set this to 1
debug = 0

# Headlines that we will use are in this file; there are 650
fname = "../../../../discovering-disease-outbreaks-base/data/headlines.txt"

# First step when using geonamescache
gc = geonamescache.GeonamesCache()   

## Functions

In [4]:
def fetch_headlines(fname:str)->list:
    '''Input: fname - path to the file with the headlines, one per line
       Output: headlines - a list of the headlines with minor processing:
           \n removed 
           unidecode used to remove accents and special characters
    '''
    with open(fname,'r') as f1:
        headlines = f1.read().splitlines()  # creates a list of headlines with \n removed

    # Just in case they came in with special chars, unidecode the headlines:
    for h in range(0,len(headlines)):
        headlines[h] = unidecode(headlines[h])
        
    return headlines

In [5]:
def prepare_city_list()->pd.DataFrame:
    '''Read in the cities and put them in a simplified dataframe for easy access
       GLOBALS used: gc
    '''

    # gc_countries = gc.get_countries()
    gc_cities    = gc.get_cities()

    df_cities = pd.DataFrame(columns=['original_name','geonameid','latitude','longitude','countrycode'])

    for c in gc_cities:
        df_cities = df_cities.append({'original_name':gc_cities[c]['name'], \
                                  'geonameid':c,\
                                  'latitude':gc_cities[c]['latitude'],\
                                  'longitude': gc_cities[c]['longitude'],\
                                  'countrycode':gc_cities[c]['countrycode']},\
                         ignore_index=True)

    # Tidy up the names

    df_cities['tidy_name'] = df_cities['original_name'].apply(lambda x: unidecode(x)) # 4904 names are affected

    # Sort the cities so that the longer ones come first
    df_cities['tidy_len'] = df_cities['tidy_name'].apply(lambda x: len(x))
    df_cities.sort_values(by='tidy_len',ascending=False,inplace=True)
    
    len_original = len(df_cities)
    
    # Drop the tricky ones
    # A smarter algo might be able to figure out if they meant the city, but this is the less smart version for now
    tricky_cities = ['Of','Gap','Boom','Hit','Can','Man','Goes','Come','Bay','Spring','Borne','Buy',\
                    'Bury','Bra','Papa']
    
    df_cities.drop(df_cities.loc[df_cities['tidy_name'].isin(tricky_cities)].index, inplace=True)
    
    len_final = len(df_cities)
    
    print("df_cities original: {} and final: {}".format(len_original,len_final))
    
    return df_cities

In [6]:
def prepare_country_list()->pd.DataFrame:
    '''Read in the countries and put them in a simplified dataframe for easy access
       GLOBALS used: gc
    '''
    gc_countries = gc.get_countries()
    # gc_cities    = gc.get_cities()

    df_countries = pd.DataFrame(columns=['original_name','geonameid','countrycode'])

    for c in gc_countries:
        df_countries = df_countries.append({'original_name':gc_countries[c]['name'], \
                                  'geonameid':c,\
                                  'countrycode':gc_countries[c]['iso']},\
                         ignore_index=True)

    # Tidy up the names

    df_countries['tidy_name'] = df_countries['original_name'].apply(lambda x: unidecode(x)) # 4904 names are affected

    # Sort the cities so that the longer ones come first
    df_countries['tidy_len'] = df_countries['tidy_name'].apply(lambda x: len(x))
    df_countries.sort_values(by='tidy_len',ascending=False,inplace=True)
    
    return df_countries

In [7]:
def check_lines(search_term: str, clean_list: list, hack_list: list, \
                isCity = True) -> Tuple[pd.DataFrame, list]:
    '''Look for the search_term in the clean_list
       When found, add the line and the term to a pd.DataFrame
           if isCity is True, add as a City, otherwise add as a Country
       Remove the search_term from that line in the hack_list so substrings won't match it
       Return the pd.DataFrame when all done
       
       Example:
         clean_list entry             'Zika Outbreak Hits Miami'
            leads to hack_list entry  'Zika Outbreak Hits   '
    '''
    df_found = pd.DataFrame(columns=['headline','city'])
    
    uni_search = r'\b' + unidecode(search_term) + r'\b'
    #print("uni_search is: ",uni_search)
    regexp = re.compile(uni_search,flags=re.IGNORECASE)
    
    count = 0
    for hnum in range(0,len(clean_list),1):
        
        h1 = hack_list[hnum]
        h2 = clean_list[hnum]
        
        if regexp.search(h1):
            count = count + 1
            if (isCity):
                df_found = df_found.append({'headline':h2,'city':search_term}, ignore_index=True)
            else:
                df_found = df_found.append({'headline':h2,'country':search_term}, ignore_index=True)
            hack_list[hnum] = regexp.sub("  ", hack_list[hnum])
            if (debug):
                print("Match line {}: {} is in {} so the hack is now {}".format(hnum,\
                                                                            uni_search,\
                                                                            clean_list[hnum],\
                                                                            hack_list[hnum]))
    
    return df_found, hack_list

## MAIN

In [8]:
headlines = fetch_headlines(fname)

if (debug):
    print(type(headlines))
    #print(headlines)
    print(headlines[2])
    
len(headlines)

650

In [9]:
df_cities = prepare_city_list()
(df_cities[df_cities['original_name'] != df_cities['tidy_name']]).count()  
# 4904 names were affected in the original list; now that I'm removing the tricky ones, the count may be lower

df_cities original: 24336 and final: 24321


original_name    4900
geonameid        4900
latitude         4900
longitude        4900
countrycode      4900
tidy_name        4900
tidy_len         4900
dtype: int64

In [10]:
df_countries = prepare_country_list()
(df_countries[df_countries['original_name'] != df_countries['tidy_name']]).count()  # 0 names are affected

original_name    0
geonameid        0
countrycode      0
tidy_name        0
tidy_len         0
dtype: int64

In [11]:
len(df_cities)

# Is "Of" a real city? If so, I need to learn some geography!
df_cities.to_csv('cities.csv')

In [12]:
df_cities.head()

Unnamed: 0,original_name,geonameid,latitude,longitude,countrycode,tidy_name,tidy_len
16758,Chak Two Hundred Forty-nine Thal Development A...,1179305,31.17772,71.2048,PK,Chak Two Hundred Forty-nine Thal Development A...,54
15007,Dolores Hidalgo Cuna de la Independencia Nacional,4023117,21.15611,-100.9325,MX,Dolores Hidalgo Cuna de la Independencia Nacional,49
15048,Ampliación San Mateo (Colonia Solidaridad),8858118,19.61639,-99.14722,MX,Ampliacion San Mateo (Colonia Solidaridad),42
15033,Licenciado Benito Juárez (Campo Gobierno),8858101,24.65667,-107.545,MX,Licenciado Benito Juarez (Campo Gobierno),41
6530,"Sant Pere, Santa Caterina i La Ribera",3119123,41.3845,2.18152,ES,"Sant Pere, Santa Caterina i La Ribera",37


In [13]:
df_countries.head()

Unnamed: 0,original_name,geonameid,countrycode,tidy_name,tidy_len
89,South Georgia and the South Sandwich Islands,GS,GS,South Georgia and the South Sandwich Islands,44
232,United States Minor Outlying Islands,UM,UM,United States Minor Outlying Islands,36
29,"Bonaire, Saint Eustatius and Saba",BQ,BQ,"Bonaire, Saint Eustatius and Saba",34
95,Heard Island and McDonald Islands,HM,HM,Heard Island and McDonald Islands,33
237,Saint Vincent and the Grenadines,VC,VC,Saint Vincent and the Grenadines,32


In [14]:
df_final = pd.DataFrame(columns=['headline','city','country'])

# Shorten headlines for debugging
if (debug):
    headlines = headlines[0:10]

In [15]:
# keeping 2 versions of the headline:
# headlines -- untouched, handy for quoting the headline, and we'll need it at the end
# hack_headlines -- every time a match is found, hack it out so a smaller substring can't match it later
hack_headlines = headlines.copy()  # this will be butchered by the time we're done, so we can look at it to see what was missed

for c in df_cities['tidy_name']:
    df_out, hack_headlines = check_lines(c,clean_list = headlines, hack_list = hack_headlines, isCity = True)
    
    df_final = df_final.append(df_out,sort=False)  # sort=True changes the order of columns and is the default
    

In [16]:
# Did the list of headlines get mangled?
print("Checking headline lists: original {} and hacked {} lines.".format(len(headlines),len(hack_headlines)))


Checking headline lists: original 650 and hacked 650 lines.


In [17]:
for c in df_countries['tidy_name']:
    df_out, hack_headlines = check_lines(c,clean_list = headlines, hack_list = hack_headlines, isCity = False)
    
    df_final = df_final.append(df_out,sort=False)

In [18]:
headlines[0], hack_headlines[0]

('Zika Outbreak Hits Miami', 'Zika Outbreak Hits   ')

In [19]:
# Did the list of headlines get mangled?
print("Checking headline lists: original {} and hacked {} lines.".format(len(headlines),len(hack_headlines)))


Checking headline lists: original 650 and hacked 650 lines.


In [20]:
# Do a visual check -- this was handy when I was only working with the first 10 or so lines
if (debug):
    for c in range(0,len(headlines)):      # hack_headlines < headlines  -- some did not have any city data?
        print("{}     vs       {}".format(headlines[c],hack_headlines[c]))
    
# Zip them together into a dataframe and write it out for easier comparison offline
df_check = pd.DataFrame(list(zip(headlines,hack_headlines)),columns=['OriginalHeadline','GeoNamesHackedOut'])
df_check.to_csv('check_headline_hacks.csv',index=False)

In [21]:
# Did the list of headlines get mangled?
print("Checking headline lists: original {} and hacked {} lines.".format(len(headlines),len(hack_headlines)))

Checking headline lists: original 650 and hacked 650 lines.


In [22]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

df_final.head()

Unnamed: 0,headline,city,country
0,Pedro Juan Caballero Encounters Severe Symptoms of Pneumonia,Pedro Juan Caballero,
0,Rumors about Hepatitis D Spreading in San Juan Capistrano have been Refuted,San Juan Capistrano,
0,Zika Spreads to Palm Beach Gardens,Palm Beach Gardens,
0,Zika Reported in North Miami Beach,North Miami Beach,
0,Zika cases in Vietnam's Ho Chi Minh City surge,Ho Chi Minh City,


In [23]:
df_final.shape

(620, 3)

In [24]:
# How many headlines did we keep from the original 650?
df_final['headline'].nunique()

609

In [25]:
# Sanity check -- which headlines did not end up in the final frame?
if (1):
    count_drops = 0
    for c in range(0,len(headlines)):      # hack_headlines < headlines  -- some did not have any city data?
        if (headlines[c] == hack_headlines[c]):
            count_drops += 1
            print("{} Dropped - no city or country: {}".format(count_drops,headlines[c]))

1 Dropped - no city or country: Louisiana Zika cases up to 26
2 Dropped - no city or country: Zika infects pregnant woman in Cebu
3 Dropped - no city or country: Spanish Flu Sighted in Antigua
4 Dropped - no city or country: Zika case reported in Oton
5 Dropped - no city or country: Hillsborough uses innovative trap against Zika 20 minutes ago
6 Dropped - no city or country: Maka City Experiences Influenza Outbreak
7 Dropped - no city or country: West Nile Virus Outbreak in Saint Johns
8 Dropped - no city or country: Malaria Exposure in Sussex
9 Dropped - no city or country: Greenwich Establishes Zika Task Force
10 Dropped - no city or country: Will West Nile Virus vaccine help Parsons?
11 Dropped - no city or country: Yulee takes a hit from Spreading Sickness
12 Dropped - no city or country: The Spread of Chikungunya in Davidson has been Confirmed
13 Dropped - no city or country: Zika case reported in Los Fresnos
14 Dropped - no city or country: More people in Boucau are infected with

In [26]:
df_final.index = range(0,df_final.shape[0])

df_final.head()

Unnamed: 0,headline,city,country
0,Pedro Juan Caballero Encounters Severe Symptoms of Pneumonia,Pedro Juan Caballero,
1,Rumors about Hepatitis D Spreading in San Juan Capistrano have been Refuted,San Juan Capistrano,
2,Zika Spreads to Palm Beach Gardens,Palm Beach Gardens,
3,Zika Reported in North Miami Beach,North Miami Beach,
4,Zika cases in Vietnam's Ho Chi Minh City surge,Ho Chi Minh City,


In [27]:
#df_final

In [28]:
df_final.to_csv("parsed_headlines.csv",index=False)