# CH1 Information Extraction

In [1]:
import pandas as pd
import numpy as np
import re
import unidecode
import geonamescache
# Retrieve names, ISO and FIPS codes of continents, countries, US states and counties as dictionaries + population and geographic info.

## Read the headlines

In [2]:
hl = pd.read_csv('E:/Self-Dev/ManningProject-Headlines/data/headlines.txt', sep="\n", header=None,names = ['headline'])
num = hl.shape[0]
hl.head()

Unnamed: 0,headline
0,Zika Outbreak Hits Miami
1,Could Zika Reach New York City?
2,First Case of Zika in Miami Beach
3,"Mystery Virus Spreads in Recife, Brazil"
4,Dallas man comes down with case of Zika


## remove accent marks

In [3]:
hl2 = hl.applymap(lambda x: unidecode.unidecode(x)) # apply function to each cell, instead of cols/rows
change = int((hl2 != hl).sum(axis = 0))
print(hl.loc[(hl != hl2)['headline'].tolist()].merge( hl2.loc[(hl != hl2)['headline'].tolist()], \
                                                               how = 'outer',left_index = True, right_index = True))
print('There are {0} headlines containing accents in a total of {1}. Percentage is {2:.3f}%.'.format(change, num, change/num)) 
# float formatting

                                 headline_x  \
14       Zika alert – Manila now threatened   
179             BREAKING – Zika in Missoula   
196        Zika case recorded in Tunapuna »   
576  Panama City’s first Zika related death   

                                 headline_y  
14       Zika alert - Manila now threatened  
179             BREAKING - Zika in Missoula  
196       Zika case recorded in Tunapuna >>  
576  Panama City's first Zika related death  
There are 4 headlines containing accents in a total of 650. Percentage is 0.006%.


## Create geography list

In [49]:
gc = geonamescache.GeonamesCache()

countries = gc.get_countries_by_names()
country_list = [unidecode.unidecode(x) for x in countries.keys()]

cities = gc.get_cities()
city_list = []
for k,v in cities.items():
    city_list.append(unidecode.unidecode(v['name']))

cities2 = dict() # City name as key
for k in list(cities.keys()):
    cities2[unidecode.unidecode(cities[k]['name']).lower()] = cities[k]
    # turn them into lower cases to avoid extraction problem later due to case sensitivity.

countries2 = dict() # Country code as key
for k in list(countries.keys()):
    countries2[countries[k]['iso']] = countries[k]

## Geo match: Country and City

In [6]:
'''
1. Watch out for multiple cities in a headline 
2. Short words, for example San Marino – and not a partial match – San
'''
hl2['city'] = pd.Series() # np.nan is not None
hl2['country'] = pd.Series()

compile_city = re.compile(r'\b\(' + '|'.join(city_list) + r'\)\b')
compile_country = re.compile(r'\b\(' + '|'.join(country_list) + r'\)\b')

for ix in range(hl2.shape[0]):
    h = hl2.headline[ix]
    
    compile_list = [re.compile(r"\b("+ unidecode.unidecode(x) +r")\b", flags=re.IGNORECASE) for x in city_list]
    c = []
    for x in compile_list:
        c += re.findall(x,h)
    hl2.city[ix] = max(c, key = len) if len(c) > 0 else None
    
    compile_list = [re.compile(r"\b("+ unidecode.unidecode(x) +r")\b", flags=re.IGNORECASE) for x in country_list]
    c = []
    for x in compile_list:
        c += re.findall(x,h)
    hl2.country[ix] = max(c, key = len) if len(c) > 0 else None

hl2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,headline,city,country
0,Zika Outbreak Hits Miami,Miami,
1,Could Zika Reach New York City?,New York City,
2,First Case of Zika in Miami Beach,Miami Beach,
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,


## Furthur Macth to counties and States: fill out the missing info

In [53]:
co = list(hl2.index[hl2.country.isna()])
ci = list(hl2.index[hl2.city.isna()])

states = gc.get_us_states()
state_list = [states[x]['name'] for x in states]

states2 = dict() 
for k in list(states.keys()):
    states2[states[k]['name']] = states[k]

counties = gc.get_us_counties()
counties2 = dict()
for k in counties:
    counties2[k['name']] = k
county_list = list(counties2.keys())

compile_state = [re.compile(r"\b("+ unidecode.unidecode(x) +r")\b", flags=re.IGNORECASE) for x in state_list]
compile_county = [re.compile(r"\b("+ unidecode.unidecode(x) +r")\b", flags=re.IGNORECASE) for x in county_list]
drop = 0
for ix in range(hl2.shape[0]):
    try:
        # print(ix)
        h = hl2.headline[ix]
        if ix not in co: #!!!
            continue
        elif ix not in ci:
            hl2['country'][ix] = countries2[cities2[hl2.city[ix].lower()]['countrycode']]['name']
        else:
            c = []
            for x in compile_state:
                c += re.findall(x,h)
            res = max(c, key = len) if len(c) > 0 else None
            if not res:
                c = []
                for x in compile_county:
                    c += re.findall(x,h)
                res = max(c, key = len) if len(c) > 0 else None
            if not res:
                print(ix, hl2.headline[ix])
                drop += 1
            else:
                hl2.city[ix] = res
                hl2.country[ix] = 'United States'
    except:
        print('error', ix, hl2.headline[ix])
        drop +=1
print('There\'s',drop,'rows which can\'t be recognized in geospacecache.')

19 Zika infects pregnant woman in Cebu
48 Spanish Flu Sighted in Antigua
73 Zika case reported in Oton
76 Hillsborough uses innovative trap against Zika 20 minutes ago
88 Maka City Experiences Influenza Outbreak
156 West Nile Virus Outbreak in Saint Johns
234 Malaria Exposure in Sussex
248 Greenwich Establishes Zika Task Force
252 Will West Nile Virus vaccine help Parsons?
286 Zika case reported in Los Fresnos
308 More people in Boucau are infected with HIV every year
327 Bronchitis Outbreak in Manhasset
342 Rumors about Influenza Spreading in Dobbs Ferry have been Refuted
366 More people in Huron are infected with Dengue every year
378 Will Tuberculosis vaccine help Cherry Creek?
379 Gympie Patient in Critical Condition after Contracting Chlamydia
460 Martinsville tests new cure for Measles
463 More Patients in Magnolia are Getting Diagnosed with Malaria
482 Rumors about Syphilis spreading in Penal have been refuted
508 Fort Belvoir tests new cure for Hepatitis C
509 More people in Oa

In [58]:
hl2.to_csv('../data/hl.csv')
hl2.head(10)

Unnamed: 0,headline,city,country
0,Zika Outbreak Hits Miami,Miami,United States
1,Could Zika Reach New York City?,New York City,United States
2,First Case of Zika in Miami Beach,Miami Beach,United States
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil
4,Dallas man comes down with case of Zika,Dallas,United States
5,Trinidad confirms first Zika case,Trinidad,Uruguay
6,Zika Concerns are Spreading in Houston,Houston,United States
7,Geneve Scientists Battle to Find Cure,Geneve,Switzerland
8,The CDC in Atlanta is Growing Worried,Atlanta,United States
9,Zika Infested Monkeys in Sao Paulo,Sao Paulo,Brazil


## Library Practices

In [28]:
# Practices Unidecode
print("\xac",'\u1234','\u20ac','\U00008000')

accented_string = u'Málaga'
unaccented_string = unidecode.unidecode(accented_string)
print(unaccented_string)
unaccented_string == accented_string

¬ ሴ € 耀
Malaga


False

In [115]:
# Practice Geonamescache
gc = geonamescache.GeonamesCache()

# gets nested dictionary for countries by countrycode
countries = gc.get_countries()

# gets nested dictionary for cities by city id
cities = gc.get_cities()

# gets nested dictionary for countries by country name
country_names = gc.get_countries_by_names()

# gets nested dictionary for countries by city name
# country_names = gc.get_cities_by_names() #error

In [119]:
# cities
def gen_dict_extract(var, key):
    if isinstance(var, dict):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, (dict, list)):
                yield from gen_dict_extract(v, key)
    elif isinstance(var, list):
        for d in var:
            yield from gen_dict_extract(d, key)
cities = [*gen_dict_extract(cities, 'name')]
len(cities)

24336

In [33]:
# practice re
regexp = re.compile("hello")
print(regexp.search('hi, hello world')) # returns start and end index(+1) including space or NONE

print(regexp.match('hi, hello world')) # search only from pos (default 0)
#equivalent to
print(re.match(regexp, 'hello   '))
print(regexp.match('hi, hello world', 4))

regexp = re.compile("hello|Hello") # | and () are both special ch
regexp = re.compile("(h|H)ello")

#[0-9A-Z] # match any digit or any uppercase character
#[-012] # match a hyphen, a 0, a 1, or a 2

# quick check
print(re.match('^\-?[0-5]$','3')) # numbers -5 through 5
print(re.match('[0-9a-fA-F]','0f4'))
# match a hexadecimal digit: 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, A, a, B, b, C, c, D, d, E, e, F, and f.

#\\ # single backslash 
# search for string "\ten":
## WRONG!!!
regexp = re.compile("\\ten")
# Before re.compile is invoked, Python interprets the string you typed as meaning \ten ->
# passed to re.compile -> \t means tab -> tab + 'en'.
### RIGHT
regexp = re.compile("\\\\ten")

# RAW STRINGS: Strings that doesn’t apply the normal Python rules
# “Don’t process special sequences in this string.”
print(r"""\tTo be\n\tor not to be""")

# surname, firstname middlename: phonenumber
# name: [-a-zA-z]+
# phonenumber: (\d{3}-)?\d{3}-\d{4}
# in sum: [-a-zA-Z]+, [-a-zA-Z]+( [-a-zA-Z]+)?: (\d{3}-)?\d{3}-\d{4}
#!!! Group each subpattern and give name with (?P<name>regex) don't forget parenthesis!
# (?P<last>[-a-zA-Z]+), (?P<first>[-a-zA-Z]+)( (?P<middle>([-a-zA-Z]+)))?: (?P<phone>(\d{3}-)?\d{3}-\d{4}
regexp = re.compile(r"(?P<last>[-a-zA-Z]+)," # comma not in surname
                    r" (?P<first>[-a-zA-Z]+)" # space not in first name
                    r"( (?P<middle>([-a-zA-Z]+)))?"
                    r": (?P<phone>((\d{3}-)?\d{3}-\d{4}))"
                   ) # Python implicitly concatenates any set of strings separated by whitespace
result = regexp.search('liu, Beibei: 111-111-1111')
print(result) # If the value is None, the match failed; otherwise, you can extract information from it.
lastname = result.group('last') # group: extract whatever data matched your named subpatterns. input: name of the subpattern
firstname = result.group('first')
middlename = result.group('middle')
if middlename == None: # w/o this line: Beibei None Liu
    middlename = ""
phonenumber = result.group('phone')
print('Name:', firstname, middlename, lastname,' Number:', phonenumber)

# TRY THIS: Making international calls
'''
'+two-digit country code'
not all numbers have a country code
'''
regexp = re.compile(r"(?P<last>[-a-zA-Z]+),"
                    r" (?P<first>[-a-zA-Z]+)"
                    r"( (?P<middle>([-a-zA-Z]+)))?"
                    r": (?P<phone>((\+\d{2} )?(\d{3}-)?\d{3}-\d{4}))"
                   )
result = regexp.search('liu, Beibei: +01 111-111-1111')
lastname = result.group('last') 
firstname = result.group('first')
middlename = result.group('middle')
if middlename == None: 
    middlename = ""
phonenumber = result.group('phone')
print('Name:', firstname, middlename, lastname,' Number:', phonenumber)

# replacement
string = "If the the problem is textual, use the the re module"
pattern = r"the the"
regexp = re.compile(pattern)
print(regexp.sub("the", string)) # not in place, scan its second argument, replacing all matching with the first argument
# more general, use a function on match obj 
int_string = "1 2 3 4 5"
def int_match_to_float(match_obj):
    return(match_obj.group('num') + ".0")
pattern = r"(?P<num>[0-9]+)"
regexp = re.compile(pattern)
print(regexp.sub(int_match_to_float, int_string))

# TRY THIS: Make any numbers that didn’t have a country code now have +1
regexp = re.compile(r"(?P<last>[-a-zA-Z]+),"
                    r" (?P<first>[-a-zA-Z]+)"
                    r"( (?P<middle>([-a-zA-Z]+)))?"
                    r": (?P<countrycode>\+\d{2} )?"
                    r"(?P<phone>(\d{3}-)?\d{3}-\d{4})"
                   )
result = regexp.search('liu, Beibei: 111-111-1111')
print(result)
lastname = result.group('last') 
firstname = result.group('first')
middlename = result.group('middle')
if middlename == None: 
    middlename = ""
countrycode = result.group('countrycode')
if countrycode == None: 
    countrycode = "+1 "
phonenumber = result.group('phone')
print('Name:', firstname, middlename, lastname,' Number:', countrycode, phonenumber)

'''
LAB 16: PHONE-NUMBER NORMALIZER
In practice, (NNN) NNN-NNNN, NNN-NNN-NNNN, NNN NNN-NNNN, NNN.NNN.NNNN,
and NNN NNN NNNN, to name a few. 
Also, the country code may not be present, may not have a +, 
and usually (not always) is separated from the number by a space or dash.
Create a phone-number normalizer that takes any of the formats and returns a normalized 1-NNN-NNN-NNNN.
'''
number_list = ['+1 223-456-7890', '1-223-456-7890', '+1 223 456-7890', '(223) 456-7890','1 223 456 7890', '223.456.7890','01 223.456.7890']
regexp = re.compile(r"(\+)?((?P<countrycode>\d{1,2})[\s-])?"
                    r"(\()?(?P<phone1>\d{3})(\))?[-\.)\s]"
                    r"(?P<phone2>\d{3})[-\.\s]"
                    r"(?P<phone3>\d{4})"
                   )
for i in number_list:
    result = regexp.search(i)
    # print(i, result)
    countrycode = result.group('countrycode')
    # print(countrycode)
    countrycode = "1"
    print('Phone number:', countrycode + '-' + result.group('phone1') + '-' + result.group('phone2') + '-' + result.group('phone3'))

<re.Match object; span=(4, 9), match='hello'>
None
<re.Match object; span=(0, 5), match='hello'>
<re.Match object; span=(4, 9), match='hello'>
<re.Match object; span=(0, 1), match='3'>
<re.Match object; span=(0, 1), match='0'>
\tTo be\n\tor not to be
<re.Match object; span=(0, 25), match='liu, Beibei: 111-111-1111'>
Name: Beibei  liu  Number: 111-111-1111
Name: Beibei  liu  Number: +01 111-111-1111
If the problem is textual, use the re module
1.0 2.0 3.0 4.0 5.0
<re.Match object; span=(0, 25), match='liu, Beibei: 111-111-1111'>
Name: Beibei  liu  Number: +1  111-111-1111
+1 223-456-7890 <re.Match object; span=(0, 15), match='+1 223-456-7890'>
1
Phone number: 1-223-456-7890
1-223-456-7890 <re.Match object; span=(0, 14), match='1-223-456-7890'>
1
Phone number: 1-223-456-7890
+1 223 456-7890 <re.Match object; span=(0, 15), match='+1 223 456-7890'>
1
Phone number: 1-223-456-7890
(223) 456-7890 <re.Match object; span=(0, 14), match='(223) 456-7890'>
None
Phone number: 1-223-456-7890
1 223 4

In [46]:
w = "TEMPLATES = ( ('index.html', 'home'), ('base.html', 'base'))"

# find outer parens
outer = re.compile("\((.+)\)")
mm = outer.search(w) # match="( ('index.html', 'home'), ('base.html', 'base'))" will match the most outter one
inner_str = mm.group(1) # ('index.html', 'home'), ('base.html', 'base')
'''
Group(0) locates the whole match expression. 
group(1) means the first paranthesis pair locates matching expression 1
group(2) says the second next paranthesis pair locates the match expression 2, and so on.
'''
m = re.search('(?<=abc)(d(e))f', 'abcdef')
print(m.group(0))
print(m.group(1))
print(m.group(2))

# find inner pairs
innerre = re.compile("\('([^']+)', '([^']+)'\)")

results = innerre.findall(inner_str)
print(results)
for x,y in results:
    print("%s <-> %s" % (x,y))


def
de
e
[('index.html', 'home'), ('base.html', 'base')]
index.html <-> home
base.html <-> base


In [44]:
# determine if one string is a substring of another, or if the start of a string contains some predefined text.
assert 'Boston' in 'Boston Marathon' # assert: false will raise error and stop running
assert 'Boston Marathon'.startswith('Boston')
assert 'Boston Marathon'.endswith('Boston') == False

regex = 'Boston'
random_text = 'Clown Patty'
match = re.search(regex, random_text)
assert match is None

matchable_text = 'Boston Marathon'
match = re.search(regex, matchable_text)
assert match is not None
start, end = match.start(), match.end() # method start/end -> no need to adjust by +1/-1
matched_string = matchable_text[start: end]
assert matched_string == 'Boston'

# case insenstive: 
for text in ['BOSTON', 'boston', 'BoSTOn']:
    assert re.search(regex, text, flags=re.IGNORECASE) is not None

# can’t directly distinguish between sub-characters and sub-phrases
assert 'in a' in 'sin apple'
assert 'in a' in 'win attached'
# solve by \b: capture the start and end points of words (as whitespaces and punctuation)
for regex in ['\\bin a\\b', r'\bin a\b']:
    for text in ['sin apple', 'win attached']:
        assert re.search(regex, text) is None

    text = 'Match in a string'
    assert re.search(regex, text) is not None

regex = r'I visited \b(Boston|Philadelphia|San Francisco)\b yesterday.'
assert re.search(regex, 'I visited Chicago yesterday.') is None

cities = ['Boston', 'Philadelphia', 'San Francisco']
for city in cities:
    assert re.search(regex, f'I visited {city} yesterday.') is not None

# Use compile to increase efficiency: 
# Match a regex against 100 strings. For every match, re.search will transform the regex into Python PatternObject.
# If we intend to use case-independent matching, then we must pass flags=re.IGNORECASE into re.compile.