### Get countries and make regexes out of them

In [None]:
import geonamescache
import re
gc = geonamescache.GeonamesCache()

# get_countries() yields a mapping; only its values (dicts carrying a 'name' key) are used.
gc_countries = gc.get_countries()
countries = list(gc_countries.values())

# One case-insensitive, whole-word pattern per country, index-aligned with `countries`.
# re.escape keeps regex metacharacters in a name (such as '.', '(' or ')') literal, so the
# name is matched as written instead of being interpreted as pattern syntax.
country_res = [
    re.compile(r'\b{}\b'.format(re.escape(country['name'])), flags=re.IGNORECASE)
    for country in countries
]
country_res

### Get cities and make regexes out of them

In [None]:
import geonamescache
import re
gc = geonamescache.GeonamesCache()

# get_cities() yields a mapping; only its values (dicts carrying a 'name' key) are used.
gc_cities = gc.get_cities()
cities = list(gc_cities.values())

# One case-insensitive, whole-word pattern per city, index-aligned with `cities`.
# re.escape keeps regex metacharacters in a name (such as '.', '(' or ')') literal, so the
# name is matched as written and an unbalanced bracket cannot break compilation.
city_res = [
    re.compile(r'\b{}\b'.format(re.escape(city['name'])), flags=re.IGNORECASE)
    for city in cities
]
city_res

### Get all US states and make regexes out of them

In [None]:
import geonamescache
import re
gc = geonamescache.GeonamesCache()

# get_us_states() yields a mapping; only its values (dicts carrying a 'name' key) are used.
gc_us_states = gc.get_us_states()
us_states = list(gc_us_states.values())

# One case-insensitive, whole-word pattern per state, index-aligned with `us_states`.
# Names are passed through re.escape so they are always treated as literal text,
# the same way the other name-to-regex cells should treat their inputs.
us_state_res = [
    re.compile(r'\b{}\b'.format(re.escape(us_state['name'])), flags=re.IGNORECASE)
    for us_state in us_states
]

### Get all US counties and make regexes out of them
#### Check if county ends with ' County' - if so regex for county X should be r'\b(X County|X)\b'

In [None]:
import geonamescache
import re
gc = geonamescache.GeonamesCache()

# Iterated directly below: each element is a dict carrying a 'name' key.
us_counties = gc.get_us_counties()

# One case-insensitive, whole-word pattern per county, index-aligned with `us_counties`.
# Names ending in ' County' also match without that suffix: r'\b(X County|X)\b'.
# Every name fragment goes through re.escape so metacharacters such as '.' stay literal.
us_county_res = list()
for county_dict in us_counties:
    county_name = county_dict['name']
    if county_name.endswith(' County'):
        # Longer alternative first so 'X County' is preferred over bare 'X'.
        county_pattern = r'\b({}|{})\b'.format(
            re.escape(county_name),
            re.escape(county_name[:-len(' County')]),
        )
    else:
        county_pattern = r'\b{}\b'.format(re.escape(county_name))
    us_county_res.append(re.compile(county_pattern, flags=re.IGNORECASE))

us_county_res

### Get headlines in list

In [None]:
import os

# Path is relative to the directory the notebook runs from.
headlines_path = os.path.join('..', 'data', 'headlines.txt')

# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the kernel.
with open(headlines_path, "r") as headlines_file:
    headlines_str = headlines_file.read()

# One entry per '\n'-separated line of the file.
headlines_list = headlines_str.split('\n')
headlines_list

### Create a factory so the same matching logic is not repeated for each list of regexes
    

In [None]:
def match_line_factory(regex_list, associated_list):
    """Build a matcher that tests one line against every regex in *regex_list*.

    *associated_list* is read by the same index as *regex_list*, so entry ``i``
    of one belongs to entry ``i`` of the other.

    The returned function takes a line and returns a list with one
    ``(line, match, regex, associated_item)`` tuple for each regex whose
    ``search`` found something in the line; the list is empty when none did.
    """
    def match_line(line):
        hits = []
        for position, pattern in enumerate(regex_list):
            found = pattern.search(line)
            if not found:
                continue
            hits.append((line, found, pattern, associated_list[position]))
        return hits

    return match_line


### Match each line in headlines with the various regexes
#### result of match: (line, match, regex, regex_data_source)
#### final data dict: each key has list of "result of match"

In [None]:
# One matcher per data source; each returns a list of
# (line, match, regex, regex_data_source) tuples for a given headline.
match_country = match_line_factory(country_res, countries)
match_city = match_line_factory(city_res, cities)
match_us_state = match_line_factory(us_state_res, us_states)
match_us_county = match_line_factory(us_county_res, us_counties)

# Run every headline through each matcher and keep only the non-empty results,
# in headline order.
country_matches = [hits for hits in map(match_country, headlines_list) if hits]
city_matches = [hits for hits in map(match_city, headlines_list) if hits]
us_state_matches = [hits for hits in map(match_us_state, headlines_list) if hits]
us_county_matches = [hits for hits in map(match_us_county, headlines_list) if hits]

# Final data dict: each key holds the list of per-headline results for one source.
data = {
    'country_matches': country_matches,
    'city_matches': city_matches,
    'us_state_matches': us_state_matches,
    'us_county_matches': us_county_matches,
}

data

In [None]:
# Display the country results: one list of match tuples per headline that matched.
data['country_matches']

In [None]:
# Display the city results: one list of match tuples per headline that matched.
data['city_matches']