### Get countries and make regexes out of them

In [None]:
import geonamescache
import re
import unidecode
gc = geonamescache.GeonamesCache()

gc_countries = gc.get_countries()
countries = list(gc_countries.values())
for country in countries:
    # Tag each record so a match can later be told apart by location type.
    country['@manning/type'] = 'country'
    # Transliterated form of the name; this is what the regex is built from.
    country['@manning/decoded_name'] = unidecode.unidecode(country['name'])

# One case-insensitive, word-bounded regex per country, index-aligned with
# `countries`. re.escape makes the name match literally, so characters such
# as '.' or '(' in a name are not interpreted as regex syntax.
country_res = [
    re.compile(r'\b{}\b'.format(re.escape(country['@manning/decoded_name'])), flags=re.IGNORECASE)
    for country in countries
]
country_res

### Get cities and make regexes out of them

In [None]:
import geonamescache
import re
import unidecode
gc = geonamescache.GeonamesCache()

gc_cities = gc.get_cities()
cities = list(gc_cities.values())
for city in cities:
    # Tag each record so a match can later be told apart by location type.
    city['@manning/type'] = 'city'
    # Transliterated form of the name; this is what the regex is built from.
    city['@manning/decoded_name'] = unidecode.unidecode(city['name'])

# One case-insensitive, word-bounded regex per city, index-aligned with
# `cities`. re.escape makes the name match literally, so characters such
# as '.' or '(' in a name are not interpreted as regex syntax.
city_res = [
    re.compile(r'\b{}\b'.format(re.escape(city['@manning/decoded_name'])), flags=re.IGNORECASE)
    for city in cities
]
city_res

### Get all US states and make regexes out of them

In [None]:
import geonamescache
import re
import unidecode
gc = geonamescache.GeonamesCache()

gc_us_states = gc.get_us_states()
us_states = list(gc_us_states.values())
for us_state in us_states:
    # Tag each record so a match can later be told apart by location type.
    us_state['@manning/type'] = 'us_state'
    # Transliterated form of the name; this is what the regex is built from.
    us_state['@manning/decoded_name'] = unidecode.unidecode(us_state['name'])

# One case-insensitive, word-bounded regex per state, index-aligned with
# `us_states`. re.escape makes the name match literally, so characters such
# as '.' in a name are not interpreted as regex syntax.
us_state_res = [
    re.compile(r'\b{}\b'.format(re.escape(us_state['@manning/decoded_name'])), flags=re.IGNORECASE)
    for us_state in us_states
]

### Get all US counties and make regexes out of them
#### Check if county ends with ' County' - if so regex for county X should be r'\b(X County|X)\b'

In [None]:
import geonamescache
import re
import unidecode
gc = geonamescache.GeonamesCache()

us_counties = gc.get_us_counties()
for us_county in us_counties:
    # Tag each record so a match can later be told apart by location type.
    us_county['@manning/type'] = 'us_county'
    # Transliterated form of the name; this is what the regex is built from.
    us_county['@manning/decoded_name'] = unidecode.unidecode(us_county['name'])

# One case-insensitive regex per county, index-aligned with `us_counties`.
# Names are passed through re.escape so that characters such as '.' are
# matched literally instead of being interpreted as regex syntax.
county_suffix = ' County'
us_county_res = list()
for county_dict in us_counties:
    decoded_name = county_dict['@manning/decoded_name']
    if decoded_name.endswith(county_suffix):
        # "X County" should also be found when the headline only says "X".
        # The longer alternative comes first so it is preferred when present.
        short_name = decoded_name[:-len(county_suffix)]
        pattern = r'\b({}|{})\b'.format(re.escape(decoded_name), re.escape(short_name))
    else:
        pattern = r'\b{}\b'.format(re.escape(decoded_name))
    us_county_res.append(re.compile(pattern, flags=re.IGNORECASE))

us_county_res

### Get headlines in list

In [None]:
import os
import unidecode

headlines_path = os.path.join('..', 'data', 'headlines.txt')
# Read inside a context manager so the file handle is closed as soon as the
# text has been loaded (the handle was previously left open).
with open(headlines_path, "r") as headlines_file:
    headlines_str = headlines_file.read()
# One headline per line: strip surrounding whitespace and run each line
# through unidecode, the same transformation applied to the location names.
headlines_list = [unidecode.unidecode(headline.strip()) for headline in headlines_str.split('\n')]
headlines_list

### Create a factory to avoid repeating same logic to match with various regexes
    

In [None]:
def match_line_factory(regex_list, associated_list):
    """Build a matcher that runs every regex in ``regex_list`` against a line.

    ``regex_list`` and ``associated_list`` are treated as parallel sequences:
    the item at index ``i`` of ``associated_list`` is attached to any match
    produced by the regex at index ``i``.

    Returns a function ``match_line(line)`` that returns a list containing one
    dict per regex that found something in ``line``. Each dict has the keys
    ``'headline'``, ``'match'``, ``'regex'``, ``'src_data'`` and ``'weight'``
    (always initialised to 0).
    """
    def match_line(line):
        found = list()
        for index, pattern in enumerate(regex_list):
            hit = pattern.search(line)
            if not hit:
                continue
            found.append({
                'headline': line,
                'match': hit,
                'regex': pattern,
                'src_data': associated_list[index],
                'weight': 0
            })
        return found
    return match_line


### Match each line in headlines with the various regexes

In [None]:
# One matcher per location type; each pairs a regex list with its source records.
match_country = match_line_factory(country_res, countries)
match_city = match_line_factory(city_res, cities)
match_us_state = match_line_factory(us_state_res, us_states)
match_us_county = match_line_factory(us_county_res, us_counties)

country_matches = []
city_matches = []
us_state_matches = []
us_county_matches = []
headline_to_all_matches = {}
for line in headlines_list:
    country_matches_for_line = match_country(line)
    city_matches_for_line = match_city(line)
    us_state_match_for_line = match_us_state(line)
    us_county_match_for_line = match_us_county(line)

    # Extending with an empty list is a no-op, so no emptiness check is needed.
    country_matches.extend(country_matches_for_line)
    city_matches.extend(city_matches_for_line)
    us_state_matches.extend(us_state_match_for_line)
    us_county_matches.extend(us_county_match_for_line)

    # Every match for this headline, ordered country, city, state, county.
    headline_to_all_matches[line] = (
        country_matches_for_line
        + city_matches_for_line
        + us_state_match_for_line
        + us_county_match_for_line
    )

# Per-type match lists plus the per-headline lookup under 'all'.
data = {
    'country_matches': country_matches,
    'city_matches': city_matches,
    'us_state_matches': us_state_matches,
    'us_county_matches': us_county_matches,
    'all': headline_to_all_matches,
}

data

### 'Zika Outbreak Hits Miami' headline

Matches:
- 'Miami' city
- 'Miami' us_county state 'IN'
- 'Miami' us_county state 'KS'
- 'Miami' us_county state 'OH'

See below for example of how to inspect matches for headline:

In [None]:
# Look the headline up by its text. The same list can be reached by position:
# matches = list(data['all'].values())[0]
matches = data['all']['Zika Outbreak Hits Miami']

# Expect the four matches listed above (one city and three counties).
assert len(matches) == 4
matches

### Some functions to weed out results

`zero_out_weights` resets the weights on matches

`reward_longest_name` adds weight to the match with the longest name (but ignores matches with ' County' in their name, as I think that suffix arbitrarily inflates the length). Also, counties don't seem to be that important here.

`reward_cities` adds weight to city matches: if a headline matches both a country and a city, the city carries more information (the city itself plus its country code), whereas a country match gives you no city. Because a city record only has `countrycode` (no country name), I only use `country['iso']` whenever I need to extract a value from a country. The reward for cities should also be greater than the reward for the longest name, so that the city still wins when a country with a longer name also matches.

In [None]:
import re

# Case-insensitive pattern for the ' County' suffix; used to skip county-style names.
county_re = re.compile(r' County', re.IGNORECASE)

def zero_out_weights(list_of_matches):
    """Set the ``'weight'`` entry of every match in ``list_of_matches`` to 0, in place."""
    for entry in list_of_matches:
        entry['weight'] = 0


def reward_longest_name(list_of_matches):
    """Add 1 to the weight of the match whose source name is longest.

    Matches whose ``src_data['name']`` contains ' County' (per ``county_re``)
    are not considered. On a tie the earliest match in the list wins, and a
    match with an empty name never wins. The list is modified in place and
    also returned.
    """
    best_length = 0
    winner = None
    for candidate in list_of_matches:
        name = candidate['src_data']['name']
        # County-style names are excluded from the length comparison.
        if county_re.search(name):
            continue
        # Strict comparison keeps the first of several equally long names.
        if len(name) > best_length:
            best_length, winner = len(name), candidate
    if winner:
        winner['weight'] += 1
    return list_of_matches


def reward_cities(list_of_matches):
    """Add 2 to the weight of every match whose source record is typed 'city'."""
    city_matches = (
        entry for entry in list_of_matches
        if entry['src_data']['@manning/type'] == 'city'
    )
    for city_match in city_matches:
        city_match['weight'] += 2

def get_best_match(list_of_matches=None):
    """Return the match with the highest ``'weight'``.

    When several matches share the highest weight, the first of them in
    iteration order is returned. Returns ``None`` when ``list_of_matches`` is
    empty or omitted, rather than letting ``max()`` raise ``ValueError``.
    """
    # `or ()` covers both the omitted argument (None) and an empty list, and
    # avoids using a shared mutable list as the default value.
    return max(list_of_matches or (), key=lambda match: match['weight'], default=None)
    
    
# list_of_matches = list(data['all'].values())[1]
# zero_out_weights(list_of_matches)
# reward_longest_name(list_of_matches)

# get_best_match(list_of_matches)

In [None]:
list_of_list_of_matches = list(data['all'].values())

# Three parallel columns; one row is appended per headline that has a usable best match.
df_dict = {'headline_col': list(), 'city_col': list(), 'country_col': list()}

for list_of_matches in list_of_list_of_matches:
    # Headlines without any match contribute no row.
    if not len(list_of_matches):
        continue

    zero_out_weights(list_of_matches)
    reward_longest_name(list_of_matches)
    reward_cities(list_of_matches)
    best_match = get_best_match(list_of_matches)

    best_src = best_match['src_data']
    best_type = best_src['@manning/type']
    if best_type == 'city':
        city_value = best_src['name']
        country_value = best_src['countrycode']
    elif best_type == 'country':
        # since in city there's only countrycode, I'm taking iso here:
        city_value = float('nan')
        country_value = best_src['iso']
    elif best_type == 'us_state' or best_type == 'us_county':
        # there is no city so go for the state / county instead
        # (location / surrounding area will include city...)
        city_value = best_src['name']
        country_value = 'US'
    else:
        # Any other type produces no row.
        continue

    df_dict['headline_col'].append(best_match['headline'])
    df_dict['city_col'].append(city_value)
    df_dict['country_col'].append(country_value)


# df_dict
(len(df_dict['headline_col']), len(df_dict['city_col']), len(df_dict['country_col']))

### Deliverable chapter 1

In [None]:
import pandas as pd

# Assemble the three parallel columns into the final headline / city / country table.
df = pd.DataFrame(dict(
    headline=df_dict['headline_col'],
    city=df_dict['city_col'],
    country=df_dict['country_col'],
))
df