# Exploring Wikipedia as a dataset for tar/inc tagging model
This notebook walks through the methods used to build a dataste of wikipedia pages to train our relational tagging model.

In [2]:
# standard library imports
import sys
import re
sys.path.append('../')
# third party imports
import wikipedia
from tqdm import tqdm
from shapely.geometry import Point, shape
from geopy.distance import distance
import pandas as pd
# local imports
from BERT_geoparser.gazetteer import Gazetteer
from BERT_geoparser.tokenizer import Tokenizer
from BERT_geoparser.model import BertModel
from BERT_geoparser.data import Data

## 2 Building a dataset of wikipedia locations
Now we have a trained NER-tagging model we can move on to buildng a new dataset to tage. To do this, we will randomly sample locations from a gazetteer of world cities, and use the wikipedia api to access the corresponding wikipedia page. We will then randomly sample sentences from that page an split them into tokens to input into the model.

# 2.1 Building a list of locations to search for
We have used work by Hertford et al (Hertford, 2023) to limit our search to countries which have relatively high OSM completeness. These countries are stored in the parameter `accepted_countries`. We then use the GeoNames datasbase to search for all cities with a population greater than 1000 within those countries. This dataset requires a little cleaning before use.

In [71]:
world_cities = pd.read_csv('../data/NB2/geonames_world_cities.csv', delimiter=';')
world_cities.tail()

Unnamed: 0,Geoname ID,Name,ASCII Name,Alternate Names,Feature Class,Feature Code,Country Code,Country name EN,Country Code 2,Admin1 Code,Admin2 Code,Admin3 Code,Admin4 Code,Population,Elevation,DIgital Elevation Model,Timezone,Modification date,LABEL EN,Coordinates
142027,1694792,Paraiso,Paraiso,Paraiso,P,PPL,PH,Philippines,,12,70,126306000,,4153,,90,Asia/Manila,2018-02-06,Philippines,"6.47606, 124.80954"
142028,1694812,Paracale,Paracale,Paracale,P,PPLA3,PH,Philippines,,5,15,51608000,,8290,,5,Asia/Manila,2017-12-13,Philippines,"14.2804, 122.7881"
142029,1695427,Pangal Sur,Pangal Sur,"Pangal,Pangal Sur",P,PPL,PH,Philippines,,2,31,23112000,,2710,,93,Asia/Manila,2017-12-13,Philippines,"16.60036, 121.66979"
142030,1695804,Panabo,Panabo,"Kota Panabo,Panabo,Panabo City,Panabo Lakanbalen",P,PPL,PH,Philippines,,11,I7,112315000,,84749,,4,Asia/Manila,2017-12-13,Philippines,"7.30806, 125.68417"
142031,1695994,Palsong,Palsong,Palsong,P,PPL,PH,Philippines,,5,16,51706000,,4566,,18,Asia/Manila,2017-12-13,Philippines,"13.4248, 123.2963"


In [9]:
# clean up the coordinates column
get_lat = lambda coords : float(coords.split(', ')[0])
get_lng = lambda coords : float(coords.split(', ')[1])

world_cities['lat'] = world_cities.Coordinates.apply(get_lat)
world_cities['lng'] = world_cities.Coordinates.apply(get_lng)
world_cities = world_cities.rename(columns={'Name':'city', 'ASCII Name':'city_ascii', 'Country name EN':'country', 'Population':'population'})

In [10]:
countries = world_cities.country.unique()

In [11]:
accepted_countries = ['United States', 'Canada', 'Portugal', 'Spain', 'France', 'Ireland', 'United Kingdom', 'Norway', 'Sweden', 'Denmark', 'Finland',
                      'Belgium', 'Netherlands', 'Switzerland', 'Austria', 'Germany', 'Italy', 'Czech Republic', 'Slovakia', 'Slovenia', 'Hungary', 'Bosnia and Herzegovina', 'Croatia',
                      'Slovakia', 'Poland', 'Lithuania', 'Latvia', 'Estonia', 'Iceland']

In [12]:
# check all the countries are in the world_cities list
for country in accepted_countries:
    if country not in world_cities.country.unique():
        print(country)

In [14]:
# cut the list of cities down to just those in our list of accepted countries
world_cities = world_cities[world_cities.country.isin(accepted_countries)]
print(f'Total locations in dataset: {len(world_cities)}')

Total locations in dataset: 72757


## 2.1 Checking that locations have associated OSM polygons
Our method for relational tagging is reliant on the target locations being associated with a spatial polygon in OSM.. We will use the `Gazetteer` class to search the Nominatim API for each locaiton, and check whether a polygon object is returned. This can take some time, depnding on the speed of your internet connect. It took about 90 mins to complete for me. 

In [208]:
# We're going to use the gazetteer to limit these to cities with OSM polygons
gazetteer = Gazetteer(polygon=True, addressdetails=False)
accepted_indices = []
for i, city in tqdm(world_cities.iterrows(), total=len(world_cities)):
    name = city.city
    coords = (float(city.lat), float(city.lng))
    country = city.country
    try:
        matches = gazetteer.query(name).json()
    except TypeError:
        continue
    for match in matches:
        if 'geojson' not in match.keys():
            continue
        if 'Polygon' not in match['geojson']['type']:
            continue
        if Point(coords) in shape(match['geojson']):
            accepted_indices.append(i)
            break
# cut or cdity list down to just the ones with polygons, remove duplicates.
poly_cities = world_cities.loc[accepted_indices]
deduplicated = poly_cities.drop_duplicates()
print(len(deduplicated))   
print(poly_cities.head())         

100%|██████████| 10851/10851 [1:30:44<00:00,  1.99it/s]


In [216]:
# save the dataset
deduplicated.to_csv('../data/NB2/world_cities.csv', index=False)

## 2.3 Searching wikipedia for these locations
We now need to search the Wikipeida API for locations matching these. We will do various check to ensure that the identified page matches the expected location, including checking for coordinates, checking for key words, and checking for disambiguation pages.

First, though, let's take a look at the data provided by the Wikipedia wpi.

In [15]:
# lets take a quick look at the wikipedia api
glasgow = wikipedia.WikipediaPage(title='Glasgow')

In [16]:
glasgow.summary

'Glasgow (UK:  GLA(H)Z-goh, GLA(H)SS-; Scots: Glesca [ˈɡleskə] or Glesga [ˈɡlezɡə]; Scottish Gaelic: Glaschu [ˈkl̪ˠas̪əxu]) is the most populous city in Scotland, and is the third-most populous city in the United Kingdom, as well as being the 27th largest city by population in Europe. In 2022, it had an estimated population as a defined locality of \n632,350 and anchored an urban settlement of \n1,028,220. Formed as a county of itself in 1893, the city had previously been in the historic county of Lanarkshire (or Clydesdale) and has also grown to include settlements that were once part of Renfrewshire and Dunbartonshire (or the Lennox). It now forms the Glasgow City Council area, one of the 32 council areas of Scotland, and is administered by Glasgow City Council. \nGlasgow has the largest economy in Scotland and the third-highest GDP per capita of any city in the UK. Glasgow\'s major cultural institutions enjoy international reputations including The Royal Conservatoire of Scotland, B

In [17]:
# we'll need a function to remove anything inside brackets to avoid the pronounciation gunk
def remove_parentheses(input_string):
    result = ""
    stack = []
    for char in input_string:
        if char in ['(', '[']:
            stack.append(char)
        elif char in [')', ']']:
            if stack:
                stack.pop()
            else:
                result += char
        elif not stack:
            result += char
    
    return result.replace('  ', ' ')

In [18]:
out = remove_parentheses(glasgow.summary)
print(out)

Glasgow is the most populous city in Scotland, and is the third-most populous city in the United Kingdom, as well as being the 27th largest city by population in Europe. In 2022, it had an estimated population as a defined locality of 
632,350 and anchored an urban settlement of 
1,028,220. Formed as a county of itself in 1893, the city had previously been in the historic county of Lanarkshire and has also grown to include settlements that were once part of Renfrewshire and Dunbartonshire . It now forms the Glasgow City Council area, one of the 32 council areas of Scotland, and is administered by Glasgow City Council. 
Glasgow has the largest economy in Scotland and the third-highest GDP per capita of any city in the UK. Glasgow's major cultural institutions enjoy international reputations including The Royal Conservatoire of Scotland, Burrell Collection, Kelvingrove Art Gallery and Museum, Royal Scottish National Orchestra, BBC Scottish Symphony Orchestra, Scottish Ballet and Scottish

In [19]:
# and lets look at where disambiguation pages land
hull = wikipedia.page('Hull')
hull.categories



  lis = BeautifulSoup(html).find_all('li')


DisambiguationError: "tall" may refer to: 
height
human height
Tall, Semnan
River Tall
Tall: The American Skyscraper and Louis Sullivan
Mr. Tall
Tall (surname)
Tell (archaeology)
List of people known as the Tall
TAL (disambiguation)
Tell (disambiguation)
All pages with titles beginning with tall
All pages with titles beginning with tallest
All pages with titles containing tall

In [20]:
# all disambiguation pages lead to a page error, lets catch that 
def disamb_check(query):
    try:
        wikipedia.WikipediaPage(query)
        return False
    except (wikipedia.DisambiguationError, wikipedia.PageError):
        return True


In [21]:
check = disamb_check('hull')
check

True

In [22]:
check = disamb_check('glasgow')
check

False

We will also need a way to check that the found page refers to a city and not to something else. Unfortunately the wikipedia 'Location' category is not applied consistently enough to be suitable for this. Rather, we'll use a slightly hacky approach in which either use the coordinates provided on the iwkipedis page (if available), or other look for words related to locations within the first two sentences of the page.


In [23]:
# first we should check if the coordinates are roughly correct
def check_coordinates(page, true_coords, threshold=10):
    if not page.coordinates:
        return False
    wiki_coords = page.coordinates
    d = distance(true_coords, wiki_coords).km
    return d <= threshold
    
    

In [26]:
# Lets test it with the coordinates for Birmingham Alabama
page = wikipedia.WikipediaPage('Birmingham')
true_coords = (33.52066, -86.80249)

check_coordinates(page, true_coords)

False

In [27]:
page = wikipedia.WikipediaPage('Birmingham, AL')
true_coords = (33.52066, -86.80249)

check_coordinates(page, true_coords)

True

In [43]:
# looking for words related to cities in the first sentence (usually X is a city in Y)
city_words = ['city', 'town', 'village', 'township', 'commune', 'community', 'settlement', 'district', 'municipality', 'metropolis', 'conurbation']

def check_is_city(page):
    sentences = page.summary.split('.')
    out = False
    for word in city_words:
        if word in sentences[0].lower():
            out = True
    return out

In [47]:
# Looking for mentions of the country the city is situated in
def check_correct_country(page, country):
    sentences = page.summary.split('.')
    for sentence in sentences[:3]:
        if country in sentence:
            return True
    return False

In [48]:
# lets test this by looking for Paradise, California
page = wikipedia.WikipediaPage('paradise')

check_is_city(page)

False

In [49]:
page = wikipedia.WikipediaPage('paradise (California)')

check_is_city(page)

True

In [55]:
# And we need a check to see if the page exists at all.
def check_page_exists(query):
    try:
        wikipedia.WikipediaPage(query)
        return True
    except wikipedia.PageError:
        return False

Some of the wikipedia summaries are formatted so that line breaks can appear as `<word>.<word>` in the summary. We need to put in a space after the period when this occurs

In [56]:
def add_space_after_period(text):
    pattern = r'([A-Za-z])\.([A-Za-z])'
    replacement = r'\1. \2'
    result = re.sub(pattern, replacement, text)
    return result

finally, we'll need a function which can split a piece of text into full sentences, up to the `max_len` value in the model.

In [57]:
import nltk
nltk.download('punkt')

def split_into_sentences(text, max_words=60):
    sentences = nltk.sent_tokenize(text)
    result = []
    current_sentence = ""

    for sentence in sentences:
        if len(current_sentence.split()) + len(sentence.split()) <= max_words:
            current_sentence += " " + sentence
        else:
            result.append(current_sentence.strip())
            current_sentence = sentence

    # Add the last substring
    if current_sentence:
        result.append(current_sentence.strip())

    return result

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jws10y\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
sentences = split_into_sentences(glasgow.summary)
sentences

['Glasgow (UK:  GLA(H)Z-goh, GLA(H)SS-; Scots: Glesca [ˈɡleskə] or Glesga [ˈɡlezɡə]; Scottish Gaelic: Glaschu [ˈkl̪ˠas̪əxu]) is the most populous city in Scotland, and is the third-most populous city in the United Kingdom, as well as being the 27th largest city by population in Europe.',
 'In 2022, it had an estimated population as a defined locality of \n632,350 and anchored an urban settlement of \n1,028,220. Formed as a county of itself in 1893, the city had previously been in the historic county of Lanarkshire (or Clydesdale) and has also grown to include settlements that were once part of Renfrewshire and Dunbartonshire (or the Lennox).',
 'It now forms the Glasgow City Council area, one of the 32 council areas of Scotland, and is administered by Glasgow City Council. Glasgow has the largest economy in Scotland and the third-highest GDP per capita of any city in the UK.',
 "Glasgow's major cultural institutions enjoy international reputations including The Royal Conservatoire of S

In [59]:
# This function draws together each of the page processing functions. 
def process_page(page):
    raw_summary = page.summary
    p1_summary = remove_parentheses(raw_summary)
    p2_summary = add_space_after_period(p1_summary)
    sentences = split_into_sentences(p2_summary, 70)
    return sentences

 We now have all the funcitons we need to scrape Wikpedia for the locations in the dataset. This takes a long time and can run into problems if an internet connection drops out. It took around 5 hours to complete for me.

In [None]:
# we can now loop over each of our cities and find the wikipedia page associated with each
rows = []
for idx, row in tqdm(poly_cities.iterrows(), total = len(poly_cities)):
    location = row.city_ascii
    state = row['Admin1 Code']
    country = row.country
    coords = (row.lat, row.lng)
    
    # check for disambiguation and add state/country if disamb reached
    if disamb_check(location):
        # pages for US locations work better if you add the state
        if country == 'United States':
            location += f' ({state})'
        else:
            location += f' ({country})'
        # if the new location doesn't have a page then skip
        if disamb_check(location):
            continue
    
    ## check a page exists related to that location
    if not check_page_exists(location):
        continue
    
    # retrieve page
    page = wikipedia.WikipediaPage(location)
    # check if the page has coordinates:
    try:
        page.coordinates
        has_coords = True
    except KeyError:
        has_coords = False
        correct_coords = False
    # check the coordinates match
    if (has_coords) and not (check_coordinates(page, coords)):
        if country == 'united_states':
            location += state
        else:
            location += f'({country})'
        # check a page exists, skip otherwise
        if check_page_exists(location):  
            page = wikipedia.page(location)
            correct_coords = False
            # check again with the new page, if still a miss then skip
            if not check_coordinates(page, coords):
                continue
        else:
            continue

    elif (has_coords) and (check_coordinates(page, coords)):
        correct_coords = True
        
    # check country is correct
    if not (correct_coords) and not (check_correct_country(page, country)):
        location += f'({country})'
        # check a page exists, skip otherwise
        if check_page_exists(location):  
            page = wikipedia.page(location)
        else:
            continue
    
    # check this is a page for a city
    if not (correct_coords) and not (check_is_city(page)):
        continue 
    
    # If it gets this far then the wikipedia page is confirmed as the correct one! 
    sentences = process_page(page)
    for sentence in sentences:
        row = {}
        row['city'] = location
        row['country'] = country
        row['coordinates'] = coords
        row['text'] = sentence
        rows.append(row)
        
processed_wiki_cities = pd.DataFrame(rows)
        
processed_wiki_cities.to_csv('../data/NB2/wiki_cities_reference.csv', index=False)    

## 2.3 Adding US Counties
The current dataset is made mostly of towns and cities. As such, there is likely to be limited representation of [CHI] tags in the data. We will address this by adding US counties dataset. Once agian, we need to process these to ensure they can be associated to an OSM polygon and a valid Wikipedia page. This takes a couple of hours to run.

In [72]:
us_counties = pd.read_csv('../data/NB2/us_counties.csv', encoding='latin')
us_counties.head()

clean_latitude = lambda lat : float(lat[1:-1])
clean_longitude = lambda lng : -float(lng[1:-1])

us_counties['Latitude'] = us_counties['Latitude'].apply(clean_latitude)
us_counties['Longitude'] = us_counties['Longitude'].apply(clean_longitude)
us_counties = us_counties.rename(columns={'County [2]':'County'})

In [73]:
us_counties

Unnamed: 0,Sort [1],State,FIPS,County,County Seat(s) [3],Population,Land Area,Land Area.1,Water Area,Water Area.1,Total Area,Total Area.1,Latitude,Longitude
0,1,AL,1001,Autauga,Prattville,54571,1539.58,594.436,25.776,9.952,1565.36,604.388,32.536382,-86.644490
1,2,AL,1003,Baldwin,Bay Minette,182265,4117.52,1589.78,1133.19,437.527,5250.71,2027.31,30.659218,-87.746067
2,3,AL,1005,Barbour,Clayton,27457,2291.82,884.876,50.865,19.639,2342.68,904.515,31.870670,-85.405456
3,4,AL,1007,Bibb,Centreville,22915,1612.48,622.582,9.289,3.587,1621.77,626.169,33.015893,-87.127148
4,5,AL,1009,Blount,Oneonta,57322,1669.96,644.776,15.157,5.852,1685.12,650.628,33.977448,-86.567246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,3139,WY,56037,Sweetwater,Green River,43806,27004.90,10426.65,166.887,64.436,27171.78,10491.09,41.660339,-108.875676
3139,3140,WY,56039,Teton,Jackson,21294,10347.98,3995.38,572.266,220.953,10920.25,4216.33,44.049321,-110.588102
3140,3141,WY,56041,Uinta,Evanston,21118,5390.45,2081.26,16.342,6.31,5406.79,2087.57,41.284726,-110.558947
3141,3142,WY,56043,Washakie,Worland,8533,5797.82,2238.55,10.762,4.155,5808.58,2242.70,43.878831,-107.669052


In [74]:
from shapely.geometry import shape
# We're again going to use the gazetteer to limit these to counties with OSM polygons
gazetteer = Gazetteer(polygon=True, addressdetails=False)
accepted_indices = []
for i, county in tqdm(us_counties.iterrows(), total=len(us_counties)):
    name = county.County
    coords = (float(county.Longitude), float(county.Latitude))
    state = county.State
    try:
        matches = gazetteer.query(name).json()
    except TypeError:
        continue
    for match in matches:
        if 'geojson' not in match.keys():
            continue
        if 'Polygon' not in match['geojson']['type']:
            continue
        if Point(coords).within(shape(match['geojson'])):
            accepted_indices.append(i)
            break

  0%|          | 0/3143 [00:00<?, ?it/s]

100%|██████████| 3143/3143 [27:43<00:00,  1.89it/s] 


In [75]:
location = 'Autauga County'
if disamb_check(location):
    new_location = location + f', {state}'
    
    # if the new location doesn't have a page then skip
    if disamb_check(new_location):
        print(f'"{location}" leads to disambiguation')

In [76]:
wikipedia.WikipediaPage('Autauga County, AL')

<WikipediaPage 'Autauga County, Alabama'>

In [77]:
poly_counties = us_counties.loc[list(set(accepted_indices))]

In [78]:
rows = []
for idx, row in tqdm(poly_counties.iterrows(), total = len(poly_counties)):
    location = remove_parentheses(row.County + ' County')
    state = row['State']
    country = 'United States'
    coords = (row.Latitude, row.Longitude)
    
    location += f', {state}'
    # check for disambiguation and add state/country if disamb reached
    if disamb_check(location):
        print(f'Page for "{location}" cannot be disambiguated and/or found.')
        continue
    # retrieve page
    page = wikipedia.WikipediaPage(location)   
    sentences = process_page(page)
    for sentence in sentences:
        row = {}
        row['city'] = location
        row['country'] = country
        row['coordinates'] = coords
        row['text'] = sentence
        rows.append(row)
        
processed_wiki_counties = pd.DataFrame(rows)

  2%|▏         | 62/2817 [02:20<1:20:22,  1.75s/it]

Page for "Aleutians East County, AK" cannot be disambiguated and/or found.


  2%|▏         | 63/2817 [02:21<1:02:10,  1.35s/it]

Page for "Anchorage County, AK" cannot be disambiguated and/or found.


  2%|▏         | 64/2817 [02:21<49:44,  1.08s/it]  

Page for "Bristol Bay County, AK" cannot be disambiguated and/or found.


  2%|▏         | 65/2817 [02:22<40:56,  1.12it/s]

Page for "Denali County, AK" cannot be disambiguated and/or found.


  2%|▏         | 66/2817 [02:22<34:48,  1.32it/s]

Page for "Fairbanks North Star County, AK" cannot be disambiguated and/or found.


  2%|▏         | 67/2817 [02:22<30:34,  1.50it/s]

Page for "Haines County, AK" cannot be disambiguated and/or found.


  2%|▏         | 68/2817 [02:23<27:34,  1.66it/s]

Page for "Juneau County, AK" cannot be disambiguated and/or found.


  2%|▏         | 69/2817 [02:23<25:29,  1.80it/s]

Page for "Kenai Peninsula County, AK" cannot be disambiguated and/or found.


  2%|▏         | 70/2817 [02:24<23:57,  1.91it/s]

Page for "Ketchikan Gateway County, AK" cannot be disambiguated and/or found.


  3%|▎         | 71/2817 [02:24<22:58,  1.99it/s]

Page for "Kodiak Island County, AK" cannot be disambiguated and/or found.


  3%|▎         | 72/2817 [02:25<22:13,  2.06it/s]

Page for "Lake and Peninsula County, AK" cannot be disambiguated and/or found.


  3%|▎         | 73/2817 [02:25<21:32,  2.12it/s]

Page for "Matanuska-Susitna County, AK" cannot be disambiguated and/or found.


  3%|▎         | 74/2817 [02:26<21:35,  2.12it/s]

Page for "North Slope County, AK" cannot be disambiguated and/or found.


  3%|▎         | 75/2817 [02:26<21:27,  2.13it/s]

Page for "Northwest Arctic County, AK" cannot be disambiguated and/or found.


  3%|▎         | 76/2817 [02:27<20:53,  2.19it/s]

Page for "Sitka County, AK" cannot be disambiguated and/or found.


  3%|▎         | 77/2817 [02:27<21:05,  2.17it/s]

Page for "Skagway County, AK" cannot be disambiguated and/or found.


  3%|▎         | 78/2817 [02:27<21:03,  2.17it/s]

Page for "Wrangell County, AK" cannot be disambiguated and/or found.


  3%|▎         | 79/2817 [02:28<20:42,  2.20it/s]

Page for "Yakutat County, AK" cannot be disambiguated and/or found.


 18%|█▊        | 496/2817 [19:02<1:06:50,  1.73s/it]

Page for "Hawai?i County, HI" cannot be disambiguated and/or found.


 18%|█▊        | 520/2817 [19:55<1:05:48,  1.72s/it]

Page for "Idaho County, ID" cannot be disambiguated and/or found.


 36%|███▌      | 1020/2817 [38:59<52:43,  1.76s/it]  

Page for "Acadia County, LA" cannot be disambiguated and/or found.


 36%|███▌      | 1021/2817 [38:59<40:49,  1.36s/it]

Page for "Allen County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1022/2817 [39:00<32:41,  1.09s/it]

Page for "Ascension County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1023/2817 [39:00<26:49,  1.11it/s]

Page for "Assumption County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1024/2817 [39:00<22:44,  1.31it/s]

Page for "Avoyelles County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1025/2817 [39:01<19:59,  1.49it/s]

Page for "Beauregard County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1026/2817 [39:01<18:09,  1.64it/s]

Page for "Bienville County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1027/2817 [39:02<16:41,  1.79it/s]

Page for "Bossier County, LA" cannot be disambiguated and/or found.


 36%|███▋      | 1028/2817 [39:02<15:41,  1.90it/s]

Page for "Caddo County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1029/2817 [39:03<14:55,  2.00it/s]

Page for "Calcasieu County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1030/2817 [39:03<14:27,  2.06it/s]

Page for "Caldwell County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1031/2817 [39:04<14:08,  2.11it/s]

Page for "Catahoula County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1032/2817 [39:04<14:08,  2.10it/s]

Page for "Claiborne County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1033/2817 [39:05<13:50,  2.15it/s]

Page for "Concordia County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1034/2817 [39:05<13:48,  2.15it/s]

Page for "De Soto County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1035/2817 [39:05<13:47,  2.15it/s]

Page for "East Carroll County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1036/2817 [39:06<13:38,  2.18it/s]

Page for "East Feliciana County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1037/2817 [39:06<13:30,  2.20it/s]

Page for "Evangeline County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1038/2817 [39:07<13:43,  2.16it/s]

Page for "Iberia County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1039/2817 [39:07<13:30,  2.19it/s]

Page for "Iberville County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1040/2817 [39:08<13:27,  2.20it/s]

Page for "Jefferson Davis County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1041/2817 [39:08<13:19,  2.22it/s]

Page for "Lafayette County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1042/2817 [39:09<13:15,  2.23it/s]

Page for "Lafourche County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1043/2817 [39:09<13:15,  2.23it/s]

Page for "La Salle County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1044/2817 [39:09<13:16,  2.23it/s]

Page for "Livingston County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1045/2817 [39:10<13:27,  2.20it/s]

Page for "Morehouse County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1046/2817 [39:10<13:21,  2.21it/s]

Page for "Natchitoches County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1047/2817 [39:11<13:14,  2.23it/s]

Page for "Ouachita County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1048/2817 [39:11<13:14,  2.23it/s]

Page for "Plaquemines County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1049/2817 [39:12<13:14,  2.23it/s]

Page for "Pointe Coupee County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1050/2817 [39:12<13:14,  2.22it/s]

Page for "Rapides County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1051/2817 [39:13<13:14,  2.22it/s]

Page for "Red River County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1052/2817 [39:13<13:14,  2.22it/s]

Page for "Richland County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1053/2817 [39:14<13:09,  2.23it/s]

Page for "Sabine County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1054/2817 [39:14<14:29,  2.03it/s]

Page for "St. Bernard County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1055/2817 [39:15<13:59,  2.10it/s]

Page for "St. Charles County, LA" cannot be disambiguated and/or found.


 37%|███▋      | 1056/2817 [39:15<14:09,  2.07it/s]

Page for "St. John the Baptist County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1057/2817 [39:16<13:48,  2.12it/s]

Page for "St. Landry County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1058/2817 [39:16<13:40,  2.14it/s]

Page for "St. Mary County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1059/2817 [39:16<13:28,  2.18it/s]

Page for "St. Tammany County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1060/2817 [39:17<13:38,  2.15it/s]

Page for "Tangipahoa County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1061/2817 [39:17<13:34,  2.16it/s]

Page for "Tensas County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1062/2817 [39:18<13:26,  2.18it/s]

Page for "Terrebonne County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1063/2817 [39:18<13:22,  2.19it/s]

Page for "Vermilion County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1064/2817 [39:19<13:16,  2.20it/s]

Page for "Vernon County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1065/2817 [39:19<13:14,  2.21it/s]

Page for "Webster County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1066/2817 [39:20<13:11,  2.21it/s]

Page for "West Baton Rouge County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1067/2817 [39:20<13:10,  2.21it/s]

Page for "West Carroll County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1068/2817 [39:21<13:09,  2.22it/s]

Page for "West Feliciana County, LA" cannot be disambiguated and/or found.


 38%|███▊      | 1069/2817 [39:21<13:02,  2.23it/s]

Page for "Winn County, LA" cannot be disambiguated and/or found.


 57%|█████▋    | 1597/2817 [59:39<35:56,  1.77s/it]  

Page for "Carson City County, NV" cannot be disambiguated and/or found.


 94%|█████████▍| 2652/2817 [1:40:50<04:45,  1.73s/it]

Page for "Charlottesville County, VA" cannot be disambiguated and/or found.


 94%|█████████▍| 2653/2817 [1:40:50<03:41,  1.35s/it]

Page for "Falls Church County, VA" cannot be disambiguated and/or found.


 94%|█████████▍| 2654/2817 [1:40:51<02:55,  1.08s/it]

Page for "Harrisonburg County, VA" cannot be disambiguated and/or found.


 94%|█████████▍| 2655/2817 [1:40:51<02:24,  1.12it/s]

Page for "Manassas Park County, VA" cannot be disambiguated and/or found.


 94%|█████████▍| 2656/2817 [1:40:52<02:01,  1.32it/s]

Page for "Martinsville County, VA" cannot be disambiguated and/or found.


 94%|█████████▍| 2657/2817 [1:40:52<01:46,  1.50it/s]

Page for "Poquoson County, VA" cannot be disambiguated and/or found.


 94%|█████████▍| 2658/2817 [1:40:52<01:37,  1.62it/s]

Page for "Virginia Beach County, VA" cannot be disambiguated and/or found.


100%|██████████| 2817/2817 [1:47:05<00:00,  2.28s/it]


In [79]:
processed_wiki_counties.to_csv('../data/NB2/wiki_counties_reference.csv', index = False)

## 2.4 Tagging locations in the datasets
Finally, we need to use our trained TopoBERT model to identify toponyms in each dataset. The two datasets will be joined together after relational retagging in notebook 3.


In [None]:
# Load the dataset using the BERT_geoparser Data.py module
data_csv = r'../data/NB1/wikineural_train_dataset.csv' # specify the data used to train the model for correct tokenization.
tokenizer = Tokenizer(size='large', cased=True)
data = Data(data_path=data_csv, 
            tokenizer=tokenizer,
            max_len=80)

model = BertModel(saved_model='../models/TopoBERT_WikiNeural.hdf5', data=data)
model.model.summary()

cities_data = pd.read_csv('../data/NB2/wiki_cities_reference.csv')
counties_data = pd.read_csv('../data/NB2/wiki_counties_reference.csv')

print('Tagging cities dataset')
cities_results_df = model.results_dataframe(cities_data.text.values, include_best=True)
print('Tagging counties dataset')
counties_results_df = model.results_dataframe(counties_data.text.values, include_best=True)

In [48]:
cities_results_df.to_csv('../data/NB2/wiki_cities_tagged.csv', index=False)
counties_results_df.to_csv('../data/NB2/wiki_counties_tagged.csv', index=False)