### Article web page processing

In [1]:
import html2text
import json
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
def text_from_html(url: str) -> tuple:
    """
    Download an article page and return its readable text and outbound links.

    The page is fetched with ``requests``. Links are collected from ``<a>``
    tags, and the HTML is converted to plain text with ``html2text`` (links and
    images ignored). When the "ShareTweetEmail" marker is present, the text is
    cut to the span that follows its first occurrence and precedes the next
    one; markdown punctuation is then replaced by spaces.

    Args:
        url: Address of the article web page.

    Returns:
        A ``(text, links)`` tuple: the cleaned article text and the list of
        hrefs that start with "http", are longer than 5 characters and do not
        contain "nationalgeographic".
    """
    road_trip = requests.get(url).text
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(road_trip, "html.parser")
    links = []
    for link in soup.find_all('a'):
        http_link = link.get('href')
        # Anchors without an href yield None, which would crash the checks below.
        if not http_link:
            continue
        if "nationalgeographic" in http_link:
            continue
        if not http_link.startswith("http"):
            continue
        if len(http_link) > 5:
            links.append(http_link)

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    road_trip_processed = text_maker.handle(road_trip)

    # Trim to the article body only when the marker is actually present;
    # str.find returns -1 on a miss, which used to silently drop the first
    # characters (start) or the last character (end) of the text.
    marker = "ShareTweetEmail"
    start_index = road_trip_processed.find(marker)
    if start_index != -1:
        road_trip_processed = road_trip_processed[start_index + len(marker) + 1:]
    end_index = road_trip_processed.find(marker)
    if end_index != -1:
        road_trip_processed = road_trip_processed[:end_index]

    # Paragraph breaks become sentence breaks; markdown punctuation becomes spaces.
    road_trip_processed = road_trip_processed.replace("\n\n", ". ")
    for i in ["\n", "[", "]", "*", "#", "_", "\\'", '\\']:
        road_trip_processed = road_trip_processed.replace(i, " ")
    road_trip_processed = road_trip_processed.replace('>', ", ")
    # Collapse runs of spaces of any length (a fixed number of passes only
    # handles runs up to a bounded size).
    while "  " in road_trip_processed:
        road_trip_processed = road_trip_processed.replace("  ", " ")
    road_trip_processed = road_trip_processed.replace("..", ".")

    return road_trip_processed.strip(), links

def get_text_and_places(articles_path: str = '/home/geoner/data/articles.json') -> dict:
    """
    Load the article index and attach the downloaded text to every entry.

    Args:
        articles_path: Path of a JSON file holding a mapping whose values are
            dicts with an "article_url" key. Defaults to the location
            originally hard-coded in this notebook.

    Returns:
        The loaded mapping, where each entry additionally has a "text" key
        holding the text returned by ``text_from_html`` for its URL.
    """
    with open(articles_path, encoding="utf-8") as json_file:
        data = json.load(json_file)
    for article in data.values():
        # Only the text is needed here; the extracted links are discarded.
        text, _links = text_from_html(article["article_url"])
        article["text"] = text
    return data

In [3]:
%time data = get_text_and_places()

CPU times: user 1.07 s, sys: 22 ms, total: 1.09 s
Wall time: 9.06 s


### Extractor

In [4]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_trf')
def get_ne(txt: str) -> list:
    """
    Extract the unique location-like named entities mentioned in ``txt``.

    The module-level spaCy pipeline ``nlp`` is run over the text and entities
    labelled GPE, FAC, LOC or ORG are kept. Each name is normalised by removing
    a leading "the " article, a trailing comma and a trailing possessive "'s".

    Args:
        txt: Text to analyse.

    Returns:
        A list of distinct, non-empty entity names. The list is built from a
        set, so its order is unspecified.
    """
    ne_types = {"GPE", "FAC", "LOC", "ORG"}
    article = "the "
    names = set()
    for ent in nlp(txt).ents:
        if ent.label_ not in ne_types:
            continue
        name = str(ent).strip()
        # Strip the article only when it is a separate word: a bare
        # startswith("the") also matched names such as "theatre district"
        # and chopped their first four characters.
        if name == article.strip():
            name = ""
        elif name.startswith(article):
            name = name[len(article):]
        if name.endswith(','):
            name = name[:-1]
        name = name.strip()
        if name.endswith('\'s'):
            name = name[:-2]
        name = name.strip()
        # An empty name carries no information for the gazetteer lookups.
        if name:
            names.add(name)
    return list(names)

### Gazetteer

In [6]:
import difflib
def str_similarity_percent(a: str, b: str) -> float:
    """
    Return the difflib similarity ratio of two strings, rounded to 3 decimals.

    Despite the name, the value is a ratio in [0, 1], not a percentage.
    A single space is passed to the matcher as a junk character.
    """
    def _is_space(char):
        return char == " "

    matcher = difflib.SequenceMatcher(_is_space, a, b)
    similarity = matcher.ratio()
    return round(similarity, 3)

In [7]:
# GeoNames: 20'000 credits daily limit per application (identified by the parameter 'username'),
# the hourly limit is 1000 credits. A credit is a web service request hit for most services.
# An exception is thrown when the limit is exceeded.
# NOTE(review): the account name is hard-coded in the URLs and sent over plain http;
# consider reading it from configuration and confirming whether https is available.
geonames = {
    "exact_match":"http://secure.geonames.org/searchJSON?maxRows=30&username=kwh44&name_equals=",
    "general_match": "http://secure.geonames.org/searchJSON?maxRows=30&username=kwh44&name="
}
# https://www.gisgraphy.com/documentation/user-guide.php#fulltextservice
gisgraphy = {
    "exact_match": "https://services.gisgraphy.com/fulltext/fulltextsearch?&format=json&allwordsrequired=true&q=",
    "general_match": "https://services.gisgraphy.com/fulltext/fulltextsearch?&format=json&q="
}
# Kept so existing references to the name still resolve; the RDF URL below is
# a template and does not read this value.
resource_name = ""
dbpedia = {
    "resource_name_match": "https://lookup.dbpedia.org/api/search?&typeName=place&format=JSON_FULL&maxResults=3&query=",
    # Template: fill it with .format(resource_name=...). It used to be an
    # f-string, which was evaluated once at definition time with an empty
    # resource_name and therefore always produced ".../data/.json".
    "resource_rdf_get": "https://dbpedia.org/data/{resource_name}.json"
}

### Existence Detection

In [8]:
import reverse_geocoder as rg
import unidecode

In [9]:
def get_state_by_lat_lng(lat, lng):
    """
    Reverse-geocode one coordinate pair and return the first ``rg.search`` match.

    The whole match record is returned, not just the state; callers in this
    notebook read its "admin1" and "cc" entries.
    """
    coordinates = (lat, lng)
    matches = rg.search(coordinates)
    return matches[0]

In [10]:
class Place:
    """
    One geocoded candidate for a place name.

    Two places compare equal when their name, country and state match;
    coordinates, place type and link are not part of the comparison.
    Because ``__eq__`` is defined without ``__hash__``, instances are not
    hashable.
    """

    def __init__(self, place_name, country, state, place_type, lat, long, link=None):
        """
        Args:
            place_name: Name the candidate was looked up under.
            country: Country identifier of the candidate.
            state: First-level administrative area of the candidate.
            place_type: Gazetteer-specific description of the feature type.
            lat: Latitude; converted with ``float``.
            long: Longitude; converted with ``float``.
            link: Optional link associated with the place.

        Raises:
            ValueError, TypeError: If ``lat`` or ``long`` cannot be converted
                to ``float``.
        """
        self.place_name = place_name
        self.country = country
        self.state = state
        self.place_type = place_type
        self.lat = float(lat)
        self.long = float(long)
        self.link = link

    def __repr__(self):
        return (
            f"Place name: {self.place_name}; State: {self.state}, "
            f"Country: {self.country}, Place Type: {self.place_type}, "
            f"Lat: {self.lat}, Long: {self.long}"
        )

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing with AttributeError on a missing attribute.
        if not isinstance(other, Place):
            return NotImplemented
        return self.place_name == other.place_name and self.country == other.country and \
            self.state == other.state

In [31]:
# can be made faster by parallelizing
def existence_check(extracted_places: list, dbpedia_timeout: float = 0.2) -> tuple:
    """
    Look up every extracted place name in DBpedia, GeoNames and Gisgraphy.

    DBpedia is always queried: for each lookup hit whose label (or one of its
    redirect labels) has a similarity of at least 0.65 to the place name, the
    resource's RDF JSON is downloaded. The gazetteers are then tried in order
    (GeoNames exact, Gisgraphy exact, GeoNames partial, Gisgraphy partial)
    and the first one reporting a usable match is recorded for the name.
    Gisgraphy responses with 80 or more hits are treated as too noisy and
    ignored. Progress messages are printed along the way.

    Args:
        extracted_places: Place names to check.
        dbpedia_timeout: Seconds to wait for each DBpedia RDF download.
            Downloads that time out or fail are skipped.

    Returns:
        A ``(geonames_search, gisgraphy_search, dbpedia_search)`` tuple of
        dicts keyed by place name. The first two hold the raw JSON response
        of the gazetteer that matched; the third holds a list of
        ``[resource_uri, rdf_json]`` pairs.

    Errors raised by the lookup and gazetteer requests themselves (network
    failures, unexpected payloads) are not caught.
    """
    from urllib.parse import quote

    relevance_threshold = 0.65
    # Gisgraphy is not that good with exact match: it might find lots of
    # places that contain the query string, and those places would add noise
    # for the region localization module.
    gisgraphy_max_results = 80

    geonames_search = dict()
    gisgraphy_search = dict()
    dbpedia_search = dict()
    for place in extracted_places:
        print(place)
        # Percent-encode the name so characters such as '&' or '#' cannot
        # alter the query string of the service URLs.
        query = quote(place, safe="")

        # Try DBpedia first: maybe there is a Wikipedia page for the place.
        dbpedia_lookup = requests.get(dbpedia["resource_name_match"] + query).json()
        for result in dbpedia_lookup.get("docs", []):
            candidate_names = [k["value"] for k in result.get("redirectlabel") or []]
            candidate_names.append(result["label"][0]["value"])
            relevance = max(
                str_similarity_percent(unidecode.unidecode(name), place)
                for name in candidate_names
            )
            print("Result relevance is ", relevance)
            if relevance < relevance_threshold:
                continue
            print("Wikipedia page found")
            resource_uri = result["resource"][0]["value"]
            # Rewrite only the ".../resource/..." path segment; a plain
            # replace would also alter page names containing "resource".
            rdf_json_url = resource_uri.replace("/resource/", "/data/", 1) + ".json"
            print(rdf_json_url)
            try:
                resource_rdf = requests.get(rdf_json_url, timeout=dbpedia_timeout).json()
            except requests.Timeout:
                print("dbpedia timeout")
                continue
            except (requests.RequestException, ValueError) as error:
                # Connection problems or a body that is not valid JSON.
                print("dbpedia request failed:", error)
                continue
            dbpedia_search.setdefault(place, []).append([resource_uri, resource_rdf])

        # GeoNames exact place match search
        geoname_exact_lookup = requests.get(geonames["exact_match"] + query).json()
        if geoname_exact_lookup["totalResultsCount"] >= 1:
            geonames_search[place] = geoname_exact_lookup
            print("Geonames exact search found match")
            continue

        # Gisgraphy exact place match search
        gisgraphy_exact_lookup = requests.get(gisgraphy["exact_match"] + query).json()
        results_num = gisgraphy_exact_lookup["response"]["numFound"]
        if 1 <= results_num < gisgraphy_max_results:
            gisgraphy_search[place] = gisgraphy_exact_lookup
            print("Gisgraphy exact search found match")
            continue

        print("exact place name search failed")

        # GeoNames partial place match search
        geoname_general_lookup = requests.get(geonames["general_match"] + query).json()
        if geoname_general_lookup["totalResultsCount"] >= 1:
            geonames_search[place] = geoname_general_lookup
            print("Geonames partial place match found")
            continue

        # Gisgraphy partial place match search
        gisgraphy_general_lookup = requests.get(gisgraphy["general_match"] + query).json()
        results_num = gisgraphy_general_lookup["response"]["numFound"]
        if 1 <= results_num < gisgraphy_max_results:
            gisgraphy_search[place] = gisgraphy_general_lookup
            print("Gisgraphy partial place match found")
    return geonames_search, gisgraphy_search, dbpedia_search

In [12]:
def unify_lookup_results(lookup_response):
    """
    Convert raw gazetteer responses into a flat list of ``Place`` objects.

    Args:
        lookup_response: The ``(geonames_results, gisgraphy_results,
            dbpedia_rdf)`` tuple produced by ``existence_check``; each element
            is a dict keyed by the looked-up place name.

    Returns:
        A list of ``Place`` objects, each named after the looked-up key.
        GeoNames and Gisgraphy records are kept only when the record name has
        a similarity of at least 0.75 to the key. Records missing a required
        field, or otherwise failing to convert, are skipped.
    """
    min_name_similarity = 0.75
    geonames_results, gisgraphy_results, dbpedia_rdf = lookup_response
    unified_lookup_results = []

    for key, value in geonames_results.items():
        for res in value['geonames']:
            # Best effort: skip results that don't have the required
            # information. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt and SystemExit working.
            try:
                if str_similarity_percent(key, res["toponymName"]) < min_name_similarity:
                    continue
                place = Place(
                    key,  # the looked-up name, not res["toponymName"]
                    res["countryCode"],
                    res["adminName1"],
                    res["fcodeName"],
                    res["lat"],
                    res["lng"],
                )
            except Exception:
                continue
            unified_lookup_results.append(place)

    for key, value in gisgraphy_results.items():
        for res in value["response"]["docs"]:
            try:
                if str_similarity_percent(key, res['name']) < min_name_similarity:
                    continue
                lat = res["lat"]
                long = res["lng"]
                # The state is derived from the coordinates by reverse geocoding.
                state = get_state_by_lat_lng(lat, long)["admin1"]
                place = Place(key, res["country_code"], state, res["placetype"], lat, long)
            except Exception:
                continue
            unified_lookup_results.append(place)

    for key, value in dbpedia_rdf.items():
        for resource_uri, resource_rdf in value:
            try:
                # The RDF document is keyed by the resource URI.
                rdf = resource_rdf[resource_uri]
                long = rdf["http://www.w3.org/2003/01/geo/wgs84_pos#long"][0]["value"]
                lat = rdf["http://www.w3.org/2003/01/geo/wgs84_pos#lat"][0]["value"]
                # Country and state both come from reverse geocoding.
                country_admin1 = get_state_by_lat_lng(lat, long)
                place = Place(
                    key,
                    country_admin1["cc"],
                    country_admin1["admin1"],
                    "Attraction",
                    lat,
                    long,
                )
            except Exception:
                continue
            unified_lookup_results.append(place)

    return unified_lookup_results

### Test page

In [13]:
page_url = data["1"]["article_url"]

In [14]:
data["1"]["article_url"]

'https://www.nationalgeographic.com/travel/article/route-66'

In [15]:
%time text, links = text_from_html(page_url)

CPU times: user 85.1 ms, sys: 0 ns, total: 85.1 ms
Wall time: 1.07 s


In [16]:
%time places_and_poi = get_ne(text)

CPU times: user 12.5 s, sys: 361 ms, total: 12.8 s
Wall time: 2.36 s


In [17]:
# Post-process the extractor output: drop duplicates, publisher mentions and
# names that are merely the tail of a longer extracted name.
places_and_poi_no_duplicate = set(places_and_poi)  # discard duplicate names
places_and_poi_processed = []
for name in places_and_poi_no_duplicate:
    if name.startswith("National Geographic"):
        continue
    # case: route 66 museum -> oklahoma route 66 museum
    # (only names of more than two words are checked against longer names)
    is_tail_of_longer_name = len(name.split()) > 2 and any(
        len(other) > len(name) and other.endswith(name)
        for other in places_and_poi_no_duplicate
    )
    if is_tail_of_longer_name:
        continue
    places_and_poi_processed.append(name)

In [18]:
len(places_and_poi_processed)

91

In [48]:
%time lookup_results = existence_check(places_and_poi_processed)

Kingman
Result relevance is  0.778
Wikipedia page found
http://dbpedia.org/data/Kingman,_Arizona.json
Result relevance is  0.667
Wikipedia page found
http://dbpedia.org/data/Kingman_County,_Kansas.json
Result relevance is  0.359
Geonames exact search found match
Woody Guthrie Center
Result relevance is  1.0
Wikipedia page found
http://dbpedia.org/data/Woody_Guthrie_Center.json
Result relevance is  0.356
Result relevance is  0.324
Gisgraphy exact search found match
Oklahoma City
Result relevance is  1.0
Wikipedia page found
http://dbpedia.org/data/Oklahoma_City.json
Result relevance is  0.609
Result relevance is  0.609
Geonames exact search found match
Munger Moss Motel
Result relevance is  0.941
Wikipedia page found
http://dbpedia.org/data/Munger-Moss_Motel.json
Result relevance is  0.276
Result relevance is  0.258
exact place name search failed
Geonames partial place match found
Meramec Caverns
Result relevance is  1.0
Wikipedia page found
http://dbpedia.org/data/Meramec_Caverns.json


dbpedia timeout
Result relevance is  0.588
Result relevance is  0.556
Geonames exact search found match
Rialto
Result relevance is  0.75
Wikipedia page found
http://dbpedia.org/data/Rialto,_California.json
Result relevance is  0.6
Result relevance is  0.667
Wikipedia page found
http://dbpedia.org/data/Rialto_Bridge.json
Geonames exact search found match
Elbow Inn Bar
Result relevance is  0.24
Result relevance is  0.4
Result relevance is  0.286
Gisgraphy exact search found match
St. Louis
Result relevance is  1.0
Wikipedia page found
http://dbpedia.org/data/St._Louis.json
dbpedia timeout
Result relevance is  0.621
Result relevance is  0.692
Wikipedia page found
http://dbpedia.org/data/St._Louis_County,_Missouri.json
dbpedia timeout
Geonames exact search found match
Costa Rica
Result relevance is  1.0
Wikipedia page found
http://dbpedia.org/data/Costa_Rica.json
dbpedia timeout
Result relevance is  0.667
Wikipedia page found
http://dbpedia.org/data/San_José,_Costa_Rica.json
dbpedia timeou

Result relevance is  0.733
Wikipedia page found
http://dbpedia.org/data/Delmar_Loop_station.json
Result relevance is  0.386
Result relevance is  0.733
Wikipedia page found
http://dbpedia.org/data/Loop_Trolley.json
Gisgraphy exact search found match
Oklahoma Route 66 Museum
Result relevance is  0.293
Result relevance is  0.256
Result relevance is  0.267
Gisgraphy exact search found match
Shirley
Result relevance is  0.778
Wikipedia page found
http://dbpedia.org/data/Shirley,_Massachusetts.json
Result relevance is  0.636
Result relevance is  0.609
Geonames exact search found match
Arizona
Result relevance is  1.0
Wikipedia page found
http://dbpedia.org/data/Arizona.json
dbpedia timeout
Result relevance is  0.667
Wikipedia page found
http://dbpedia.org/data/Phoenix,_Arizona.json
dbpedia timeout
Result relevance is  0.667
Wikipedia page found
http://dbpedia.org/data/Tucson,_Arizona.json
dbpedia timeout
Geonames exact search found match
Oklahoma
Result relevance is  1.0
Wikipedia page found

In [35]:
%time places_geocoded = unify_lookup_results(lookup_results)

Loading formatted geocoded file...
CPU times: user 1.23 s, sys: 18.1 s, total: 19.3 s
Wall time: 26.7 s


In [36]:
len(places_geocoded)

725

In [None]:
len(places_geocoded)

In [None]:
lookup_results

In [None]:
places_geocoded

## Region localization and outliers/noise removal

In [None]:
places_geocoded[:2]

In [None]:
# by country

In [37]:
places = places_geocoded

In [38]:
# Count how many geocoded candidates fall into each country.
countries = dict()
for place in places:
    countries[place.country] = countries.get(place.country, 0) + 1

In [39]:
# The country with the most candidates is taken as the country of the trip.
travel_country = max(countries, key=countries.get)

In [None]:
travel_country

In [40]:
# Keep only the candidates located in the travel country.
travel_country_places = [place for place in places if place.country == travel_country]

In [41]:
len(travel_country_places)

345

In [None]:
travel_country_places

In [42]:
# Group the candidates by the place name they were looked up under.
place_name_geocoding = dict()
for place in travel_country_places:
    place_name_geocoding.setdefault(place.place_name, []).append(place)

In [43]:
# States named in the article: candidates typed as first-order administrative divisions.
states = []
for candidate in travel_country_places:
    if candidate.place_type == "first-order administrative division":
        states.append(candidate.state)

In [None]:
states

In [44]:
# Narrow down, in place, the candidate list of every place name that has more
# than one candidate. Names with a single candidate are left untouched.
for i in place_name_geocoding:
    possible_geolocation_num = len(place_name_geocoding[i]) 
    if possible_geolocation_num == 1:
        continue
    filtered_geolocations = []
    # Stage 1: state names are treated as first-order administrative divisions.
    # If any candidate has that type, keep only the first such candidate.
    for j in place_name_geocoding[i]:

        if j.place_type == "first-order administrative division":
            filtered_geolocations.append(j)
            break
    if filtered_geolocations:
        place_name_geocoding[i] = filtered_geolocations
        continue
        
    filtered_geolocations = []    
    # Stage 2: remove duplicate geolocations that come from different gazetteers
    # and differ slightly in coordinates. Candidates whose state is not in
    # `states` are dropped, so the resulting list may be empty.
    # NOTE(review): the inner loop also compares j with itself, so for finite
    # coordinates `duplicate` is always True and the condition below reduces to
    # `j not in filtered_geolocations`. `in` relies on Place.__eq__ (name,
    # country, state), so effectively the first candidate per state is kept
    # regardless of distance - confirm this is the intended behaviour.
    for j in place_name_geocoding[i]:
        if j.state not in states:
            continue
        duplicate = False
        for q in place_name_geocoding[i]:
            if j.state == q.state and abs(j.lat - q.lat) < 1.0 and abs(j.long - q.long) < 1.0:
                duplicate = True
        if not duplicate or j not in filtered_geolocations:
            filtered_geolocations.append(j)
    place_name_geocoding[i] = filtered_geolocations
    # Stage 3: remove geolocations located on state borders that point to the
    # same place but that one gazetteer puts on the opposite side of the border.
    # A candidate is dropped when it compares unequal to an already kept one
    # while lying within 1.0 of it in both latitude and longitude
    # (presumably degrees - TODO confirm).
    filtered_geolocations = []    
    for j in place_name_geocoding[i]:
        duplicate = False
        for q in filtered_geolocations:
            if j != q and abs(j.lat - q.lat) < 1.0 and abs(j.long - q.long) < 1.0:
                duplicate = True
        if not duplicate:
            filtered_geolocations.append(j)
    place_name_geocoding[i] = filtered_geolocations

In [45]:
# [place name, number of remaining candidates], most ambiguous names first.
place_record_num = [
    [name, len(candidates)] for name, candidates in place_name_geocoding.items()
]
place_record_num.sort(key=lambda record: record[1], reverse=True)

In [46]:
# Show the names that still have more than one candidate location.
for name, record_count in place_record_num:
    if record_count <= 1:
        continue
    print(name, "\n", place_name_geocoding[name])

Lebanon 
 [Place name: Lebanon; State: Illinois, Country: US, Place Type: populated place, Lat: 38.60394, Long: -89.80732, Place name: Lebanon; State: Missouri, Country: US, Place Type: seat of a second-order administrative division, Lat: 37.6806, Long: -92.66379, Place name: Lebanon; State: Oklahoma, Country: US, Place Type: populated place, Lat: 33.98232, Long: -96.90778, Place name: Lebanon; State: Arizona, Country: US, Place Type: populated place, Lat: 32.74535, Long: -109.71619]
Springfield 
 [Place name: Springfield; State: Illinois, Country: US, Place Type: seat of a first-order administrative division, Lat: 39.80172, Long: -89.64371, Place name: Springfield; State: Missouri, Country: US, Place Type: seat of a second-order administrative division, Lat: 37.21533, Long: -93.29824, Place name: Springfield; State: Michigan, Country: US, Place Type: populated place, Lat: 42.32643, Long: -85.23916, Place name: Springfield; State: California, Country: US, Place Type: populated place, L

In [None]:
place_name_geocoding["Chain of Rocks Bridge"]

In [47]:
place_record_num 

[['Lebanon', 4],
 ['Springfield', 4],
 ['Lincoln', 4],
 ['Shirley', 4],
 ['Pacific', 3],
 ['Clinton', 3],
 ['Kingman', 2],
 ['Needles', 2],
 ['Stroud', 2],
 ['Chicago', 2],
 ['St. Louis', 2],
 ['Blueberry Hill', 2],
 ['Route 66 Museum', 2],
 ['Palisades Park', 2],
 ['Seligman', 2],
 ['Palo Duro Canyon', 2],
 ['Oklahoma City', 1],
 ['El Morro National Monument', 1],
 ['San Bernardino', 1],
 ['America', 1],
 ['Tucumcari', 1],
 ['Mississippi', 1],
 ['Michigan', 1],
 ['Illinois', 1],
 ['Palo Duro Canyon State Park', 1],
 ['Ozarks', 1],
 ['Chain of Rocks Bridge', 1],
 ['New Mexico', 1],
 ['Colorado River', 1],
 ['Texas', 1],
 ['Tulsa', 1],
 ['Rialto', 1],
 ['Funks Grove', 1],
 ['Los Angeles', 1],
 ['Santa Monica', 1],
 ['Acoma', 1],
 ['Amarillo', 1],
 ['Missouri', 1],
 ['Big Piney River', 1],
 ['Arizona', 1],
 ['Oklahoma', 1],
 ['California', 1],
 ['Grants', 1],
 ['Route 66', 1],
 ['Hackberry General Store', 1],
 ['Oatman', 1],
 ['Will Rogers State Historic Park', 1],
 ['Loop', 1],
 ['Meteo

In [None]:
# Entity labels of interest, defined here so the cell runs on a fresh kernel
# (it used to rely on `ne` and `ne_types` from a later cell).
ne_types = ["GPE", "FAC", "LOC", "ORG"]
NE = nlp(text).ents
Target_ne = [str(ent).strip() for ent in NE if ent.label_ in ne_types]

In [None]:
ent = NE[0]

In [None]:
ent.start_char-ent.sent.start_char, ent.end_char-ent.sent.start_char

In [None]:
dir(NE[0])

In [None]:
text[17:25]

In [None]:
places_and_poi.index(ent.text)

In [None]:
# Same normalisation as get_ne, but each cleaned name is kept together with
# its spaCy entity so its position in the text stays available.
ne_types = ["GPE", "FAC", "LOC", "ORG"]
ne = nlp(text).ents
target_ne = []
for ent in ne:
    if ent.label_ not in ne_types:
        continue
    name = str(ent).strip()
    # Strip the article only when it is a separate word; a bare
    # startswith("the") check would also mangle names such as "theatre district".
    if name.startswith("the "):
        name = name[4:]
    if name.endswith(','):
        name = name[:-1]
    name = name.strip()
    if name.endswith('\'s'):
        name = name[:-2]
    name = name.strip()
    # Build the pair once the name is final: tuples are immutable, so
    # assigning to target_ne[i][0] afterwards raises TypeError.
    target_ne.append((name, ent))
