## Article web page processing

In [1]:
import html2text
import json
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
def _extract_external_links(soup) -> list:
    """Collect absolute hyperlinks that do not point back to National Geographic."""
    links = []
    # href=True skips anchors without an href attribute, for which
    # link.get('href') would be None and the substring test would raise.
    for anchor in soup.find_all('a', href=True):
        http_link = anchor['href']
        if "nationalgeographic" in http_link:
            continue
        if not http_link.startswith("http"):
            continue
        if len(http_link) > 5:
            links.append(http_link)
    return links


def _clean_article_text(text: str) -> str:
    """Cut the article body out of the html2text output and flatten its markup."""
    marker = "ShareTweetEmail"
    # The body is whatever sits between the first two occurrences of the marker.
    # str.find returns -1 when the marker is missing; slicing with that value
    # would silently drop characters, so only slice when it was found.
    start = text.find(marker)
    if start != -1:
        text = text[start + len(marker) + 1:]
    end = text.find(marker)
    if end != -1:
        text = text[:end]

    # Paragraph breaks become sentence boundaries, remaining markup becomes spaces.
    text = text.replace("\n\n", ". ")
    for token in ["\n", "[", "]", "*", "#", "_", "\\'", '\\']:
        text = text.replace(token, " ")
    text = text.replace('>', ", ")
    # Collapse runs of spaces of any length (a fixed number of passes does not
    # guarantee that).
    while "  " in text:
        text = text.replace("  ", " ")
    text = text.replace("..", ".")
    return text.strip()


def text_from_html(url: str, timeout: float = 30.0):
    """
    Given url of the webpage returns the readable text from the page.

    Parameters
    ----------
    url : str
        Address of the article page to download.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout error.

    Returns
    -------
    tuple
        ``(text, links)``: the cleaned article text and the list of absolute
        links found on the page that do not contain "nationalgeographic".
    """
    road_trip = requests.get(url, timeout=timeout).text
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(road_trip, "html.parser")
    links = _extract_external_links(soup)

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    road_trip_processed = text_maker.handle(road_trip)

    return _clean_article_text(road_trip_processed), links

def get_text_and_places(articles_path: str = '/home/geoner/data/articles.json') -> dict:
    """
    Load the article index and attach the cleaned page text to every entry.

    Parameters
    ----------
    articles_path : str, optional
        Path of the JSON file to read. Each top-level value must be a mapping
        with an "article_url" key. Defaults to the location used originally.

    Returns
    -------
    dict
        The loaded JSON content, with a "text" key added to every entry.
    """
    with open(articles_path, encoding="utf-8") as json_file:
        data = json.load(json_file)
    for article in data.values():
        # The links returned alongside the text are not needed here.
        text, _ = text_from_html(article["article_url"])
        article["text"] = text
    return data

In [3]:
# Load the article index and download + clean the text of every article in it.
%time data = get_text_and_places()

CPU times: user 1.09 s, sys: 54.8 ms, total: 1.15 s
Wall time: 12.9 s


## Extractor

In [4]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_trf')


def _normalize_entity_name(name: str) -> str:
    """Strip a leading article, a trailing comma and a possessive suffix."""
    name = name.strip()
    # Match "the " including the space: a bare "the" prefix would also hit
    # names such as "theater district" and cut four characters off them.
    if name.startswith("the "):
        name = name[4:]
    if name.endswith(','):
        name = name[:-1]
    name = name.strip()
    if name.endswith('\'s'):
        name = name[:-2]
    return name.strip()


def get_ne(txt: str) -> tuple:
    """
    Extract place-like named entities from the text.

    Only entities labelled GPE, FAC, LOC or ORG are kept; names starting with
    "National Geographic" and names that are empty after normalisation are
    dropped.

    Returns
    -------
    tuple
        ``(places_and_poi_processed, places_and_poi, ne_index)`` where

        * ``places_and_poi_processed`` is the list of unique normalised names,
          in order of first appearance;
        * ``places_and_poi`` is the list of ``[name, entity]`` mentions sorted
          by the entity's ``start_char``;
        * ``ne_index`` maps each name to the positions of its mentions inside
          ``places_and_poi``.
    """
    ne_types = {"GPE", "FAC", "LOC", "ORG"}
    target_ne = [
        [_normalize_entity_name(str(entity)), entity]
        for entity in nlp(txt).ents
        if entity.label_ in ne_types
    ]

    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (a set would reorder the names from run to run).
    unique_names = dict.fromkeys(name for name, _ in target_ne)
    places_and_poi_processed = [
        name for name in unique_names
        if name and not name.startswith("National Geographic")
    ]

    kept_names = set(places_and_poi_processed)
    places_and_poi = [mention for mention in target_ne if mention[0] in kept_names]
    places_and_poi.sort(key=lambda mention: mention[1].start_char)

    ne_index = {}
    for position, mention in enumerate(places_and_poi):
        ne_index.setdefault(mention[0], []).append(position)
    return places_and_poi_processed, places_and_poi, ne_index

## Gazetteer

In [6]:
import difflib
def str_similarity_percent(a: str, b: str) -> float:
    """
    Similarity of two strings as a difflib ratio rounded to three decimals.

    Despite the name, the value is a ratio between 0.0 and 1.0, not a
    percentage. Space characters are passed to SequenceMatcher as junk.
    """
    def is_space(character):
        return character == " "

    matcher = difflib.SequenceMatcher(is_space, a, b)
    similarity = matcher.ratio()
    return round(similarity, 3)

In [7]:
# Endpoint URL prefixes for the three gazetteers queried in this notebook.
# Each value ends with the name of the query parameter, so the place name is
# appended directly to it.
#
# 20'000 credits daily limit per application (identified by the parameter 'username'),
# the hourly limit is 1000 credits. A credit is a web service request hit for most services.
# An exception is thrown when the limit is exceeded. 
# NOTE(review): the GeoNames account name is embedded in both URLs below, and the
# scheme is plain http although the host is secure.geonames.org - confirm both are intended.
geonames = {
    "exact_match":"http://secure.geonames.org/searchJSON?maxRows=40&username=kwh44&name_equals=",
    "general_match": "http://secure.geonames.org/searchJSON?maxRows=30&username=kwh44&name="
}
# https://www.gisgraphy.com/documentation/user-guide.php#fulltextservice
gisgraphy = {
    "exact_match": "https://services.gisgraphy.com/fulltext/fulltextsearch?&format=json&allwordsrequired=true&q=",
    "general_match": "https://services.gisgraphy.com/fulltext/fulltextsearch?&format=json&q="
}
resource_name = ""
dbpedia = {
    "resource_name_match": "https://lookup.dbpedia.org/api/search?&typeName=place&format=JSON_FULL&maxResults=3&query=",
    # NOTE(review): this is an f-string, so {resource_name} is substituted once, right
    # here, with the empty string assigned above; the stored value is therefore the
    # fixed text "https://dbpedia.org/data/.json" and not a reusable template.
    "resource_rdf_get": f"https://dbpedia.org/data/{resource_name}.json"
}

## Existence Detection

In [8]:
import reverse_geocoder as rg
import unidecode

In [9]:
def get_state_by_lat_lng(lat, lng):
    """
    Reverse-geocode a coordinate pair and return the first match.

    Despite the name, the whole first record returned by ``rg.search`` is
    handed back, not only the state; callers in this notebook read its
    "admin1" and "cc" entries.
    """
    coordinates = (lat, lng)
    matches = rg.search(coordinates)
    return matches[0]

In [10]:
class Place:
    """
    One candidate geolocation for a place name found in the text.

    Two places compare equal when their name, country and state match;
    coordinates, type, link and population are not part of the identity.
    """

    def __init__(self, place_name, country, state, place_type, lat, long, link=None, population=0):
        self.place_name = place_name
        self.country = country
        self.state = state
        self.place_type = place_type
        # Gazetteers deliver coordinates as strings or numbers; store floats.
        self.lat = float(lat)
        self.long = float(long)
        self.link = link
        self.population = population

    def _identity(self):
        """Fields that define equality and hashing."""
        return (self.place_name, self.country, self.state)

    def __repr__(self):
        return (
            f"Place name: {self.place_name}; State: {self.state}, "
            f"Country: {self.country}, Place Type: {self.place_type}, "
            f"Lat: {self.lat}, Long: {self.long}"
        )

    def __eq__(self, other):
        # Comparing with an unrelated object must yield False, not raise
        # AttributeError.
        if not isinstance(other, Place):
            return NotImplemented
        return self._identity() == other._identity()

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; keep instances usable
        # in sets and as dict keys, consistently with __eq__.
        return hash(self._identity())

In [11]:
# can be made faster by parallelizing
def existence_check(extracted_places: list) -> tuple:
    """
    Look every extracted place name up in DBpedia, GeoNames and Gisgraphy.

    For each name the DBpedia lookup is always attempted. Then the gazetteers
    are tried in this order, stopping at the first one that yields a usable
    answer: GeoNames exact, Gisgraphy exact, GeoNames partial, Gisgraphy
    partial. Gisgraphy answers with 80 or more hits are ignored as too noisy.

    Progress is reported with print(). Can be made faster by parallelizing.

    Returns
    -------
    tuple
        ``(geonames_search, gisgraphy_search, dbpedia_search)``; the first two
        map a place name to the raw JSON answer, the third maps a place name
        to a list of ``[resource_url, resource_rdf_json]`` pairs.
    """
    from urllib.parse import quote

    geonames_search = dict()
    gisgraphy_search = dict()
    dbpedia_search = dict()
    for i in extracted_places:
        print(i)
        # Percent-encode the name: characters such as '&', '#' or '+' would
        # otherwise be read as part of the query-string syntax.
        query = quote(i, safe="")

        # try dbpedia, maybe there is a wikipedia page for the place extracted
        dbpedia_lookup = requests.get(dbpedia["resource_name_match"] + query).json()
        for result in dbpedia_lookup.get("docs", []):
            result_name = result["label"][0]["value"]
            redirect_result_name = []
            if result.get("redirectlabel"):
                redirect_result_name = [k["value"] for k in result["redirectlabel"]]
            relevance = max(
                str_similarity_percent(unidecode.unidecode(r), i)
                for r in redirect_result_name + [result_name]
            )
            print("Result relevance is ", relevance)
            if relevance >= 0.65:
                print("Wikipedia page found")
                # fetch the RDF description of the resource as JSON
                resource_url = result["resource"][0]["value"]
                rdf_json_url = resource_url.replace("resource", "data") + ".json"
                print(rdf_json_url)
                try:
                    resource_rdf = requests.get(rdf_json_url, timeout=0.2).json()
                except requests.exceptions.Timeout:
                    print("dbpedia timeout")
                    continue
                except (requests.exceptions.RequestException, ValueError):
                    # connection problem or a body that is not valid JSON
                    print("dbpedia request failed")
                    continue
                dbpedia_search.setdefault(i, []).append([resource_url, resource_rdf])

        # geonames exact place match search
        geoname_exact_lookup = requests.get(geonames["exact_match"] + query).json()
        results_num = geoname_exact_lookup["totalResultsCount"]
        if results_num >= 1:
            geonames_search[i] = geoname_exact_lookup
            print("Geonames exact search found match")
            continue

        # gisgraphy exact place match search
        gisgraphy_exact_lookup = requests.get(gisgraphy["exact_match"] + query).json()
        results_num = gisgraphy_exact_lookup["response"]["numFound"]
        # gisgraphy is not that good with exact match, it might find lots of places
        # that contain the query string, but those places would add more noise
        # for the region localization module
        if 1 <= results_num < 80:
            gisgraphy_search[i] = gisgraphy_exact_lookup
            print("Gisgraphy exact search found match")
            continue

        # exact place name search failed
        print("exact place name search failed")

        # geonames partial place match search
        geoname_general_lookup = requests.get(geonames["general_match"] + query).json()
        results_num = geoname_general_lookup["totalResultsCount"]
        if results_num >= 1:
            geonames_search[i] = geoname_general_lookup
            print("Geonames partial place match found")
            continue

        # gisgraphy partial place match search
        gisgraphy_general_lookup = requests.get(gisgraphy["general_match"] + query).json()
        results_num = gisgraphy_general_lookup["response"]["numFound"]
        if 1 <= results_num < 80:
            gisgraphy_search[i] = gisgraphy_general_lookup
            print("Gisgraphy partial place match found")
            continue
    return geonames_search, gisgraphy_search, dbpedia_search

In [12]:
def unify_lookup_results(lookup_response):
    """
    Convert the raw gazetteer answers into a flat list of Place objects.

    Parameters
    ----------
    lookup_response : tuple
        ``(geonames_results, gisgraphy_results, dbpedia_rdf)`` as returned by
        ``existence_check``.

    Returns
    -------
    list
        One Place per usable result. The place name is always the looked-up
        key, not the name reported by the gazetteer. GeoNames and Gisgraphy
        results whose name similarity to the key is below 0.75 are skipped,
        as are results lacking any required field.
    """
    geonames_results, gisgraphy_results, dbpedia_rdf = lookup_response
    unified_lookup_results = []

    for key, value in geonames_results.items():
        for res in value['geonames']:
            if str_similarity_percent(key, res["toponymName"]) < 0.75:
                continue
            # skip results that don't have required information
            try:
                lat = res["lat"]
                long = res["lng"]
                country = res["countryCode"]
                place_type = res["fcodeName"]
                population = res.get("population", 0)
                state = res["adminName1"]
                place = Place(key, country, state, place_type, lat, long, population=population)
            except (KeyError, TypeError, ValueError):
                continue
            unified_lookup_results.append(place)

    for key, value in gisgraphy_results.items():
        for res in value["response"]["docs"]:
            if str_similarity_percent(key, res['name']) < 0.75:
                continue
            try:
                lat = res["lat"]
                long = res["lng"]
                country = res["country_code"]
                place_type = res["placetype"]
                population = res.get("population", 0)
                # Gisgraphy results carry no state name here; derive it from
                # the coordinates.
                state = get_state_by_lat_lng(lat, long)["admin1"]
                place = Place(key, country, state, place_type, lat, long, population=population)
            except Exception:
                # best effort: any failure for a single result just skips it,
                # but KeyboardInterrupt/SystemExit are no longer swallowed
                continue
            unified_lookup_results.append(place)

    for key, value in dbpedia_rdf.items():
        for resource_url, resource_rdf in value:
            try:
                rdf = resource_rdf[resource_url]
                long = rdf["http://www.w3.org/2003/01/geo/wgs84_pos#long"][0]["value"]
                lat = rdf["http://www.w3.org/2003/01/geo/wgs84_pos#lat"][0]["value"]
                country_admin1 = get_state_by_lat_lng(lat, long)
                state = country_admin1["admin1"]
                country = country_admin1["cc"]
                # every DBpedia hit is given the fixed type "Attraction"
                place = Place(key, country, state, "Attraction", lat, long)
            except Exception:
                # resources without coordinates (or otherwise unusable) are skipped
                continue
            unified_lookup_results.append(place)

    return unified_lookup_results

## Region localization and noise removal

In [13]:
def leave_one_country(geo_points):
    """
    Keep only the geolocations of the most frequent country.

    Parameters
    ----------
    geo_points : list
        Place-like objects with ``country``, ``place_name``, ``state`` and
        ``place_type`` attributes.

    Returns
    -------
    tuple
        ``(place_name_geocoding, states)``: a dict mapping each place name to
        the list of its candidates inside the chosen country, and the list of
        ``state`` values of the candidates typed
        "first-order administrative division". For an empty input both are
        empty.
    """
    if not geo_points:
        # max() below would raise ValueError on an empty sequence
        return {}, []

    countries = dict()
    for place in geo_points:
        countries[place.country] = countries.get(place.country, 0) + 1
    # On a tie the country seen first wins (dicts keep insertion order).
    travel_country = max(countries, key=countries.get)

    travel_country_places = [place for place in geo_points if place.country == travel_country]
    place_name_geocoding = dict()
    for place in travel_country_places:
        place_name_geocoding.setdefault(place.place_name, []).append(place)

    states = [
        place.state for place in travel_country_places
        if place.place_type == "first-order administrative division"
    ]
    return place_name_geocoding, states

In [14]:
def remove_duplicates_and_leave_states(place_name_geocoding, states):
    """
    Thin out the candidate list of every place name that has more than one.

    The dict is modified in place and also returned. Names with zero or one
    candidate are left untouched. For the others, in order:

    1. if any candidate is a "first-order administrative division", the first
       such candidate becomes the only one and the name is done;
    2. otherwise candidates whose state is not in ``states`` are dropped and
       the rest is de-duplicated;
    3. finally, candidates lying within one degree of latitude and longitude
       of an already kept, unequal candidate are dropped.

    NOTE(review): when ``states`` is empty, step 2 leaves no candidate at all
    for any name that reaches it.
    """
    for i in place_name_geocoding:
        possible_geolocation_num = len(place_name_geocoding[i]) 
        if possible_geolocation_num <= 1:
            continue
        filtered_geolocations = []
        # state names are treated as first-order administrative division
        for j in place_name_geocoding[i]:
            if j.place_type == "first-order administrative division":
                filtered_geolocations.append(j)
                break
        if filtered_geolocations:
            place_name_geocoding[i] = filtered_geolocations
            continue
        filtered_geolocations = []    
        # remove duplicate geolocations that come from different
        # gazetteers and differ slightly in coordinates
        for j in place_name_geocoding[i]:
            if j.state not in states:
                continue
            duplicate = False
            # NOTE(review): q runs over the same list as j, so j is also compared
            # with itself; with ordinary numeric coordinates that comparison alone
            # sets duplicate to True, and the append below then depends only on
            # `j not in filtered_geolocations` (element equality - presumably
            # Place.__eq__, i.e. name/country/state; confirm this is intended).
            for q in place_name_geocoding[i]:
                if j.state == q.state and abs(j.lat - q.lat) < 1.0 and abs(j.long - q.long) < 1.0:
                    duplicate = True
            if not duplicate or j not in filtered_geolocations:
                filtered_geolocations.append(j)
        place_name_geocoding[i] = filtered_geolocations
        # remove geolocations that are located on state borders and point to the
        # same place, but which one gazetteer puts on the opposite side of the border
        filtered_geolocations = []    
        for j in place_name_geocoding[i]:
            duplicate = False
            # only candidates kept so far are compared, so the first one of a
            # close pair survives
            for q in filtered_geolocations:
                if j != q and abs(j.lat - q.lat) < 1.0 and abs(j.long - q.long) < 1.0:
                    duplicate = True
            if not duplicate:
                filtered_geolocations.append(j)
        place_name_geocoding[i] = filtered_geolocations
    return place_name_geocoding

In [15]:
def _neighbour_state_hints(first_mention, place_name_geocoding, places_and_poi, max_distance):
    """Return [state, distance] for each adjacent mention that has exactly one candidate."""
    hints = []
    mention_start = places_and_poi[first_mention][1].start_char
    # right neighbour first, then left: this order decides ties on distance
    for neighbour_index in (first_mention + 1, first_mention - 1):
        # An explicit range check: index -1 would otherwise wrap around and
        # treat the last mention of the text as the left neighbour of the first.
        if not 0 <= neighbour_index < len(places_and_poi):
            continue
        neighbour_name = places_and_poi[neighbour_index][0]
        neighbour_start = places_and_poi[neighbour_index][1].start_char
        dist = abs(neighbour_start - mention_start)
        if dist > max_distance:
            # neighbour too far away
            continue
        # the neighbour may have been dropped from the geocoding altogether
        neighbour_candidates = place_name_geocoding.get(neighbour_name, [])
        if len(neighbour_candidates) == 1:
            hints.append([neighbour_candidates[0].state, dist])
    return hints


def located_based_on_neighbours(place_name_geocoding, ne_index, places_and_poi):
    """
    Resolve ambiguous places using the state of their textual neighbours.

    For every name with two or more candidates, the mentions immediately
    after and before its first mention are inspected. A neighbour counts when
    it starts at most 800 characters away and has exactly one candidate
    itself. The ambiguous name is then reduced to its first candidate lying in
    the state of the closest such neighbour, or failing that, of the farthest
    one. Names for which no candidate matches are left unchanged.

    Parameters
    ----------
    place_name_geocoding : dict
        Place name -> list of candidates with a ``state`` attribute.
        Modified in place.
    ne_index : dict
        Place name -> positions of its mentions in ``places_and_poi``.
    places_and_poi : list
        ``[name, entity]`` pairs in text order; ``entity.start_char`` is the
        character offset of the mention.

    Returns
    -------
    dict
        The same ``place_name_geocoding`` object.
    """
    max_distance = 800
    for name in place_name_geocoding:
        if len(place_name_geocoding[name]) <= 1:
            continue
        first_mention = ne_index[name][0]
        if not 0 <= first_mention < len(places_and_poi):
            continue
        neighbours_state = _neighbour_state_hints(
            first_mention, place_name_geocoding, places_and_poi, max_distance
        )
        if not neighbours_state:
            continue

        closest = min(neighbours_state, key=lambda hint: hint[1])
        farthest = max(neighbours_state, key=lambda hint: hint[1])
        # try the closest neighbour's state first, then the more distant one
        for filtered_state in (closest[0], farthest[0]):
            match = next(
                (candidate for candidate in place_name_geocoding[name]
                 if candidate.state == filtered_state),
                None,
            )
            if match is not None:
                place_name_geocoding[name] = [match]
                break
    return place_name_geocoding

In [16]:
def locate_based_on_population(place_name_geocoding):
    """
    Reduce every still-ambiguous place to its most populous candidate.

    Names with more than one candidate keep only the candidate with the
    largest ``population`` (the first one on a tie). The dict is modified in
    place and returned.
    """
    for name, candidates in place_name_geocoding.items():
        if len(candidates) <= 1:
            continue
        largest = max(candidates, key=lambda candidate: candidate.population)
        place_name_geocoding[name] = [largest]
    return place_name_geocoding

In [17]:
def remove_geolocations_not_in_states_mentioned(place_name_geocoding, states):
    """
    Empty the candidate list of uniquely located places outside ``states``.

    Only names with exactly one candidate are examined; when that candidate's
    ``state`` is not in ``states`` the list is replaced by an empty one. The
    dict is modified in place and returned.
    """
    for name, candidates in place_name_geocoding.items():
        if len(candidates) != 1:
            continue
        only_candidate = candidates[0]
        if only_candidate.state not in states:
            place_name_geocoding[name] = []
    return place_name_geocoding

In [18]:
def remove_empty_geo_point(place_name_geocoding):
    """
    Delete every place name whose candidate list is empty.

    The dict is modified in place and returned.
    """
    # Collect the keys first: a dict must not change size while iterated.
    empty_names = [
        name for name, candidates in place_name_geocoding.items()
        if len(candidates) == 0
    ]
    for name in empty_names:
        del place_name_geocoding[name]
    return place_name_geocoding

In [19]:
def remove_mere_state_names(place_name_geocoding):
    """
    Empty the candidate list of every name that denotes a state.

    A name counts as a state when at least one of its candidates has the
    place type "first-order administrative division". The dict is modified in
    place and returned.
    """
    state_type = "first-order administrative division"
    for name, candidates in place_name_geocoding.items():
        state_flags = [candidate.place_type == state_type for candidate in candidates]
        if any(state_flags):
            place_name_geocoding[name] = []
    return place_name_geocoding

In [20]:
def disambiguate(places_geocoded, ne_index, places_and_poi):
    """
    Run the full disambiguation pipeline over the geocoded places.

    The steps run in this fixed order: keep the dominant country, thin out
    duplicate candidates, drop unique candidates outside the mentioned states,
    resolve by textual neighbours, resolve by population, resolve by
    neighbours once more, drop names that are states, drop names left without
    a candidate.

    Returns the resulting dict mapping place name to its candidate list.
    """
    geocoding, mentioned_states = leave_one_country(places_geocoded)
    geocoding = remove_duplicates_and_leave_states(geocoding, mentioned_states)
    geocoding = remove_geolocations_not_in_states_mentioned(geocoding, mentioned_states)
    geocoding = located_based_on_neighbours(geocoding, ne_index, places_and_poi)
    geocoding = locate_based_on_population(geocoding)
    geocoding = located_based_on_neighbours(geocoding, ne_index, places_and_poi)
    geocoding = remove_mere_state_names(geocoding)
    return remove_empty_geo_point(geocoding)

## Visualization

In [41]:
import altair as alt
from vega_datasets import data as vg_data
import pandas as pd
import geopandas as gpd

In [64]:
def visualize_points(place_name_geocoding, state_id='CA'):
    """
    Plot the located places as points over the outline of one US state.

    Parameters
    ----------
    place_name_geocoding : dict
        Place name -> list of candidates; the first candidate's ``lat`` and
        ``long`` are plotted. Names with an empty list are skipped.
    state_id : str, optional
        Value of the ``id`` column selecting the state outline to draw from
        the us-states GeoJSON file. Defaults to 'CA'.

    Returns
    -------
    The layered Altair chart (state outline plus one coloured circle per
    place, with the name as tooltip).
    """
    records = [
        {'Name': name, 'Lat': candidates[0].lat, 'Long': candidates[0].long}
        for name, candidates in place_name_geocoding.items()
        if candidates  # an empty list has no first candidate to plot
    ]
    df = pd.DataFrame(records, columns=['Name', 'Lat', "Long"])

    gdf = gpd.read_file('https://raw.githubusercontent.com/python-visualization/folium/master/tests/us-states.json', driver='GeoJSON')
    gdf = gdf[gdf.id == state_id]

    base = alt.Chart(gdf).mark_geoshape(
        stroke='black',
        fill='lightgray'
    ).properties(
        width=700,
        height=1000
    )

    points = alt.Chart(df).mark_circle().encode(
        longitude='Long:Q',
        latitude='Lat:Q',
        # symbolLimit=0 lifts the cap on the number of legend entries
        color=alt.Color('Name:N', legend=alt.Legend(symbolLimit=0)),
        size=alt.value(50),
        tooltip='Name'
    )

    return base + points

### Test page

In [66]:
# Pick the article stored under key "2" of the loaded index as the worked example.
page_url = data["2"]["article_url"]

In [67]:
# Show the URL of the selected article.
data["2"]["article_url"]

'https://www.nationalgeographic.com/travel/article/must-see-stops-destinations-california-route-1'

In [60]:
# Step 1: download the page and reduce it to clean text plus its external links.
%time text, links = text_from_html(page_url)

CPU times: user 139 ms, sys: 4.87 ms, total: 144 ms
Wall time: 1.22 s


In [61]:
# Step 2: extract the place-like named entities and index their mentions.
%time places_and_poi_processed, places_and_poi, ne_index = get_ne(text)

CPU times: user 13.1 s, sys: 312 ms, total: 13.4 s
Wall time: 2.47 s


In [None]:
# Step 3: look every extracted name up in DBpedia, GeoNames and Gisgraphy.
# NOTE(review): this cell has no execution count, yet the next cell reads
# lookup_results - it must be run for a fresh kernel to reproduce the results.
%time lookup_results = existence_check(places_and_poi_processed)

In [32]:
# Step 4: turn the raw gazetteer answers into a flat list of Place objects.
%time places_geocoded = unify_lookup_results(lookup_results)

Loading formatted geocoded file...
CPU times: user 897 ms, sys: 9.82 s, total: 10.7 s
Wall time: 14.8 s


In [33]:
# Step 5: run the disambiguation pipeline on the geocoded candidates.
%time place_name_geocoding = disambiguate(places_geocoded, ne_index, places_and_poi)

CPU times: user 330 µs, sys: 151 µs, total: 481 µs
Wall time: 483 µs


In [34]:
# Sanity check: list the names that still have more than one candidate.
[i for i in place_name_geocoding if len(place_name_geocoding[i]) > 1]

[]

In [65]:
# Step 6: plot the located places on the state outline.
visualize_points(place_name_geocoding)