### Article web page processing

In [1]:
import html2text
import json
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
def text_from_html(url: str) -> tuple:
    """
    Download a web page and return its readable text plus its outbound links.

    Parameters
    ----------
    url : str
        Address of the article page to download.

    Returns
    -------
    tuple
        ``(text, links)`` where ``text`` is the cleaned article text (str) and
        ``links`` is a list of the ``href`` values that start with "http" and
        do not contain "nationalgeographic".

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded (including a timeout).
    """
    # A timeout keeps one unresponsive server from hanging the whole run.
    page_html = requests.get(url, timeout=30).text
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page_html, "html.parser")
    links = _external_links(soup)

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    return _clean_article_text(text_maker.handle(page_html)), links


def _external_links(soup) -> list:
    """Collect hrefs that start with "http" and do not mention nationalgeographic."""
    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # <a> tags without an href attribute yield None; skip them instead of
        # failing on the substring test below.
        if not href:
            continue
        if "nationalgeographic" in href or not href.startswith("http"):
            continue
        if len(href) > 5:
            links.append(href)
    return links


def _clean_article_text(raw_text: str, marker: str = "ShareTweetEmail") -> str:
    """Cut the text between the two `marker` occurrences and strip markup noise."""
    # Trim only when the marker is really there: str.find returns -1 on a miss,
    # and slicing with that value would silently drop real content.
    start = raw_text.find(marker)
    if start != -1:
        raw_text = raw_text[start + len(marker) + 1:]
    end = raw_text.find(marker)
    if end != -1:
        raw_text = raw_text[:end]

    # Paragraph breaks become sentence breaks; leftover markdown symbols become spaces.
    text = raw_text.replace("\n\n", ". ")
    for token in ["\n", "[", "]", "*", "#", "_", "\\'", '\\']:
        text = text.replace(token, " ")
    text = text.replace('>', ", ")
    # Collapse runs of spaces of any length (a fixed number of passes is not enough
    # for long runs).
    while "  " in text:
        text = text.replace("  ", " ")
    text = text.replace("..", ".")
    return text.strip()

def get_text_and_places(articles_path: str = '/home/geoner/data/articles.json') -> dict:
    """
    Load the article index and attach the downloaded text to every entry.

    Parameters
    ----------
    articles_path : str, optional
        Path of the JSON file holding the articles. The file must contain an
        object whose values each have an "article_url" key. Defaults to the
        location the notebook was originally written against.

    Returns
    -------
    dict
        The parsed JSON content, with a "text" key added to every entry.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(articles_path, encoding="utf-8") as json_file:
        data = json.load(json_file)
    for article in data.values():
        # Only the text is stored; the links returned alongside it are not needed here.
        text, _ = text_from_html(article["article_url"])
        article["text"] = text
    return data

In [3]:
# Load the article index and download the text of every article (one HTTP request per entry).
%time data = get_text_and_places()

CPU times: user 916 ms, sys: 38.7 ms, total: 955 ms
Wall time: 15 s


### Extractor

In [5]:
import spacy

In [9]:
# Load the spaCy pipeline once at cell level; get_ne() reads this global on every call.
nlp = spacy.load('en_core_web_trf')
def get_ne(txt: str) -> list:
    """
    Extract place-like named entities from a text.

    Runs the module-level ``nlp`` pipeline over ``txt`` and keeps the entities
    labelled GPE, FAC, LOC or ORG, in order of appearance (duplicates are kept).

    Parameters
    ----------
    txt : str
        Text to analyse.

    Returns
    -------
    list
        Entity names as strings, with a leading "the " and a trailing comma
        removed and surrounding whitespace stripped.
    """
    ne_types = {"GPE", "FAC", "LOC", "ORG"}
    target_ne = []
    for entity in nlp(txt).ents:
        if entity.label_ not in ne_types:
            continue
        name = str(entity).strip()
        # Require the space after the article: a bare startswith("the") would
        # also chop names such as "theater district".
        if name.startswith("the "):
            name = name[len("the "):]
        if name.endswith(','):
            name = name[:-1]
        target_ne.append(name.strip())
    return target_ne

### Gazetteer

In [47]:
import difflib
def str_similarity_percent(a: str, b: str) -> float:
    """
    Score how similar two strings are.

    Uses difflib.SequenceMatcher with the space character treated as junk.
    Despite the name, the result is the matcher's ratio in the range 0.0-1.0
    (not 0-100), rounded to three decimal places.
    """
    def _is_space(char):
        return char == " "

    matcher = difflib.SequenceMatcher(isjunk=_is_space, a=a, b=b)
    similarity = matcher.ratio()
    return round(similarity, 3)

In [13]:
# Endpoint prefixes for the gazetteer services. Unless noted otherwise, the place
# name is appended to the end of the URL.
#
# GeoNames: 20'000 credits daily limit per application (identified by the parameter
# 'username'), the hourly limit is 1000 credits. A credit is a web service request
# hit for most services. An exception is thrown when the limit is exceeded.
# NOTE(review): the username is hard-coded; consider reading it from the environment.
# The secure host is addressed over https so the username is not sent in clear text.
geonames = {
    "exact_match": "https://secure.geonames.org/searchJSON?maxRows=30&username=kwh44&name_equals=",
    "general_match": "https://secure.geonames.org/searchJSON?maxRows=30&username=kwh44&name="
}
# https://www.gisgraphy.com/documentation/user-guide.php#fulltextservice
gisgraphy = {
    "exact_match": "https://services.gisgraphy.com/fulltext/fulltextsearch?&format=json&allwordsrequired=true&q=",
    "general_match": "https://services.gisgraphy.com/fulltext/fulltextsearch?&format=json&q="
}
resource_name = ""
dbpedia = {
    "resource_name_match": "https://lookup.dbpedia.org/api/search?&typeName=place&format=JSON_FULL&maxResults=3&query=",
    # Template, not an f-string: an f-string would be evaluated right here with the
    # empty resource_name and freeze the URL as ".../data/.json".
    # Fill it in with dbpedia["resource_rdf_get"].format(resource_name=...).
    "resource_rdf_get": "https://dbpedia.org/data/{resource_name}.json"
}

### Test page

In [20]:
# Use the article stored under key "1" as the test page.
page_url = data["1"]["article_url"]

In [21]:
# Download the test page; yields its cleaned text and the outbound links found on it.
%time text, links = text_from_html(page_url)

CPU times: user 73.1 ms, sys: 3.65 ms, total: 76.8 ms
Wall time: 530 ms


In [22]:
# Extract the GPE / FAC / LOC / ORG entity names from the test page text.
%time places_and_poi = get_ne(text)

CPU times: user 11.3 s, sys: 6.58 ms, total: 11.3 s
Wall time: 2.05 s


In [23]:
# Accumulates [name, gazetteer response] pairs appended by the lookup loop.
# Re-run this cell before re-running the loop, otherwise results pile up.
lookup_results = []

In [48]:
# Resolve every extracted name against the gazetteers, strictest search first:
#   1. GeoNames exact match      2. Gisgraphy exact match
#   3. GeoNames partial match    4. Gisgraphy partial match
#   5. DBpedia lookup (results are only printed, not stored)
# The first service that answers wins; its raw JSON response is appended to
# lookup_results as [name, response].

# Gisgraphy responses with this many hits or more are treated as noise and ignored.
MAX_GISGRAPHY_HITS = 80
# Seconds to wait for a gazetteer before giving up on the request.
REQUEST_TIMEOUT = 30

for i in places_and_poi:
    print(i)
    # Percent-encode the name so characters such as '&', '#' or '+' cannot
    # corrupt the query string it is appended to.
    query = requests.utils.quote(i, safe="")

    # geonames exact place match search
    geoname_exact_lookup = requests.get(geonames["exact_match"] + query, timeout=REQUEST_TIMEOUT).json()
    # Error payloads (e.g. credit limit exceeded) carry no result count; treat them as "no match".
    results_num = geoname_exact_lookup.get("totalResultsCount", 0)
    if results_num >= 1:
        lookup_results.append([i, geoname_exact_lookup])
        print("Geonames exact search found match")
        continue

    # gisgraphy exact place match search
    gisgraphy_exact_lookup = requests.get(gisgraphy["exact_match"] + query, timeout=REQUEST_TIMEOUT).json()
    results_num = gisgraphy_exact_lookup.get("response", {}).get("numFound", 0)
    # gisgraphy is not that good with exact match: it might find lots of places that
    # contain the query string, and those places would add more noise for the region
    # localization module, so overly large result sets are skipped.
    if 1 <= results_num < MAX_GISGRAPHY_HITS:
        lookup_results.append([i, gisgraphy_exact_lookup])
        print("Gisgraphy exact search found match")
        continue

    print("exact place name search failed")

    # geonames partial place match search
    geoname_general_lookup = requests.get(geonames["general_match"] + query, timeout=REQUEST_TIMEOUT).json()
    results_num = geoname_general_lookup.get("totalResultsCount", 0)
    if results_num >= 1:
        lookup_results.append([i, geoname_general_lookup])
        print("Geonames partial place match found")
        continue

    # gisgraphy partial place match search
    gisgraphy_general_lookup = requests.get(gisgraphy["general_match"] + query, timeout=REQUEST_TIMEOUT).json()
    results_num = gisgraphy_general_lookup.get("response", {}).get("numFound", 0)
    if 1 <= results_num < MAX_GISGRAPHY_HITS:
        lookup_results.append([i, gisgraphy_general_lookup])
        print("Gisgraphy partial place match found")
        continue

    print("Geonames and Gisgraphy don't know the place")

    # try dbpedia, maybe there is a wikipedia page for the place extracted
    print("DBpedia search")
    # dbpedia resource search
    dbpedia_lookup = requests.get(dbpedia["resource_name_match"] + query, timeout=REQUEST_TIMEOUT).json()
    for result in dbpedia_lookup["docs"]:
        result_name = result["label"][0]["value"]
        print("Result name", result_name, " Query is: ", i)
        print("Result relevance is ", str_similarity_percent(result_name, i))
    # NOTE(review): this stops the whole loop after the first name that falls
    # through to DBpedia, so later names are never looked up. It looks like a
    # debugging leftover - confirm before removing.
    break

Route 66
Geonames exact search found match
Chicago
Geonames exact search found match
Santa Monica
Geonames exact search found match
America
Geonames exact search found match
Route 66
Geonames exact search found match
Route 66
Geonames exact search found match
66
Geonames exact search found match
Tulsa
Geonames exact search found match
Route 66
Geonames exact search found match
Route 66
Geonames exact search found match
Pixar
Gisgraphy exact search found match
Cozy Dogs
exact place name search failed
Geonames and Gisgraphy don't know the place
DBpedia search
Result name United States  Query is:  Cozy Dogs
Result relevance is  0.091
Result name India  Query is:  Cozy Dogs
Result relevance is  0.0
Result name United Kingdom  Query is:  Cozy Dogs
Result relevance is  0.087


In [45]:
# Label of the first DBpedia hit from the most recent lookup made by the loop above.
dbpedia_lookup["docs"][0]["label"][0]["value"]

'United States'