In [5]:
import html2text
import json
import requests
import numpy as np
from bs4 import BeautifulSoup

In [6]:
def _external_links(soup):
    """Collect absolute, non-National-Geographic hrefs from the parsed page."""
    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # <a> tags without an href (named anchors, JS buttons) yield None.
            continue
        if "nationalgeographic" in href:
            continue
        if not href.startswith("http"):
            continue
        if len(href) > 5:
            links.append(href)
    return links


def _clean_article_text(raw_text, marker="ShareTweetEmail"):
    """Cut the text between two occurrences of ``marker`` and normalise it."""
    # Drop everything up to (and one character past) the first marker, but
    # only when the marker is present: find() returns -1 otherwise and the
    # slice would silently eat the beginning of the text.
    start = raw_text.find(marker)
    if start != -1:
        raw_text = raw_text[start + len(marker) + 1:]

    # Same guard for the closing marker: slicing with -1 would drop the
    # last character of the text.
    end = raw_text.find(marker)
    if end != -1:
        raw_text = raw_text[:end]

    text = raw_text.replace("\n\n", ". ")
    for token in ("\n", "[", "]", "*", "#", "_", "\\'", "\\"):
        text = text.replace(token, " ")
    text = text.replace('>', ", ")
    # Collapse runs of spaces of any length (a fixed number of passes only
    # handles runs up to a bounded size).
    while "  " in text:
        text = text.replace("  ", " ")
    text = text.replace("..", ".")
    return text.strip()


def text_from_html(url: str, timeout: float = 30.0) -> tuple:
    """
    Given the url of a webpage, return its readable text and outbound links.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    tuple
        ``(text, links)`` where ``text`` is the cleaned text found between the
        two "ShareTweetEmail" markers (or the whole page text when a marker is
        missing) and ``links`` is the list of hrefs that start with "http" and
        do not contain "nationalgeographic".

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection problems, timeout).
    """
    page_html = requests.get(url, timeout=timeout).text
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page_html, "html.parser")
    links = _external_links(soup)

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    article_text = _clean_article_text(text_maker.handle(page_html))

    return article_text, links

In [7]:
def get_text_and_places(articles_path: str = '/home/geoner/data/articles.json') -> dict:
    """
    Load the articles JSON file and attach the downloaded text to each entry.

    Parameters
    ----------
    articles_path : str, optional
        Location of the JSON file. It must hold a mapping whose values are
        dicts containing an "article_url" key. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    dict
        The loaded mapping, with a "text" key added to every entry holding
        the text returned by ``text_from_html`` for that entry's url.
    """
    with open(articles_path, encoding="utf-8") as json_file:
        data = json.load(json_file)
    for article in data.values():
        # text_from_html also returns the page links; they are not needed here.
        text, _ = text_from_html(article["article_url"])
        article["text"] = text
    return data

In [34]:
# Build `data` (articles plus their downloaded text) and time the call.
# NOTE(review): in the recorded output the wall time is far larger than the
# CPU time, presumably because of the HTTP requests -- confirm.
%time data = get_text_and_places()

CPU times: user 1.12 s, sys: 7.8 ms, total: 1.13 s
Wall time: 11.1 s


## NLTK

In [None]:
import nltk

In [61]:
# Chunk labels that are kept as place predictions.
ntlk_target_entities = ["GPE", "LOCATION", "FACILITY", "ORGANIZATION"]

# Article key -> list of entity strings extracted by NLTK.
nltk_pred = dict()
for article_id, article in data.items():
    found_places = []
    for sentence in nltk.sent_tokenize(article["text"]):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            # Only nodes exposing a label are candidate entities.
            if not hasattr(node, 'label'):
                continue
            if node.label() not in ntlk_target_entities:
                continue
            found_places.append(" ".join(leaf[0] for leaf in node.leaves()))
    nltk_pred[article_id] = found_places

# Per-article recall: share of annotated places found verbatim among the predictions.
ntlk_articles_recall = []
for article_id, article in data.items():
    gold_places = article["places_and_poi"]
    hits = sum(1 for place in gold_places if place in nltk_pred[article_id])
    ntlk_articles_recall.append(hits / len(gold_places))

In [62]:
# Average of the per-article recall values obtained with NLTK.
np.mean(ntlk_articles_recall)

0.526400097487054

## spaCy

In [9]:
import spacy

In [10]:
# Load the spaCy pipeline named 'en_core_web_trf'; `get_ne` below calls it as `nlp`.
nlp = spacy.load('en_core_web_trf')

In [35]:
def get_ne(txt):
    """
    Return the cleaned names of the place-like entities spaCy finds in ``txt``.

    Only entities labelled GPE, FAC, LOC or ORG are kept. Each name is
    stripped of surrounding whitespace, of a leading article "the"/"The"
    and of one trailing "." or ",".

    Parameters
    ----------
    txt : str
        Text to run through the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        Cleaned entity names, in the order the pipeline reports them.
    """
    ne_types = ("GPE", "FAC", "LOC", "ORG")
    target_ne = []
    for ent in nlp(txt).ents:
        if ent.label_ not in ne_types:
            continue
        name = ent.text.strip()
        # Remove the article only when it is a separate word; a plain
        # startswith("the") would also mangle names such as "Thebes".
        if name[:3] in ("the", "The") and name[3:4].isspace():
            name = name[4:]
        if name.endswith((".", ",")):
            name = name[:-1]
        target_ne.append(name.strip())
    return target_ne

In [39]:
# Article key -> list of entity names returned by get_ne for that article's text.
spacy_pred = {
    article_id: get_ne(article["text"])
    for article_id, article in data.items()
}

# Per-article recall: share of annotated places found verbatim among the predictions.
spacy_articles_recall = []
for article_id, article in data.items():
    gold_places = article["places_and_poi"]
    hits = sum(1 for place in gold_places if place in spacy_pred[article_id])
    spacy_articles_recall.append(hits / len(gold_places))

In [40]:
# Average of the per-article recall values obtained with spaCy.
np.mean(spacy_articles_recall)

0.9505920971138362

## Flair

In [None]:
from flair.models import SequenceTagger

In [None]:
# Load the Flair sequence tagger registered under the name 'ner'.
tagger = SequenceTagger.load('ner')