In [5]:
import html2text
import json
import requests
import numpy as np
from bs4 import BeautifulSoup

In [6]:
def text_from_html(url: str, timeout: float = 30.0):
    """
    Download a web page and return its readable text plus its external links.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up, so one
        stalled host cannot hang the whole notebook run.

    Returns
    -------
    tuple[str, list[str]]
        The cleaned text of the page, and every absolute ("http...") link on
        the page that does not contain "nationalgeographic".
    """
    road_trip = requests.get(url, timeout=timeout).text
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed on the machine.
    soup = BeautifulSoup(road_trip, "html.parser")

    links = []
    for anchor in soup.find_all('a'):
        http_link = anchor.get('href')
        if not http_link:
            # <a> tags without an href yield None; nothing to collect.
            continue
        if "nationalgeographic" in http_link:
            continue
        if not http_link.startswith("http"):
            continue
        if len(http_link) > 5:
            links.append(http_link)

    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_images = True
    road_trip_processed = text_maker.handle(road_trip)

    # Keep only the part between the first two "ShareTweetEmail" markers.
    # str.find returns -1 when the marker is absent; in that case leave the
    # text untouched instead of slicing with a bogus index.
    marker = "ShareTweetEmail"
    start = road_trip_processed.find(marker)
    if start != -1:
        road_trip_processed = road_trip_processed[start + len(marker) + 1:]
    end = road_trip_processed.find(marker)
    if end != -1:
        road_trip_processed = road_trip_processed[:end]

    # Paragraph breaks become sentence breaks; markdown residue becomes spaces.
    road_trip_processed = road_trip_processed.replace("\n\n", ". ")
    for i in ["\n", "[", "]", "*", "#", "_", "\\'", '\\']:
        road_trip_processed = road_trip_processed.replace(i, " ")
    road_trip_processed = road_trip_processed.replace('>', ", ")
    # Collapse runs of spaces completely rather than for a fixed pass count.
    while "  " in road_trip_processed:
        road_trip_processed = road_trip_processed.replace("  ", " ")
    road_trip_processed = road_trip_processed.replace("..", ".")

    return road_trip_processed.strip(), links

In [7]:
def get_text_and_places(articles_path: str = '/home/geoner/data/articles.json') -> dict:
    """
    Load the article index and attach the downloaded text to every entry.

    Parameters
    ----------
    articles_path : str, optional
        Path of the JSON file to read. The default is the location the
        notebook was originally run with; pass another path to run elsewhere.

    Returns
    -------
    dict
        The parsed JSON mapping, where each entry has gained a "text" key
        holding the cleaned text fetched from its "article_url".
    """
    with open(articles_path) as json_file:
        data = json.load(json_file)
    for article in data.values():
        # The list of links returned alongside the text is not needed here.
        text, _ = text_from_html(article["article_url"])
        article["text"] = text
    return data

In [8]:
# Build `data` (article index plus downloaded text); `%time` reports how long the call takes.
%time data = get_text_and_places()

CPU times: user 1 s, sys: 8.9 ms, total: 1.01 s
Wall time: 9.9 s


## NLTK

In [8]:
import nltk

In [9]:
# Predictions: article id -> list of place-like entity strings found by NLTK.
nltk_pred = {}
ntlk_target_entities = ["GPE", "LOCATION", "FACILITY", "ORGANIZATION"]

for article_id in data:
    found_places = []
    for sentence in nltk.sent_tokenize(data[article_id]["text"]):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged_tokens):
            # Only chunk subtrees carry a label; plain (word, tag) tuples do not.
            if not hasattr(node, 'label'):
                continue
            if node.label() not in ntlk_target_entities:
                continue
            found_places.append(" ".join(leaf[0] for leaf in node.leaves()))
    nltk_pred[article_id] = found_places

# Per-article recall: share of annotated places that appear verbatim in the predictions.
ntlk_articles_recall = []
for article_id in data:
    gold_places = data[article_id]["places_and_poi"]
    hits = sum(1 for place in gold_places if place in nltk_pred[article_id])
    ntlk_articles_recall.append(hits / len(gold_places))

In [10]:
np.mean(ntlk_articles_recall)

0.5251821487691053

## spaCy

In [28]:
import spacy

In [29]:
# Load the spaCy pipeline once; `get_ne` reads this module-level `nlp`.
nlp = spacy.load('en_core_web_trf')

In [72]:
def get_ne(txt):
    """
    Extract place-like named entities from `txt` with the spaCy pipeline `nlp`.

    Only entities labelled GPE, FAC, LOC or ORG are kept. Each entity string
    is normalised by dropping a leading "the " article and a trailing comma,
    then stripping surrounding whitespace.

    Returns
    -------
    list[str]
        The normalised entity strings, in document order (duplicates kept).
    """
    ne_types = ("GPE", "FAC", "LOC", "ORG")
    target_ne = []
    for ent in nlp(txt).ents:
        if ent.label_ not in ne_types:
            continue
        name = str(ent).strip()
        # Match the article together with its space, so that names that
        # merely begin with the letters "the" (e.g. "theatre ...") stay whole.
        if name.startswith("the "):
            name = name[4:]
        if name.endswith(','):
            name = name[:-1]
        target_ne.append(name.strip())
    return target_ne

In [73]:
# Predictions: article id -> entity strings extracted by spaCy.
spacy_pred = {}
for article_id in data:
    spacy_pred[article_id] = get_ne(data[article_id]["text"])

# Per-article recall: share of annotated places that appear verbatim in the predictions.
spacy_articles_recall = []
for article_id in data:
    gold_places = data[article_id]["places_and_poi"]
    hits = sum(1 for place in gold_places if place in spacy_pred[article_id])
    spacy_articles_recall.append(hits / len(gold_places))

In [74]:
np.mean(spacy_articles_recall)

0.9575572615790007

## Flair

In [1]:
from flair.models import SequenceTagger
from flair.data import Sentence

In [2]:
# Load the Flair NER tagger; `get_flair_ne` reads this module-level `tagger`.
# NOTE(review): the StanfordNER section further down rebinds `tagger`, so re-running
# the Flair cells after it would call the wrong object.
tagger = SequenceTagger.load('flair/ner-english')

2021-04-15 17:55:07,321 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


In [27]:
def get_flair_ne(txt: str):
    """
    Run the module-level Flair `tagger` over `txt` and return entity strings.

    Every span whose tag is not "PER" is kept; one trailing right single
    quotation mark (’) is removed from a span's text when present.
    """
    flair_sentence = Sentence(txt)
    tagger.predict(flair_sentence)

    names = []
    for span in flair_sentence.get_spans():
        if span.tag == "PER":
            continue
        plain = span.to_plain_string()
        names.append(plain[:-1] if plain.endswith("’") else plain)
    return names

In [32]:
# Predictions: article id -> entity strings extracted by Flair.
flair_pred = {}
for article_id in data:
    flair_pred[article_id] = get_flair_ne(data[article_id]["text"])

# Per-article recall: share of annotated places that appear verbatim in the predictions.
flair_articles_recall = []
for article_id in data:
    gold_places = data[article_id]["places_and_poi"]
    hits = sum(1 for place in gold_places if place in flair_pred[article_id])
    flair_articles_recall.append(hits / len(gold_places))

In [33]:
np.mean(flair_articles_recall)

0.8971442969269056

## StanfordNER

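Start the Stanford NER server on localhost:8878 before running the cells below (replace `[classifier]` with one of the classifier files named in the result cells):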
/home/stanfordner/stanford-ner-2020-11-17# java -mx1000m -cp stanford-ner-4.2.0.jar edu.stanford.nlp.ie.NERServer -loadClassifier classifiers/[classifier] -port 8878 -outputFormat inlineXML

In [39]:
import ner

In [54]:
# Client for the Stanford NER server listening on localhost:8878; `get_stanford_ne` reads it.
# NOTE(review): this rebinds `tagger`, the same name the Flair section assigns and
# `get_flair_ne` reads, so the Flair cells cannot be re-run after this one without reloading.
tagger = ner.SocketNER(host='localhost', port=8878)

In [55]:
def get_stanford_ne(txt):
    """
    Return the LOCATION and ORGANIZATION entities the Stanford NER server finds in `txt`.

    The module-level `tagger` is asked for all entities, grouped by type.
    A type with no hits is treated as an empty list rather than a KeyError
    (presumably the result only has keys for types that occurred; confirm
    against the `ner` client).

    Returns
    -------
    list
        LOCATION entities followed by ORGANIZATION entities.
    """
    ne_dict = tagger.get_entities(txt)
    return ne_dict.get("LOCATION", []) + ne_dict.get("ORGANIZATION", [])

In [56]:
# Predictions: article id -> entity strings returned by the Stanford NER server.
stanford_pred = {}
for article_id in data:
    stanford_pred[article_id] = get_stanford_ne(data[article_id]["text"])

# Per-article recall: share of annotated places that appear verbatim in the predictions.
stanford_articles_recall = []
for article_id in data:
    gold_places = data[article_id]["places_and_poi"]
    hits = sum(1 for place in gold_places if place in stanford_pred[article_id])
    stanford_articles_recall.append(hits / len(gold_places))

In [36]:
# english.all.3class.distsim.crf.ser.gz
np.mean(stanford_articles_recall)

0.6565522118782988

In [53]:
# english.conll.4class.distsim.crf.ser.gz
np.mean(stanford_articles_recall)

0.7034879050096442

In [57]:
# english.muc.7class.distsim.crf.ser.gz 
np.mean(stanford_articles_recall)

0.6244530147791016

## AllenNLP

In [1]:
from allennlp_models import pretrained

In [3]:
# Model card: https://github.com/allenai/allennlp-models/blob/main/allennlp_models/modelcards/tagging-fine-grained-transformer-crf-tagger.json
# Load the pretrained predictor by its model-card id; `get_allennlp_ne` reads this module-level `predictor`.
predictor = pretrained.load_predictor("tagging-fine-grained-transformer-crf-tagger")
# Tags use the BILUO scheme: B-/I-/L- mark the beginning, inside and last token of a
# multi-token entity, U- marks a single-token entity, and "O" marks tokens outside any entity.

lerc is not a registered model.
visual-entailment is not a registered model.


In [75]:
def get_allennlp_ne(txt: str):
    """
    Extract place-like entities from `txt` with the module-level AllenNLP `predictor`.

    The predictor's per-token BILUO tags are decoded into entity strings:
    a U- tag yields a single-token entity, and a B- ... L- run yields the
    tokens joined by spaces. Only the types LOC, GPE, ORG, FAC, PRODUCT and
    WORK_OF_ART are kept. I-/L- tags that arrive without an open B- tag are
    ignored instead of failing or being glued onto a previous entity.

    Each entity is then normalised: a leading "the " article, a trailing
    "'s" and a trailing comma are removed, surrounding whitespace is
    stripped, and a detached possessive (" 's") is re-attached.

    Returns
    -------
    list[str]
        The normalised entity strings, in document order (duplicates kept).
    """
    target_types = ("LOC", "GPE", "ORG", "FAC", "PRODUCT", "WORK_OF_ART")
    allen_result = predictor.predict(txt)

    ents = []
    open_tokens = None  # tokens of the multi-token entity currently being built
    for word, tag in zip(allen_result["words"], allen_result["tags"]):
        if tag == "O":
            continue
        # Split on the first dash only, so a dash inside a type name cannot
        # break the two-way unpacking.
        ent_position, ent_type = tag.split("-", 1)
        if ent_type not in target_types:
            continue
        if ent_position == "U":
            ents.append(word)
        elif ent_position == "B":
            open_tokens = [word]
        elif open_tokens is not None:
            if ent_position == "I":
                open_tokens.append(word)
            elif ent_position == "L":
                open_tokens.append(word)
                ents.append(" ".join(open_tokens))
                open_tokens = None

    cleaned = []
    for ent in ents:
        # Match the article together with its space, so that names that
        # merely begin with the letters "the" (e.g. "theatre ...") stay whole.
        if ent.startswith("the "):
            ent = ent[4:]
        if ent.endswith("'s"):
            ent = ent[:-2]
        if ent.endswith(','):
            ent = ent[:-1]
        ent = ent.strip()
        cleaned.append(ent.replace(" 's", "'s"))
    return cleaned

In [76]:
# Predictions: article id -> entity strings extracted by the AllenNLP predictor.
allennlp_pred = {}
for article_id in data:
    allennlp_pred[article_id] = get_allennlp_ne(data[article_id]["text"])

# Per-article recall: share of annotated places that appear verbatim in the predictions.
allennlp_articles_recall = []
for article_id in data:
    gold_places = data[article_id]["places_and_poi"]
    hits = sum(1 for place in gold_places if place in allennlp_pred[article_id])
    allennlp_articles_recall.append(hits / len(gold_places))

In [77]:
np.mean(allennlp_articles_recall)

0.8018399273834056