In [1]:
import bz2
import cchardet # speed up lxml (html parsing) just by importing
import json
import lxml
import requests
from itertools import chain
import time
import unicodedata
from bs4 import BeautifulSoup
from text_processing_utils import *
import sys
# Put the parent directory of sys.path[0] on the import path so the `baselines` package
# imported on the next line can be resolved.
# NOTE(review): assumes sys.path[0] is the directory this notebook lives in — confirm for your kernel.
sys.path.append(sys.path[0] + '/..')
from baselines.hover.StanfordNLP import StanfordNLP
# CoreNLP handle used by `cite` when use_spacy is False; configured for port 9000, the same
# port the server start command below uses.
corenlp = StanfordNLP(port=9000)
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000 -quiet

import re
# Inline citation markers to strip from page text, e.g. "[1]", "[nb 2]" or "[nb3]".
# A raw string is required: "\[", "\d" and "\s" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
cite_pattern = re.compile(r"\[\d+\]|\[nb\s*\d+\]")
# Directory prefix that is prepended to each bz2 dump file name before it is opened.
data_loc = "../baselines/hover/data/enwiki_files/enwiki-2023-original/"

import spacy
spacy.prefer_gpu() # alternatively use: spacy.require_gpu()
# `cite` only reads sentence boundaries (`.sents`) from this pipeline, so the listed
# components are disabled at load time and a `sentencizer` component is added instead.
nlp = spacy.load("en_core_web_lg", disable=['tagger', 'parser', 'ner', 'lemmatizer'])
# Being the last expression of the cell, this call's return value is echoed as the cell output.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x78c774dbf540>

In [7]:
def find_empty_text(filepath: str, start_loc: str) -> list:
    """Find the articles in one bz2-compressed JSON-lines file that have no body text.

    Every non-blank line is parsed as a JSON object that must provide a 'url' key and a
    'fact_text' list. The first 'fact_text' element is kept for reporting; the remaining
    elements are flattened and joined, and the article counts as empty when that joined
    text is blank after stripping whitespace.

    Args:
        filepath: File name (or relative path) of the bz2 file.
        start_loc: Prefix that is concatenated in front of `filepath` as-is, so it must
            already end with a path separator.

    Returns:
        A list of `(url, first fact_text element)` tuples, one per empty article, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an article lacks the 'url' or 'fact_text' key.
    """
    results = []
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    with bz2.open(start_loc + filepath, "rt", encoding="utf-8") as file:
        for line in file:
            # A blank (e.g. trailing) line is not a JSON document; skip it.
            if not line.strip():
                continue
            wiki_article = json.loads(line)
            fact_text = wiki_article['fact_text']
            # Element 0 is reported separately; only the rest counts as body text.
            body_text = " ".join(chain.from_iterable(fact_text[1:]))
            if not body_text.strip():
                results.append((wiki_article['url'], fact_text[0]))
    return results

# Only the first 500 entries returned by search_file_paths are scanned.
# NOTE(review): search_file_paths, Parallel and delayed are not imported explicitly in the
# visible cells; presumably they arrive via `from text_processing_utils import *` — confirm.
search_paths = search_file_paths(data_loc)[:500]
# One find_empty_text call per file, dispatched with n_jobs=16; each call returns its own list.
not_found = Parallel(n_jobs=16, prefer=None)(delayed(find_empty_text)(bz2_filepath, data_loc) for bz2_filepath in search_paths)
# Flatten the per-file lists into a single list of (url, first fact_text element) tuples.
not_found = list(chain.from_iterable(not_found))

In [2]:
def cite(wiki_url, use_spacy, timeout=30):
    """Return the sentences of a web page that carry a citation, grouped by paragraph.

    The page is downloaded and every class-less <p>/<ul> element is split into sentences.
    Inside each of those elements the text that precedes every <sup class="reference">
    marker is collected; the last sentence of each collected text identifies a cited
    sentence, which is then mapped back to the full paragraph sentence containing it
    (a citation in the middle of a sentence would otherwise cut the sentence short).

    Args:
        wiki_url: URL of the page to download.
        use_spacy: If truthy, split sentences with the module-level spaCy pipeline `nlp`;
            otherwise use the module-level CoreNLP handle `corenlp`.
        timeout: Seconds handed to `requests.get` so an unresponsive server cannot block
            the call forever. Defaults to 30.

    Returns:
        A list with one entry per paragraph that contains at least one cited sentence.
        Each entry is a list of that paragraph's sentences which contain a cited sentence.

    Raises:
        requests.exceptions.RequestException: If the download fails or times out.
    """
    raw_page = requests.get(url=wiki_url, timeout=timeout)
    soup = BeautifulSoup(raw_page.text, 'lxml')
    cite_texts, parent_paras = [], []
    parent_tags = soup.find_all(['p', 'ul'], class_=None)
    for parent_tag in parent_tags:
        # Store parent sentences per paragraph sub-list
        parent_text = cite_pattern.sub('', unicodedata.normalize('NFD', parent_tag.get_text()).strip())
        if not parent_text:
            continue

        if use_spacy:
            parent_sents = [sent for p_sent in nlp(parent_text).sents for sent in p_sent.text.strip().split("\n")]
            parent_paras.append(parent_sents)
        else:
            corenlp_sents = []
            para_parse = corenlp.annotate(parent_text)
            for sent_parse in para_parse['sentences']:
                tokens = sent_parse['tokens']
                # A sentence without tokens has no character offsets to slice with.
                if not tokens:
                    continue
                start_idx = tokens[0]['characterOffsetBegin']
                end_idx = tokens[-1]['characterOffsetEnd']
                corenlp_sents.extend(parent_text[start_idx:end_idx].strip().split("\n"))
            parent_paras.append(corenlp_sents)

        # Find all citation tags in paragraph and extract text up to each citation tag.
        cite_tags = parent_tag.find_all('sup', {'class': 'reference'})
        for cite_tag in cite_tags:
            cite_text = cite_tag.get_text()
            if cite_text:
                # Everything after the final occurrence of the marker is uncited, hence [:-1].
                citated_text = parent_tag.get_text().split(cite_text)[:-1]
                cited = ''
                # Multiple citations can occur in a paragraph (or sentence) so concatenate previous parts
                for c in citated_text:
                    cited += c
                    # Only the current line (text after the last newline) is kept.
                    cleaned_text = cite_pattern.sub('', unicodedata.normalize('NFD', cited.split("\n")[-1])).strip()
                    if cleaned_text and cleaned_text not in cite_texts:
                        cite_texts.append(cleaned_text)

    # Get the actual last sentence of every cited text.
    sentences = []
    if use_spacy:
        for doc in nlp.pipe(cite_texts, batch_size=128, n_process=1):
            doc_sents = list(doc.sents)
            if doc_sents:
                sentences.append(doc_sents[-1].text)
    else:
        for para_text in cite_texts:
            parsed_sents = corenlp.annotate(para_text)['sentences']
            # Mirror the spaCy branch: skip texts for which no sentence was found.
            if not parsed_sents or not parsed_sents[-1]['tokens']:
                continue
            start_idx = parsed_sents[-1]['tokens'][0]['characterOffsetBegin']
            sentences.append(para_text[start_idx:])
    # Remove exact duplicates (first occurrence wins, order preserved) ...
    non_dupes = list(dict.fromkeys(sentences))
    # ... and sentences that are a sub-string of another cited sentence.
    filtered_sentences = [sent for sent in non_dupes if not any(sent in other_sent for other_sent in non_dupes if sent != other_sent)]
    # Get the full sentence (possibly sentence cut-off due to citation in middle of sentence).
    results = []
    for paragraph in parent_paras:
        para_sents = [p_sent for c_sent in filtered_sentences for p_sent in paragraph if c_sent in p_sent]
        if para_sents:
            results.append(para_sents)
    return results


In [3]:
# Extract the cited sentences of the Delft article with CoreNLP sentence splitting (use_spacy=False).
results = cite("https://en.wikipedia.org/wiki/Delft", False)
# Each element of `results` is the list of cited sentences of one paragraph.
for r in results:
    print(r)

['Historically, Delft played a highly influential role in the Dutch Golden Age.', 'In terms of science and technology, thanks to the pioneering contributions of Antonie van Leeuwenhoek and Martinus Beijerinck, Delft can be considered to be the birthplace of microbiology.']
['The Delft Explosion, also known in history as the Delft Thunderclap, occurred on 12 October 1654 when a gunpowder store exploded, destroying much of the city.', 'Over a hundred people were killed and thousands were injured.']
["The gunpowder store (Dutch: Kruithuis) was subsequently re-housed, a 'cannonball's distance away', outside the city, in a new building designed by architect Pieter Post."]
['The city centre retains a large number of monumental buildings, while in many streets there are canals of which the banks are connected by typical bridges, altogether making this city a notable tourist destination.']
["The Prinsenhof (Princes' Court), now a museum.", 'Restored to working order in 2013.', 'Royal Delft als

In [3]:
# Same page as the CoreNLP run, this time with spaCy sentence splitting (use_spacy=True).
results = cite("https://en.wikipedia.org/wiki/Delft", True)
# Each element of `results` is the list of cited sentences of one paragraph.
for r in results:
    print(r)

['Historically, Delft played a highly influential role in the Dutch Golden Age.', 'In terms of science and technology, thanks to the pioneering contributions of Antonie van Leeuwenhoek and Martinus Beijerinck, Delft can be considered to be the birthplace of microbiology.']
['The Delft Explosion, also known in history as the Delft Thunderclap, occurred on 12 October 1654 when a gunpowder store exploded, destroying much of the city.', 'Over a hundred people were killed and thousands were injured.']
["The gunpowder store (Dutch: Kruithuis) was subsequently re-housed, a 'cannonball's distance away', outside the city, in a new building designed by architect Pieter Post."]
['The city centre retains a large number of monumental buildings, while in many streets there are canals of which the banks are connected by typical bridges, altogether making this city a notable tourist destination.']
["The Prinsenhof (Princes' Court), now a museum.", 'Restored to working order in 2013.', 'Royal Delft als

In [8]:
import sqlite3, unicodedata
from contextlib import closing

title = "Faster-than-light"

# Print the stored text of the same article from the 2017 and the 2023 database.
for label, db_path in (
    ("2017 ", "../baselines/hover/data/db_files/enwiki-2017-cite-full.db"),
    ("2023 ", "../baselines/hover/data/db_files/enwiki-2023-cite-full-incorrect.db"),
):
    # closing() guarantees conn.close() runs even when the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        wiki_db = conn.cursor()
        # The title is NFD-normalized before it is used as the lookup key.
        row = wiki_db.execute("SELECT text FROM documents WHERE id = ?",
                              (unicodedata.normalize('NFD', title),)).fetchone()
    # fetchone() returns None when no row has this id; show None instead of failing on None[0].
    doc_text = row[0] if row is not None else None
    print(label, doc_text)

2017  Apparent" or "effective" FTL, on the other hand, depends on the hypothesis that unusually distorted regions of spacetime might permit matter to reach distant locations in less time than light could in normal ("undistorted") spacetime.[SENT]Examples of apparent FTL proposals are the Alcubierre drive, Krasnikov tubes, traversable wormholes, and quantum tunneling.[SENT]In the context of this article, FTL is the transmission of information or matter faster than c, a constant equal to the speed of light in vacuum, which is 299,792,458 m/s (by definition of the metre) or about 186,282.397 miles per second.[SENT]Proxima Centauri, the nearest star outside the Solar System, is about four and a half light-years away.[SENT]In this frame of reference, in which Proxima Centauri is perceived to be moving in a circular trajectory with a radius of four light years, it could be described as having a speed many times greater than c as the rim speed of an object moving in a circle is a product of t