***Extracting Entities from Text***

In [1]:
import spacy
from spacy.lang.en.examples import sentences

In [2]:
# Load the large English NLP model
nlp = spacy.load("en_core_web_lg")

In [3]:
# The text we want to examine
text = """London is the capital and most populous city of England and
the United Kingdom. Standing on the River Thames in the south east
of the island of Great Britain, London has been a major settlement for
two millennia. It was founded by the Romans, who named it Londinium"""

# Parse the text with spaCy. This runs the entire NLP pipeline.
doc = nlp(text)

In [4]:
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

London (GPE)
England (GPE)
the United Kingdom (GPE)
the River Thames (LOC)
Great Britain (GPE)
London (GPE)
two millennia (DATE)
Romans (NORP)
Londinium (LOC)


***Building a data scrubber***

In [8]:
# Replace a token with "REDACTED" if it is a name
def replace_name_with_placeholder(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED]"
    else:
        return token.string 

In [10]:
# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    for ent in doc.ents:
        ent.merge()
    tokens = map(replace_name_with_placeholder, doc)
    return "".join(tokens)

In [11]:
s = """
In 1950, Alan Turing published his famous article "Computing Machinery
and Intelligence". In 1957, Noam 'Chomskys Syntactic Structures
revolutionized Linguistics with "universal grammar", a rule based system
of syntactic structures.
"""

print(scrub(s))


In 1950, [REDACTED]published his famous article "Computing Machinery
and Intelligence". In 1957, Noam 'Chomskys Syntactic Structures
revolutionized Linguistics with "universal grammar", a rule based system
of syntactic structures.



***Extracting Facts from Text***

In [12]:
import spacy
import textacy.extract
from pathlib import Path 

In [14]:
# The text we want to examine
text = Path("london.txt").read_text()

In [15]:
# Parse the document with spaCy
doc = nlp(text)

# Extract semi-structured statements 
statements = textacy.extract.semistructured_statements(doc, "London")

In [16]:
# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")

Here are the things I know about London:
 - the capital and most populous city of England and the United Kingdom.

 - a major settlement for two millennia
 - the world's most populous city from around 1831 to 1925
 - beyond all comparison the largest
town in England
 - still very compact
 - the world's largest city from about 1831 to 1925
 - the seat of the
Government of the United Kingdom
 - vulnerable to flooding.

 - "one of the World's
Greenest Cities" with more than 40 percent green space or open water
 - the most
populous city and metropolitan area of the European Union and the second most
populous in Europe
 - the 19th largest city and the 18th largest
metropolitan region in the world
 - Christian, and has a large number of churches, particularly
in the City of London
 - also home to sizeable Muslim, Hindu, Sikh, and Jewish
communities
 - also home to 42
Hindu temples
 - one of the pre-eminent financial centres of the world as the most
important location for international financ

***Extracting Noun Chunks***

In [None]:
import spacy
import textacy.extract
from pathlib import Path

# Load the large English NLP model
nlp = spacy.load("en_core_web_lg")

# The text we want to examine
text = Path("london.txt").read_text()

# Parse the document with spaCy
doc = nlp(text)

In [17]:
# Extract semi-structured statements
noun_chunks = textacy.extract.noun_chunks(doc, min_freq=3)

In [18]:
# Convert noun chunks to lowercase strings
noun_chunks = map(str, noun_chunks)
noun_chunks = map(str.lower, noun_chunks)

# Print out any nouns that are at least 2 words long
for noun_chunk in set(noun_chunks):
    if len(noun_chunk.split(" ")) > 1:
        print(noun_chunk)


major centre

london underground
large number
european union
population density
greater london

london school
major centre

population density
greater london's population
united kingdom
royal albert hall
2011 census
second world war
canary wharf
river thames
westminster abbey
west london
epping forest
london school
london eye
other city
new york city
greater london authority
central london
national gallery
regent's park
hampstead heath
tate modern
national statistics

eight royal parks
eight royal parks
royal opera house

city centre
london underground
inner london
east end
london's population
british museum
office space
great fire

other city
city centre

national gallery
outer london
south london
trafalgar square
