# Test 1

In [1]:
import spacy

In [2]:
# Load the large English NLP model ('en_core_web_lg').
# The resulting `nlp` object is called on raw text below to produce a parsed document.
nlp = spacy.load('en_core_web_lg')

In [3]:
# The text we want to examine.
# NOTE(review): the opening "( (listen), locally also )" looks like residue from
# stripped pronunciation markup -- confirm whether it should be cleaned up.
text = """Chicago ( (listen), locally also ), officially the City of Chicago, is the most populous city in the U.S. state of Illinois and the third most populous city in the United States. With an estimated population of 2,705,994 (2018), it is also the most populous city in the Midwestern United States. Chicago is the county seat of Cook County, the second most populous county in the US, with portions of the northwest city limits extending into DuPage County near O'Hare Airport. Chicago is the principal city of the Chicago metropolitan area, often referred to as Chicagoland. At nearly 10 million people, the metropolitan area is the third most populous in the nation.
"""

In [4]:
# Parse the text with spaCy. This runs the entire pipeline.
# The parsed result is kept in `doc`, which the next cell reads entities from.
doc = nlp(text)

In [5]:
# `doc` holds the parsed version of the text, and `doc.ents` exposes the
# named entities that were detected in it.
# Print each entity on its own line as "<entity text> (<entity label>)".
for ent in doc.ents:
    print("{} ({})".format(ent.text, ent.label_))

Chicago (GPE)
the City of Chicago (GPE)
U.S. (GPE)
Illinois (GPE)
third (ORDINAL)
the United States (GPE)
2,705,994 (CARDINAL)
2018 (DATE)
the Midwestern United States (GPE)
Chicago (GPE)
Cook County (GPE)
second (ORDINAL)
US (GPE)
DuPage County (GPE)
O'Hare Airport (FAC)
Chicago (GPE)
Chicago (GPE)
Chicagoland (GPE)
nearly 10 million (CARDINAL)
third (ORDINAL)


# Test 2

In [6]:
import spacy

# Load the large English NLP model
# NOTE(review): `nlp` was already loaded from the same model in In [2]; this
# reload is only needed if this section is run on its own.
nlp = spacy.load('en_core_web_lg')

In [7]:
# Replace a token with "[REDACTED]" if it is a name
def replace_name_with_placeholder(token):
    """Return the text to emit for ``token``, hiding it if it is a person name.

    Args:
        token: A spaCy ``Token`` (or any object exposing ``ent_iob``,
            ``ent_type_``, ``text_with_ws`` and ``whitespace_``).

    Returns:
        str: ``"[REDACTED]"`` followed by the token's own trailing whitespace
        when the token belongs to a ``PERSON`` entity; otherwise the token's
        original text including its trailing whitespace.
    """
    # ent_iob == 0 means no entity tag has been assigned to this token.
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        # Reuse the token's real trailing whitespace instead of always adding
        # a space, so a name followed directly by punctuation does not gain
        # a stray blank ("[REDACTED] ." -> "[REDACTED].").
        return "[REDACTED]" + token.whitespace_
    # `Token.string` was removed in spaCy v3; `text_with_ws` returns the same
    # value (text plus trailing whitespace) and also exists in spaCy v2.
    return token.text_with_ws

In [8]:
# Loop through all the entities in a document and redact the ones that are names
def scrub(text):
    """Return ``text`` with every detected person name replaced by a placeholder.

    Args:
        text (str): Raw text to parse with the module-level ``nlp`` pipeline.

    Returns:
        str: The text rebuilt token by token, with each token passed through
        ``replace_name_with_placeholder``.
    """
    doc = nlp(text)
    # Merge each multi-token entity into a single token so that a full name
    # ("Alan Turing") yields one placeholder instead of one per word.
    # `Span.merge()` was removed in spaCy v3; the `Doc.retokenize()` context
    # manager (available since spaCy 2.1) is its replacement and applies all
    # merges together when the block exits.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    return "".join(replace_name_with_placeholder(token) for token in doc)

In [9]:
# Sample text containing two person names; it is passed to scrub() below.
s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". In 1957, Noam Chomsky’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

In [10]:
# Run the sample text through scrub() and show the redacted result.
print(scrub(s))


In 1950, [REDACTED] published his famous article "Computing Machinery and Intelligence". In 1957, [REDACTED] 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.



# Test 3

In [11]:
import spacy
import textacy.extract

# Load the large English NLP model
# NOTE(review): `nlp` was already loaded from the same model in In [2] and In [6];
# this reload is only needed if this section is run on its own.
nlp = spacy.load('en_core_web_lg')

In [12]:
# The text we want to examine
# NOTE(review): "skyscraper.Chicago" below has no space after the period --
# confirm whether that is intentional, since it is part of the parsed input.
text = """Chicago, officially the City of Chicago, is the most populous city in the U.S. state of Illinois and the third 
most populous city in the United States. With an estimated population of 2,705,994 (2018), it is also the most populous 
city in the Midwestern United States. Chicago is the county seat of Cook County, the second most populous county in the US,
with portions of the northwest city limits extending into DuPage County near O'Hare Airport. Chicago is the principal city 
of the Chicago metropolitan area, often referred to as Chicagoland. At nearly 10 million people, the metropolitan area is 
the third most populous in the nation.
Located on the shores of freshwater Lake Michigan, Chicago was incorporated as a city in 1837 near a portage between the 
Great Lakes and the Mississippi River watershed and grew rapidly in the mid-19th century. After the Great Chicago Fire of 
1871, which destroyed several square miles and left more than 100,000 homeless, the city made a concerted effort to rebuild.
The construction boom accelerated population growth throughout the following decades, and by 1900, less than 30 years after
the great fire, Chicago was the fifth-largest city in the world. Chicago made noted contributions to urban planning and 
zoning standards, including new construction styles (including the Chicago School of architecture), the development of the
City Beautiful Movement, and the steel-framed skyscraper.Chicago is an international hub for finance, culture, commerce, 
industry, education, technology, telecommunications, and transportation. It is the site of the creation of the first 
standardized futures contracts, issued by the Chicago Board of Trade, which today is the largest and most diverse 
derivatives market in the world, generating 20% of all volume in commodities and financial futures alone. Depending on the
particular year, the city's O'Hare International Airport is routinely ranked as the world's first or second busiest airport
according to tracked data by the Airports Council International. The region also has the largest number of federal highways
and is the nation's railroad hub. Chicago was listed as an alpha global city by the Globalization and World Cities Research
Network, and it ranked seventh in the entire world in the 2017 Global Cities Index. The Chicago area has one of the highest
gross domestic products (GDP) in the world, generating $680 billion in 2017. In addition, the city has one of the world's 
most diversified and balanced economies, with no single industry employing more than 14% of the workforce. Chicago is home
to several Fortune 500 companies, including Allstate, Boeing, Exelon, Kraft Heinz, McDonald's, Mondelez International, 
Sears, United Airlines Holdings, and Walgreens.
Chicago's 58 million domestic and international visitors in 2018 made it the second most visited city in the nation, not 
far behind New York City's 65 million visitors in 2018. The city was ranked first in the 2018 Time Out City Life Index, a
global quality of life survey of 15,000 people in 32 cities. Landmarks in the city include Millennium Park, Navy Pier, the
Magnificent Mile, the Art Institute of Chicago, Museum Campus, the Willis (Sears) Tower, Grant Park, the Museum of Science
and Industry, and Lincoln Park Zoo. Chicago's culture includes the visual arts, literature, film, theatre, comedy 
(especially improvisational comedy), food, and music, particularly jazz, blues, soul, hip-hop, gospel, and electronic 
dance music including house music. Of the area's many colleges and universities, the University of Chicago, Northwestern University, and the University of Illinois at Chicago are classified as "highest research" doctoral universities.  Chicago has professional sports teams in each of the major professional leagues, including two Major League Baseball teams.
"""

# Parse the document with spaCy; the parsed `doc` is what the statement
# extraction in the next cell operates on.
doc = nlp(text)

In [13]:
# Extract semi-structured statements about "Chicago" from the parsed document.
# `entity` and `cue` are passed by keyword because newer textacy releases make
# them keyword-only and require `cue`; "be" is the documented default cue in
# older releases, so the call selects the same statements there.
# The result is materialized into a list so the printing cell below can be
# re-run without first re-running this one (a generator would be exhausted
# after a single pass).
statements = list(
    textacy.extract.semistructured_statements(doc, entity="Chicago", cue="be")
)

In [14]:
# Print the results: each statement unpacks into three parts, and only the
# last one (the fact) is shown, one bullet per statement.
print("Here are the things I know about Chicago:")

for subject, verb, fact in statements:
    print(" - {}".format(fact))

Here are the things I know about Chicago:
 - the most populous city in the U.S. state of Illinois and the third 
most populous city in the United States
 - the principal city 
of the Chicago metropolitan area, often referred to as Chicagoland
 - the fifth-largest city in the world
 - an international hub for finance, culture, commerce, 
industry, education, technology, telecommunications, and transportation
