# Entity Extratction with spaCy

In [1]:
## RUN THIS CELL FIRST

## This script installs libraries we'll use in the notebook
!bash entity-extraction-spacy.sh

In [None]:
# !pip install -U spacy

Model was build on ["OntoNotes" data set](https://catalog.ldc.upenn.edu/LDC2013T19). 

In [None]:
# !python -m spacy download en_core_web_sm

In [8]:
import spacy
import en_core_web_sm

In [9]:
nlp = en_core_web_sm.load()

In [12]:
doc = nlp(u"San Francisco considers banning sidewalk delivery robots")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

San Francisco 0 13 GPE


In [14]:
for ent in doc.ents:
    print(f'{ent.text}\t{ent.label_}\t{spacy.explain(ent.label_)}')

San Francisco	GPE	Countries, cities, states


More on [named entity recognition with spaCy](https://spacy.io/usage/linguistic-features#named-entities)

In [16]:
!wget -N https://s3.amazonaws.com/media.johnkeefe.net/class-modules/2018.05.24_BerlinRosen_Responsive_Records_100pgs.pdf

--2019-08-03 16:23:32--  https://s3.amazonaws.com/media.johnkeefe.net/class-modules/2018.05.24_BerlinRosen_Responsive_Records_100pgs.pdf
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.115.37
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.115.37|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7474142 (7.1M) [application/pdf]
Saving to: ‘2018.05.24_BerlinRosen_Responsive_Records_100pgs.pdf’


2019-08-03 16:23:32 (18.8 MB/s) - ‘2018.05.24_BerlinRosen_Responsive_Records_100pgs.pdf’ saved [7474142/7474142]



In [None]:
# !pip install PyPDF2

In [20]:
import PyPDF2
import json
from os.path import exists


In [None]:
jsonl_file = "nyc_docs.jsonl"
if not exists(jsonl_file):
    pdf_file = open('2018.05.24_BerlinRosen_Responsive_Records_100pgs.pdf', 'rb')
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    with open(jsonl_file, 'w') as f:
        for page_num in range(read_pdf.getNumPages()):
            page = read_pdf.getPage(page_num)
            page_content = page.extractText().encode('utf-8').decode("utf-8") 
            f.write(json.dumps({"_source": {"content": page_content}, "_id": f"p{page_num+1}"}) + "\n")

In [22]:
with open(jsonl_file, 'r') as f:
    for line in f:
        line = json.loads(line)
        text = line["_source"]["content"]
        page = line["_id"]
        doc = nlp(text)
        for ent in doc.ents:
            if (ent.label_ == "PERSON"):
                print(page, ent.text)

p1 Jonathan Rosen
p1 ¤87(2
p1 ¤87(2
p1 ¤87(2
p2 Jimmy Pan
 Associate Counsel
p3 Phil
p3 Henry Goldman
p3 Phil
p3 Gwyneth Paltrow
p3 Henry Goldman
p3 Jan
p4 Jonathan Rosen
p4 Peter RE
p4 Bill de Blasio
p4 Peter Ragone Cc
p4 Jonathan Rosen Subject
p4 De Blasio's
p4 Peter
p4 DeBlasio Clips
p4 DeBlasio
p4 DeBlasio
p4 Dexheimer
p4 Bill De
p4 John Kanas
p4 Betty Liu
p4 De Blasia
p5 Wilbur Ross
p5 Steve Schwarzman
p6 Monica ¥
p6 Thx
p6 Monica Rejecting
p6 Kenneth Lovett
p6 de Blasia
p6 Jeffrey Klein
p6 de Blasia
p6 Klein
p6 Klein
p6 Dean Skelos
p6 Rejecting de Blasia's
p6 Klein
p8 Jonathan Rosen
p8 Peter
p8 Wolfe
p8 Emma Subject
p8 Richard Brodsky's
p8 Richard Brodsky
p8 Hank Sheinkopf
p8 Mayor
p8 Mayor
p8 Bill de Blasia
p8 Andrew Cuomo
p8 Cuomo
p9 Cuomo
p9 de Blasio
p9 Heather Briccetti
p9 Cuomo
p9 de Blasio
p9 Sheldon
p9 Jeffrey Klein
p9 de Blasio
p9 de Blasio
p9 de Blasio
p9 Vincent Alvarez
p9 Cuomo
p9 de Blasio
p9 Cuomo
p9 de Blasio
p9 Cuomo
p9 de Blasio
p9 de Blasio
p10 Cuomo
p10 Richard

In [27]:
list_of_names = {}

with open(jsonl_file, 'r') as f:
    for line in f:
        line = json.loads(line)
        text = line["_source"]["content"]
        page = line["_id"]
        doc = nlp(text)
        
        # loop through the entities in the page
        for ent in doc.ents:
            
            # is the entity is a person ...
            if (ent.label_ == "PERSON"):
                
                # check if we already have this entity
                if ent.text in list_of_names:
                    list_of_names[ent.text] += " " + page
                else:
                    list_of_names[ent.text] = page

In [28]:
list_of_names

{"0'!)2%-!$!8": 'p43',
 '123!4/!5\'3678A)2,#\'B"CDEDF"GEGHDIEJ': 'p75 p75',
 '3\')#1)&#."6T)+': 'p24',
 '3-).+\';a2\'-&"F&F\',(\'\'(,-&.A8+&\',)&+F"#\'"9F\'A&F\'A(.:$\'E(>&\',-8+\'76"90\n ': 'p92',
 '4&)"1*,)\',5\'*7"\'/$)*7\'G%:*&*2': 'p24',
 '@levitandanOn Tue': 'p50',
 'A%.3#&S,4%&!90H3#2!J--)#.4': 'p16',
 "A511$'#0-)&')4": 'p78',
 'APoe-Kest@cityhall.nyc.gov': 'p58',
 'Adam': 'p83 p86',
 'Adam Dickter': 'p33',
 'Adams': 'p13 p13 p13 p14 p14 p14 p15 p15 p15 p18 p18 p18 p19 p19 p19 p19 p20 p32 p32 p32 p32 p32',
 'Adams Marti Cc:': 'p19',
 'Alexandra Jonathan Rosen EmanueL Chase RE': 'p65',
 'Alexandra Subject': 'p59',
 'Alicia': 'p58 p58 p58',
 'Alison Baumann': 'p75 p75 p75 p75 p75 p78 p78 p78 p80 p80 p80 p80 p81',
 'Alison Novak': 'p83 p87',
 'Amazon Local': 'p71 p71',
 'Amazon Local \n': 'p71',
 'Andrew Brent': 'p58',
 'Andrew Cuomo': 'p8 p38',
 'Ann Santry': 'p84 p88',
 'Ann-Asch': 'p34',
 'Anna': 'p58 p58',
 'Anna\n': 'p58',
 'Anne Carson Blair': 'p84 p88',
 'Anthony Marx': 'p39

In [41]:
for name, pages in sorted(list_of_names.items()):
    print(name + "   (" + pages + ")")

0'!)2%-!$!8   (p43)
123!4/!5'3678A)2,#'B"CDEDF"GEGHDIEJ   (p75 p75)
3')#1)&#."6T)+   (p24)
3-).+';a2'-&"F&F',(''(,-&.A8+&',)&+F"#'"9F'A&F'A(.:$'E(>&',-8+'76"90
    (p92)
4&)"1*,)',5'*7"'/$)*7'G%:*&*2   (p24)
@levitandanOn Tue   (p50)
A%.3#&S,4%&!90H3#2!J--)#.4   (p16)
A511$'#0-)&')4   (p78)
APoe-Kest@cityhall.nyc.gov   (p58)
Adam   (p83 p86)
Adam Dickter   (p33)
Adams   (p13 p13 p13 p14 p14 p14 p15 p15 p15 p18 p18 p18 p19 p19 p19 p19 p20 p32 p32 p32 p32 p32)
Adams Marti Cc:   (p19)
Alexandra Jonathan Rosen EmanueL Chase RE   (p65)
Alexandra Subject   (p59)
Alicia   (p58 p58 p58)
Alison Baumann   (p75 p75 p75 p75 p75 p78 p78 p78 p80 p80 p80 p80 p81)
Alison Novak   (p83 p87)
Amazon Local   (p71 p71)
Amazon Local 
   (p71)
Andrew Brent   (p58)
Andrew Cuomo   (p8 p38)
Ann Santry   (p84 p88)
Ann-Asch   (p34)
Anna   (p58 p58)
Anna
   (p58)
Anne Carson Blair   (p84 p88)
Anthony Marx   (p39)
Arnie Gross   (p88)
Ashley Subject   (p47)
Ashley Thompson   (p47)
Attachments   (p26)
Barbara Gratz   