In [1]:
import pandas as pd
import spacy
from spacy import displacy

In [2]:
# Load the small English pipeline with no components disabled: the cells below
# read POS tags, dependency labels and named entities from the parsed docs.
nlp = spacy.load("en_core_web_sm")

In [11]:
# Parse one sentence and tabulate the annotations spaCy attaches to each token.
doc = nlp("James Bond went to High School in Michigan")

pd.DataFrame(
    data=[
        (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.morph, tok.ent_type_)
        for tok in doc
    ],
    columns=['text', 'lemma', 'pos', 'tag', 'dep', 'morph', 'ent'],
)


Unnamed: 0,text,lemma,pos,tag,dep,morph,ent
0,James,James,PROPN,NNP,compound,(Number=Sing),PERSON
1,Bond,Bond,PROPN,NNP,nsubj,(Number=Sing),PERSON
2,went,go,VERB,VBD,ROOT,"(Tense=Past, VerbForm=Fin)",
3,to,to,ADP,IN,prep,(),
4,High,High,PROPN,NNP,compound,(Number=Sing),ORG
5,School,School,PROPN,NNP,pobj,(Number=Sing),ORG
6,in,in,ADP,IN,prep,(),
7,Michigan,Michigan,PROPN,NNP,pobj,(Number=Sing),GPE


In [4]:
# Pair every label known to the pipeline's "tagger" component with its
# glossary description from spacy.explain.
[
    (tag_label, spacy.explain(tag_label))
    for tag_label in nlp.get_pipe("tagger").labels
]


[('$', 'symbol, currency'),
 ("''", 'closing quotation mark'),
 (',', 'punctuation mark, comma'),
 ('-LRB-', 'left round bracket'),
 ('-RRB-', 'right round bracket'),
 ('.', 'punctuation mark, sentence closer'),
 (':', 'punctuation mark, colon or ellipsis'),
 ('ADD', 'email'),
 ('AFX', 'affix'),
 ('CC', 'conjunction, coordinating'),
 ('CD', 'cardinal number'),
 ('DT', 'determiner'),
 ('EX', 'existential there'),
 ('FW', 'foreign word'),
 ('HYPH', 'punctuation mark, hyphen'),
 ('IN', 'conjunction, subordinating or preposition'),
 ('JJ', 'adjective (English), other noun-modifier (Chinese)'),
 ('JJR', 'adjective, comparative'),
 ('JJS', 'adjective, superlative'),
 ('LS', 'list item marker'),
 ('MD', 'verb, modal auxiliary'),
 ('NFP', 'superfluous punctuation'),
 ('NN', 'noun, singular or mass'),
 ('NNP', 'noun, proper singular'),
 ('NNPS', 'noun, proper plural'),
 ('NNS', 'noun, plural'),
 ('PDT', 'predeterminer'),
 ('POS', 'possessive ending'),
 ('PRP', 'pronoun, personal'),
 ('PRP$', 'pronoun, possessive'),
 ...]

In [6]:
# Keep only the tokens of `doc` whose coarse POS tag is PROPN.
[tok for tok in doc if tok.pos_ == "PROPN"]

[James, Bond, High, School, Michigan]

In [7]:
# Keep only the tokens of `doc` whose coarse POS tag is VERB.
[tok for tok in doc if tok.pos_ == "VERB"]

[went]

In [8]:
# Keep only the tokens of `doc` whose coarse POS tag is ADJ.
[tok for tok in doc if tok.pos_ == "ADJ"]

[]

### Named entity recognition

In [12]:
# List each entity span found in `doc` together with its label.
[
    (entity.text, entity.label_)
    for entity in doc.ents
]

[('James Bond', 'PERSON'), ('High School', 'ORG'), ('Michigan', 'GPE')]

In [13]:
# Visualise the entities of `doc` with displaCy's "ent" style.
displacy.render(doc, style="ent")

In [14]:
# Second example sentence; the entity cells that follow read from `doc2`.
doc2 = nlp("Apple is looking at buying U.K. startup DayOff for $1 billion")

In [16]:
# List each entity span found in `doc2` together with its label.
[
    (entity.text, entity.label_)
    for entity in doc2.ents
]

[('Apple', 'ORG'), ('U.K.', 'GPE'), ('DayOff', 'ORG'), ('$1 billion', 'MONEY')]

In [17]:
# Visualise the entities of `doc2` with displaCy's "ent" style.
displacy.render(doc2, style="ent")

In [19]:
# Token-level view of the entities: keep only tokens that carry a non-empty
# entity type, so a multi-token span shows up as one row per token.
[
    (tok.text, tok.ent_type_)
    for tok in doc2
    if tok.ent_type_
]

[('Apple', 'ORG'),
 ('U.K.', 'GPE'),
 ('DayOff', 'ORG'),
 ('$', 'MONEY'),
 ('1', 'MONEY'),
 ('billion', 'MONEY')]

In [20]:
# Look up the glossary description of the 'GPE' label.
spacy.explain('GPE')

'Countries, cities, states'

In [21]:
# Re-point `doc` at a new sentence and list every noun chunk next to the text
# of its root token.
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
pd.DataFrame(
    data=[(np_chunk.text, np_chunk.root.text) for np_chunk in doc.noun_chunks],
    columns=['noun phrase', 'noun'],
)

Unnamed: 0,noun phrase,noun
0,Autonomous cars,cars
1,insurance liability,liability
2,manufacturers,manufacturers


In [23]:
# Re-point `doc` at a new sentence and draw its dependency parse, which shows
# how the tokens relate to one another.
doc = nlp('Satellites spot whales from space')
# Options handed to displaCy for the "dep" rendering.
options = {
    'compact': True,
    'add_lemma': False,
    'distance': 150,
}
displacy.render(doc, style="dep", options=options)

In [24]:
# Tabulate, for each token of `doc`, its dependency label, the glossary
# description of that label, and the text of its head token.
pd.DataFrame(
    data=[
        (tok.text, tok.dep_, spacy.explain(tok.dep_), tok.head.text)
        for tok in doc
    ],
    columns=['text', 'dep1', 'dep2', 'headtext'],
)



Unnamed: 0,text,dep1,dep2,headtext
0,Satellites,nsubj,nominal subject,spot
1,spot,ROOT,,spot
2,whales,dobj,direct object,spot
3,from,prep,prepositional modifier,whales
4,space,pobj,object of preposition,from
