# Verifications: spaCy 

- Target: To perform parsing and entity recognition using spaCy
- Keywords: Tokenization, POS tagging, Dependency analysis, Word similarity measurement, Entity recognition

## 1. Parsing (分かち書きとPOSタギング)

In [1]:
import spacy

In [2]:
# Load the English model registered under the shortcut name 'en'
nlp = spacy.load('en')

In [3]:
# Sample input: two short sentences, each terminated by a newline character
text = u"We are living in Singapore.\nIt's blazing outside today!\n"

In [4]:
# Run the loaded model on the sample text; the result is iterated token by token in the following cells
doc = nlp(text)

In [5]:
# For each token, print its text together with the integer IDs of its
# lemma, fine-grained tag and coarse-grained POS
for tok in doc:
    id_row = (tok.text, tok.lemma, tok.tag, tok.pos)
    print(id_row)

('We', 561228191312463089, 13656873538139661788, 94)
('are', 10382539506755952630, 9188597074677201817, 99)
('living', 13874798850131827181, 1534113631682161808, 99)
('in', 3002984154512732771, 1292078113972184607, 84)
('Singapore', 10329536245932617809, 15794550382381185553, 95)
('.', 12646065887601541794, 12646065887601541794, 96)
('\n', '\n', 0, 102)
('It', 561228191312463089, 13656873538139661788, 94)
("'s", 10382539506755952630, 13927759927860985106, 99)
('blazing', 14126656987735467782, 1534113631682161808, 99)
('outside', 12341974070768608367, 164681854541413346, 85)
('today', 11042482332948150395, 15308085513773655218, 91)
('!', 17494803046312582752, 12646065887601541794, 96)
('\n', '\n', 0, 102)


In [6]:
# For each token, print its text together with the string forms of its
# lemma (the base form of the word), fine-grained tag and coarse-grained POS
for tok in doc:
    str_row = (tok.text, tok.lemma_, tok.tag_, tok.pos_)
    print(str_row)

('We', '-PRON-', 'PRP', 'PRON')
('are', 'be', 'VBP', 'VERB')
('living', 'live', 'VBG', 'VERB')
('in', 'in', 'IN', 'ADP')
('Singapore', 'singapore', 'NNP', 'PROPN')
('.', '.', '.', 'PUNCT')
('\n', '\n', '', 'SPACE')
('It', '-PRON-', 'PRP', 'PRON')
("'s", 'be', 'VBZ', 'VERB')
('blazing', 'blaze', 'VBG', 'VERB')
('outside', 'outside', 'RB', 'ADV')
('today', 'today', 'NN', 'NOUN')
('!', '!', '.', 'PUNCT')
('\n', '\n', '', 'SPACE')


### Tag, POS tag 一覧

<table class="c-table o-block"><tr class="c-table__row"><th class="c-table__head-cell u-text-label">Tag</th><th class="c-table__head-cell u-text-label">POS</th><th class="c-table__head-cell u-text-label">Morphology</th></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>-LRB-</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=brck</code> <code>PunctSide=ini</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>-RRB-</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=brck</code> <code>PunctSide=fin</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>,</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=comm</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>:</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>.</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=peri</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>''</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=quot</code> <code>PunctSide=fin</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>&quot;&quot;</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=quot</code> <code>PunctSide=fin</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>#</code></td><td class="c-table__cell u-text"> <code>SYM</code></td><td class="c-table__cell u-text"> <code>SymType=numbersign</code></td></tr><tr 
class="c-table__row"><td class="c-table__cell u-text"> <code>``</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=quot</code> <code>PunctSide=ini</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>$</code></td><td class="c-table__cell u-text"> <code>SYM</code></td><td class="c-table__cell u-text"> <code>SymType=currency</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>ADD</code></td><td class="c-table__cell u-text"> <code>X</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>AFX</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>Hyph=yes</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>BES</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>CC</code></td><td class="c-table__cell u-text"> <code>CONJ</code></td><td class="c-table__cell u-text"> <code>ConjType=coor</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>CD</code></td><td class="c-table__cell u-text"> <code>NUM</code></td><td class="c-table__cell u-text"> <code>NumType=card</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>DT</code></td><td class="c-table__cell u-text"> <code>DET</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>EX</code></td><td class="c-table__cell u-text"> <code>ADV</code></td><td class="c-table__cell u-text"> <code>AdvType=ex</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>FW</code></td><td class="c-table__cell u-text"> <code>X</code></td><td class="c-table__cell u-text"> <code>Foreign=yes</code></td></tr><tr 
class="c-table__row"><td class="c-table__cell u-text"> <code>GW</code></td><td class="c-table__cell u-text"> <code>X</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>HVS</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>HYPH</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>PunctType=dash</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>IN</code></td><td class="c-table__cell u-text"> <code>ADP</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>JJ</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>Degree=pos</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>JJR</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>Degree=comp</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>JJS</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>Degree=sup</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>LS</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"> <code>NumType=ord</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>MD</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> <code>VerbType=mod</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>NFP</code></td><td class="c-table__cell u-text"> <code>PUNCT</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> 
<code>NIL</code></td><td class="c-table__cell u-text"></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>NN</code></td><td class="c-table__cell u-text"> <code>NOUN</code></td><td class="c-table__cell u-text"> <code>Number=sing</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>NNP</code></td><td class="c-table__cell u-text"> <code>PROPN</code></td><td class="c-table__cell u-text"> <code>NounType=prop</code> <code>Number=sing</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>NNPS</code></td><td class="c-table__cell u-text"> <code>PROPN</code></td><td class="c-table__cell u-text"> <code>NounType=prop</code> <code>Number=plur</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>NNS</code></td><td class="c-table__cell u-text"> <code>NOUN</code></td><td class="c-table__cell u-text"> <code>Number=plur</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>PDT</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>AdjType=pdt</code> <code>PronType=prn</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>POS</code></td><td class="c-table__cell u-text"> <code>PART</code></td><td class="c-table__cell u-text"> <code>Poss=yes</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>PRP</code></td><td class="c-table__cell u-text"> <code>PRON</code></td><td class="c-table__cell u-text"> <code>PronType=prs</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>PRP$</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>PronType=prs</code> <code>Poss=yes</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>RB</code></td><td class="c-table__cell u-text"> <code>ADV</code></td><td class="c-table__cell 
u-text"> <code>Degree=pos</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>RBR</code></td><td class="c-table__cell u-text"> <code>ADV</code></td><td class="c-table__cell u-text"> <code>Degree=comp</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>RBS</code></td><td class="c-table__cell u-text"> <code>ADV</code></td><td class="c-table__cell u-text"> <code>Degree=sup</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>RP</code></td><td class="c-table__cell u-text"> <code>PART</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>SP</code></td><td class="c-table__cell u-text"> <code>SPACE</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>SYM</code></td><td class="c-table__cell u-text"> <code>SYM</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>TO</code></td><td class="c-table__cell u-text"> <code>PART</code></td><td class="c-table__cell u-text"> <code>PartType=inf</code> <code>VerbForm=inf</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>UH</code></td><td class="c-table__cell u-text"> <code>INTJ</code></td><td class="c-table__cell u-text"></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>VB</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> <code>VerbForm=inf</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>VBD</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> <code>VerbForm=fin</code> <code>Tense=past</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>VBG</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> 
<code>VerbForm=part</code> <code>Tense=pres</code> <code>Aspect=prog</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>VBN</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> <code>VerbForm=part</code> <code>Tense=past</code> <code>Aspect=perf</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>VBP</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> <code>VerbForm=fin</code> <code>Tense=pres</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>VBZ</code></td><td class="c-table__cell u-text"> <code>VERB</code></td><td class="c-table__cell u-text"> <code>VerbForm=fin</code> <code>Tense=pres</code> <code>Number=sing</code> <code>Person=3</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>WDT</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>PronType=int|rel</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>WP</code></td><td class="c-table__cell u-text"> <code>NOUN</code></td><td class="c-table__cell u-text"> <code>PronType=int|rel</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>WP$</code></td><td class="c-table__cell u-text"> <code>ADJ</code></td><td class="c-table__cell u-text"> <code>Poss=yes</code> <code>PronType=int|rel</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>WRB</code></td><td class="c-table__cell u-text"> <code>ADV</code></td><td class="c-table__cell u-text"> <code>PronType=int|rel</code></td></tr><tr class="c-table__row"><td class="c-table__cell u-text"> <code>XX</code></td><td class="c-table__cell u-text"> <code>X</code></td><td class="c-table__cell u-text"></td></tr></table>

### タグの定義一覧

<table cellpadding="2" cellspacing="2" border="0">
  <tr bgcolor="#DFDFFF" align="none"> 
    <td align="none"> 
      <div align="left">Number</div>
    </td>
    <td> 
      <div align="left">Tag</div>
    </td>
    <td> 
      <div align="left">Description</div>
    </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 1. </td>
    <td>CC </td>
    <td>Coordinating conjunction </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 2. </td>
    <td>CD </td>
    <td>Cardinal number </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 3. </td>
    <td>DT </td>
    <td>Determiner </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 4. </td>
    <td>EX </td>
    <td>Existential <i>there</i> </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 5. </td>
    <td>FW </td>
    <td>Foreign word </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 6. </td>
    <td>IN </td>
    <td>Preposition or subordinating conjunction </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 7. </td>
    <td>JJ </td>
    <td>Adjective </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 8. </td>
    <td>JJR </td>
    <td>Adjective, comparative </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 9. </td>
    <td>JJS </td>
    <td>Adjective, superlative </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 10. </td>
    <td>LS </td>
    <td>List item marker </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 11. </td>
    <td>MD </td>
    <td>Modal </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 12. </td>
    <td>NN </td>
    <td>Noun, singular or mass </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 13. </td>
    <td>NNS </td>
    <td>Noun, plural </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 14. </td>
    <td>NNP </td>
    <td>Proper noun, singular </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 15. </td>
    <td>NNPS </td>
    <td>Proper noun, plural </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 16. </td>
    <td>PDT </td>
    <td>Predeterminer </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 17. </td>
    <td>POS </td>
    <td>Possessive ending </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 18. </td>
    <td>PRP </td>
    <td>Personal pronoun </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 19. </td>
    <td>PRP$ </td>
    <td>Possessive pronoun </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 20. </td>
    <td>RB </td>
    <td>Adverb </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 21. </td>
    <td>RBR </td>
    <td>Adverb, comparative </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 22. </td>
    <td>RBS </td>
    <td>Adverb, superlative </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 23. </td>
    <td>RP </td>
    <td>Particle </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 24. </td>
    <td>SYM </td>
    <td>Symbol </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 25. </td>
    <td>TO </td>
    <td><i>to</i> </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 26. </td>
    <td>UH </td>
    <td>Interjection </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 27. </td>
    <td>VB </td>
    <td>Verb, base form </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 28. </td>
    <td>VBD </td>
    <td>Verb, past tense </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 29. </td>
    <td>VBG </td>
    <td>Verb, gerund or present participle </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 30. </td>
    <td>VBN </td>
    <td>Verb, past participle </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 31. </td>
    <td>VBP </td>
    <td>Verb, non-3rd person singular present </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 32. </td>
    <td>VBZ </td>
    <td>Verb, 3rd person singular present </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 33. </td>
    <td>WDT </td>
    <td>Wh-determiner </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 34. </td>
    <td>WP </td>
    <td>Wh-pronoun </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 35. </td>
    <td>WP$ </td>
    <td>Possessive wh-pronoun </td>
  </tr>
  <tr bgcolor="#FFFFCA"> 
    <td align="none"> 36. </td>
    <td>WRB </td>
    <td>Wh-adverb </td>
  </tr>
</table>


### 各トークンについて得られるその他の情報
ここでは主語Sakamotoを例にとってその他情報の主要なものを表示してみる

In [7]:
# Token attribute reference: https://spacy.io/docs/api/token
doc_ps = nlp("Mr.Sakamoto told us the Dragon Fruits was very yummy!")
t = doc_ps[2]

# (label printed in front of the value, name of the Token attribute to read).
# NOTE(review): the prefix_ label mentions an "Integer ID", yet the recorded
# output for this cell shows a plain string for prefix_ - confirm the wording
# against the Token API reference.
attribute_labels = [
    ("vocab (The vocab object of the parent Doc):", "vocab"),
    ("doc (The parent document.):", "doc"),
    ("i (The index of the token within the parent document.):", "i"),
    ("ent_type_ (Named entity type.):", "ent_type_"),
    ("ent_iob_ (IOB code of named entity tag):", "ent_iob_"),
    ("ent_id_ (ID of the entity the token is an instance of):", "ent_id_"),
    ("lemma_ (Base form of the word, with no inflectional suffixes.):", "lemma_"),
    ("lower_ (Lower-case form of the word.):", "lower_"),
    ("shape_ (A transform of the word's string, to show orthographic features.):", "shape_"),
    ("prefix_ (Integer ID of a length-N substring from the start of the word):", "prefix_"),
    ("suffix_ (Length-N substring from the end of the word):", "suffix_"),
    ("like_url (Does the word resemble a URL?):", "like_url"),
    ("like_num (Does the word represent a number? ):", "like_num"),
    ("like_email (Does the word resemble an email address?):", "like_email"),
    ("is_oov (Is the word out-of-vocabulary?):", "is_oov"),
    ("is_stop (Is the word part of a stop list?):", "is_stop"),
    ("pos_ (Coarse-grained part-of-speech.):", "pos_"),
    ("tag_ (Fine-grained part-of-speech.):", "tag_"),
    ("dep_ (Syntactic dependency relation.):", "dep_"),
    ("lang_ (Language of the parent document's vocabulary.):", "lang_"),
    ("prob: (Smoothed log probability estimate of token's type.)", "prob"),
    ("idx (The character offset of the token within the parent document.):", "idx"),
    ("sentiment (A scalar value indicating the positivity or negativity of the token):", "sentiment"),
    ("lex_id (ID of the token's lexical type.):", "lex_id"),
    ("text (Verbatim text content.):", "text"),
    ("text_with_ws (Text content, with trailing space character if present.):", "text_with_ws"),
    ("whitespace_ (Trailing space character if present.):", "whitespace_"),
]

print("token:", t)
# Attributes are read one at a time, in the listed order, right before printing
for label, attr_name in attribute_labels:
    print(label, getattr(t, attr_name))

token: Sakamoto
vocab (The vocab object of the parent Doc): <spacy.vocab.Vocab object at 0x10f48bf48>
doc (The parent document.): Mr.Sakamoto told us the Dragon Fruits was very yummy!
i (The index of the token within the parent document.): 2
ent_type_ (Named entity type.): 
ent_iob_ (IOB code of named entity tag): O
ent_id_ (ID of the entity the token is an instance of): 
lemma_ (Base form of the word, with no inflectional suffixes.): sakamoto
lower_ (Lower-case form of the word.): sakamoto
shape_ (A transform of the word's string, to show orthographic features.): Xxxxx
prefix_ (Integer ID of a length-N substring from the start of the word): S
suffix_ (Length-N substring from the end of the word): oto
like_url (Does the word resemble a URL?): False
like_num (Does the word represent a number? ): False
like_email (Does the word resemble an email address?): False
is_oov (Is the word out-of-vocabulary?): True
is_stop (Is the word part of a stop list?): False
pos_ (Coarse-grained part-of-sp

## 2. 係り受け解析
spaCyの係り受け解析を行いビジュアライゼーションするためのデモサイトが存在する  
displaCy (https://demos.explosion.ai/displacy/)  
<br>
例えば、'I like chicken rice and Laksa.'という文章に対し、次のような描画が行われる

<img src="../img/spacy_dependency01.png">

ここでは同様な係り受け解析をテキストベースで行ってみる

In [8]:
# For every noun chunk, root.dep_ is the role/relation of the chunk's root token
# and root.head.text is the text of the head that the root token depends on
doc_dep = nlp(u'I like chicken rice and Laksa.')
for chunk in doc_dep.noun_chunks:
    root = chunk.root
    print((chunk.text, root.text, root.dep_, root.head.text))

('I', 'I', 'nsubj', 'like')
('chicken rice', 'rice', 'dobj', 'like')
('Laksa', 'Laksa', 'conj', 'rice')


In [9]:
# The role/relation can be shown for every token, not only for the nouns
from pprint import pprint

# One row per token: text, fine-grained tag, dependency label, number of
# left/right children, head text, left children texts, right children texts
doc_dep_list = [
    (
        tok.text,
        tok.tag_,
        tok.dep_,
        tok.n_lefts,
        tok.n_rights,
        tok.head.orth_,
        [child.orth_ for child in tok.lefts],
        [child.orth_ for child in tok.rights],
    )
    for tok in doc_dep
]

pprint(doc_dep_list)

[('I', 'PRP', 'nsubj', 0, 0, 'like', [], []),
 ('like', 'VBP', 'ROOT', 1, 2, 'like', ['I'], ['rice', '.']),
 ('chicken', 'NN', 'compound', 0, 0, 'rice', [], []),
 ('rice', 'NN', 'dobj', 1, 2, 'like', ['chicken'], ['and', 'Laksa']),
 ('and', 'CC', 'cc', 0, 0, 'rice', [], []),
 ('Laksa', 'NNP', 'conj', 0, 0, 'rice', [], []),
 ('.', '.', 'punct', 0, 0, 'like', [], [])]


In [10]:
# Show the same rows as a pandas table
import pandas as pd

# Column headers, in the same order as the fields of each row tuple
column_names = [
    'text',
    'tag_',
    'dep_',
    'n_lefts',
    'n_rights',
    'head.orth_',
    'orth_ in token.lefts',
    'orth_ in token.rights',
]
df = pd.DataFrame(doc_dep_list)
df.columns = column_names
df

Unnamed: 0,text,tag_,dep_,n_lefts,n_rights,head.orth_,orth_ in token.lefts,orth_ in token.rights
0,I,PRP,nsubj,0,0,like,[],[]
1,like,VBP,ROOT,1,2,like,[I],"[rice, .]"
2,chicken,NN,compound,0,0,rice,[],[]
3,rice,NN,dobj,1,2,like,[chicken],"[and, Laksa]"
4,and,CC,cc,0,0,rice,[],[]
5,Laksa,NNP,conj,0,0,rice,[],[]
6,.,.,punct,0,0,like,[],[]


In [11]:
# Draw the relations with arrows: this template places the left children on the
# left, the word with its dependency label in the middle and the right children
# on the right, followed by a separator line
dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'

In [12]:
# Fill the arrow template once per token and print it
for tok in doc_dep:
    left_children = [child.orth_ for child in tok.lefts]
    right_children = [child.orth_ for child in tok.rights]
    arrow_line = dependency_pattern.format(
        left=left_children,
        word=tok.orth_,
        w_type=tok.dep_,
        right=right_children,
    )
    print(arrow_line)

[]<---I[nsubj]--->[]
--------
['I']<---like[ROOT]--->['rice', '.']
--------
[]<---chicken[compound]--->[]
--------
['chicken']<---rice[dobj]--->['and', 'Laksa']
--------
[]<---and[cc]--->[]
--------
[]<---Laksa[conj]--->[]
--------
[]<---.[punct]--->[]
--------


## 3. Word Vectorを用いた単語の類似度取得

In [13]:
# Load the word vectors (small size) by loading the 'en_core_web_sm' model
# NOTE(review): confirm in the spaCy model documentation what the small model
# actually provides as vectors, since similarity() below depends on it
nlp = spacy.load('en_core_web_sm')

In [14]:
# Build the pairwise similarity matrix for the three place names
tokens = nlp(u'Singapore Japan Tokyo')
similarities = []
for token1 in tokens:
    sim_row = []
    for token2 in tokens:
        # Compute each pair's similarity once and reuse the value for both the
        # printed line and the matrix entry (it was previously computed twice)
        score = token1.similarity(token2)
        print([token1.text, token2.text, score])
        sim_row.append(score)
    similarities.append(sim_row)
print(similarities)

['Singapore', 'Singapore', 1.0]
['Singapore', 'Japan', 0.69249845]
['Singapore', 'Tokyo', 0.48649472]
['Japan', 'Singapore', 0.69249845]
['Japan', 'Japan', 1.0000001]
['Japan', 'Tokyo', 0.57047987]
['Tokyo', 'Singapore', 0.48649472]
['Tokyo', 'Japan', 0.57047987]
['Tokyo', 'Tokyo', 1.0]
[[1.0, 0.69249845, 0.48649472], [0.69249845, 1.0000001, 0.57047987], [0.48649472, 0.57047987, 1.0]]


In [15]:
import pandas as pd

# Label rows and columns with the token texts (plain strings) instead of the
# spaCy objects themselves, so the table can be indexed by ordinary strings
# such as df.loc['Japan', 'Tokyo']
labels = [token.text for token in tokens]
df = pd.DataFrame(similarities, index=labels, columns=labels)
df

Unnamed: 0,Singapore,Japan,Tokyo
Singapore,1.0,0.692498,0.486495
Japan,0.692498,1.0,0.57048
Tokyo,0.486495,0.57048,1.0


## 4. エンティティ認識

In [16]:
# Print the entity type assigned to every token of a sample news sentence
example_sent = "NTUC has raised S$25 million to help workers re-skill and upgrade their skills, secretary-general Chan Chun Sing said at the May Day Rally on Monday "
parsed = nlp(example_sent)
for tok in parsed:
    # An empty ent_type_ string is shown as a readable placeholder instead
    entity_label = tok.ent_type_ or "(not an entity)"
    print((tok.orth_, entity_label))

('NTUC', 'ORG')
('has', '(not an entity)')
('raised', '(not an entity)')
('S$25', 'CARDINAL')
('million', 'CARDINAL')
('to', '(not an entity)')
('help', '(not an entity)')
('workers', '(not an entity)')
('re', '(not an entity)')
('-', '(not an entity)')
('skill', '(not an entity)')
('and', '(not an entity)')
('upgrade', '(not an entity)')
('their', '(not an entity)')
('skills', '(not an entity)')
(',', '(not an entity)')
('secretary', '(not an entity)')
('-', '(not an entity)')
('general', '(not an entity)')
('Chan', 'PERSON')
('Chun', 'PERSON')
('Sing', 'PERSON')
('said', '(not an entity)')
('at', '(not an entity)')
('the', 'DATE')
('May', 'DATE')
('Day', 'DATE')
('Rally', 'DATE')
('on', 'DATE')
('Monday', 'DATE')


Visualization using displaCy Named Entity Visualizer (https://demos.explosion.ai/displacy-ent/)
<img src="../img/spacy_ner01.png">

### エンティティタイプ一覧 
https://spacy.io/docs/usage/entity-recognition

<table class="c-table o-block"><tr class="c-table__row"><th class="c-table__head-cell u-text-label">Type</th><th class="c-table__head-cell u-text-label">Description</th></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>PERSON</code></td><td class="c-table__cell u-text">People, including fictional.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>NORP</code></td><td class="c-table__cell u-text">Nationalities or religious or political groups.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>FACILITY</code></td><td class="c-table__cell u-text">Buildings, airports, highways, bridges, etc.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>ORG</code></td><td class="c-table__cell u-text">Companies, agencies, institutions, etc.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>GPE</code></td><td class="c-table__cell u-text">Countries, cities, states.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>LOC</code></td><td class="c-table__cell u-text">Non-GPE locations, mountain ranges, bodies of water.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>PRODUCT</code></td><td class="c-table__cell u-text">Objects, vehicles, foods, etc. (Not services.)</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>EVENT</code></td><td class="c-table__cell u-text">Named hurricanes, battles, wars, sports events, etc.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>WORK_OF_ART</code></td><td class="c-table__cell u-text">Titles of books, songs, etc.</td></tr><tr class="c-table__row"><td class="c-table__cell u-text"><code>LANGUAGE</code></td><td class="c-table__cell u-text">Any named language.</td></tr></table>

## エンティティ認識モデルの作成

In [17]:
import random
from pathlib import Path
import spacy

In [18]:
# Training data: (text, annotations) pairs. Each entry of 'entities' is
# (start character offset, end character offset, label); e.g. characters 7-18
# of the first text are 'Daphne Khoo'
TRAIN_DATA = [
    ('Who is Daphne Khoo?', {'entities': [(7, 18, 'PERSON')]}),
    ('I like Bangkok and Buangkok.', {'entities': [(7, 14, 'LOC'), (19, 27, 'LOC')]})
]

In [19]:
# Set up the model: start from a blank English Language object rather than
# loading a pretrained model
nlp = spacy.blank('en')  # create blank Language class
print("Created blank 'en' model")

Created blank 'en' model


In [20]:
# Build the pipeline: make sure it contains an 'ner' component and keep a
# reference to that component in `ner`
if 'ner' in nlp.pipe_names:
    # already present - fetch it so that labels can be added to it
    ner = nlp.get_pipe('ner')
else:
    # nlp.create_pipe works for built-ins that are registered with spaCy
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

In [21]:
# Add every label used in the training data to the 'ner' pipeline component
for _, annotations in TRAIN_DATA:
    # Default to an empty tuple: an example without an 'entities' key is then
    # skipped instead of raising TypeError when iterating over None
    for ent in annotations.get('entities', ()):
        ner.add_label(ent[2])  # ent is (start, end, label)

In [22]:
# Start training while every pipeline component other than 'ner' is
# temporarily disabled
n_iter = 100
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        # visit the examples in a fresh random order on every pass
        random.shuffle(TRAIN_DATA)
        losses = {}  # filled in by nlp.update and printed after each pass
        for text, annotations in TRAIN_DATA:
            # Positional arguments are a one-item batch of texts and the
            # matching batch of annotations; drop is the dropout rate (makes it
            # harder to memorise the data) and sgd the callable that updates
            # the weights
            nlp.update([text], [annotations], drop=0.5, sgd=optimizer, losses=losses)
        print(losses)

{'ner': 13.175836280376352}
{'ner': 12.667925622750635}
{'ner': 9.95857561635717}
{'ner': 6.250982132274663}
{'ner': 10.797445607903985}
{'ner': 6.29055441384935}
{'ner': 8.415489806663096}
{'ner': 8.65286132128112}
{'ner': 7.9207013388083904}
{'ner': 5.176604208581704}
{'ner': 2.0163456532285933}
{'ner': 1.6894788717259888}
{'ner': 0.0035130279810733087}
{'ner': 1.0005834656321062}
{'ner': 3.4471729101848494}
{'ner': 3.0742444012545636}
{'ner': 1.8941273111148758}
{'ner': 3.0038413572032066}
{'ner': 1.3641084270238104}
{'ner': 2.0024618038231115}
{'ner': 1.8680192646406215}
{'ner': 3.708134078557032}
{'ner': 3.9812688870266085e-05}
{'ner': 1.9123445341356675}
{'ner': 5.623626587195949e-09}
{'ner': 0.171074504659794}
{'ner': 4.9556581891946755e-05}
{'ner': 0.00027758975395631433}
{'ner': 0.6852870714143676}
{'ner': 7.23222076358214e-16}
{'ner': 1.581942049117866e-08}
{'ner': 0.08680034869202281}
{'ner': 0.655540899188921}
{'ner': 2.4608986337185897e-08}
{'ner': 1.5703130161498047e-18}


In [23]:
# Test: run the trained model on the training texts and show what it predicts.
# In the recorded output ent_iob is 3 on the first token of an entity, 1 on a
# following token of the same entity and 2 on tokens outside any entity.
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    found_entities = [(span.text, span.label_) for span in doc.ents]
    token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in doc]
    print('Entities', found_entities)
    print('Tokens', token_rows)

Entities [('Daphne Khoo', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Daphne', 'PERSON', 3), ('Khoo', 'PERSON', 1), ('?', '', 2)]
Entities [('Bangkok', 'LOC'), ('Buangkok', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('Bangkok', 'LOC', 3), ('and', '', 2), ('Buangkok', 'LOC', 3), ('.', '', 2)]


In [24]:
# Save the model to the output directory
output_dir = Path('./sample_ner/')
# Create the directory in a single call: exist_ok=True replaces the separate
# exists() check (no window between checking and creating) and parents=True
# also allows a nested output path
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to sample_ner


In [25]:
# Test once more, this time with the model loaded back from disk
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for text, _ in TRAIN_DATA:
    doc = nlp2(text)
    found_entities = [(span.text, span.label_) for span in doc.ents]
    token_rows = [(tok.text, tok.ent_type_, tok.ent_iob) for tok in doc]
    print('Entities', found_entities)
    print('Tokens', token_rows)

Loading from sample_ner
Entities [('Daphne Khoo', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Daphne', 'PERSON', 3), ('Khoo', 'PERSON', 1), ('?', '', 2)]
Entities [('Bangkok', 'LOC'), ('Buangkok', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('Bangkok', 'LOC', 3), ('and', '', 2), ('Buangkok', 'LOC', 3), ('.', '', 2)]


## Reference
https://spacy.io/<br>
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html<br>
https://spacy.io/docs/usage/pos-tagging<br>
https://spacy.io/usage/training <br>

[Installation]  
pip install spacy  
python -m spacy download en  
python -m spacy download en_core_web_sm
