In [2]:
import stanza

# Fetch the default English model package; in the run recorded below this was a
# ~412 MB archive saved under the user's stanza_resources directory.
stanza.download(lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json:   0%|   …

2021-08-21 15:27:28 INFO: Downloading default packages for language: en (English)...


Downloading http://nlp.stanford.edu/software/stanza/1.2.2/en/default.zip:   0%|          | 0.00/412M [00:00<?,…

2021-08-21 15:31:04 INFO: Finished downloading models and saved to C:\Users\luoyan011\stanza_resources.


In [3]:
# Build the default English pipeline. The load log below lists the processors
# it brings up: tokenize, pos, lemma, depparse, sentiment and ner.
nlp = stanza.Pipeline(lang='en')

2021-08-21 15:31:04 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-08-21 15:31:04 INFO: Use device: cpu
2021-08-21 15:31:04 INFO: Loading: tokenize
2021-08-21 15:31:04 INFO: Loading: pos
2021-08-21 15:31:05 INFO: Loading: lemma
2021-08-21 15:31:05 INFO: Loading: depparse
2021-08-21 15:31:05 INFO: Loading: sentiment
2021-08-21 15:31:06 INFO: Loading: ner
2021-08-21 15:31:07 INFO: Done loading processors!


In [4]:
# Run the pipeline held in `nlp` over a single example sentence.
doc = nlp('Barack Obama was born in Hawaii')

In [5]:
# Dump the annotated document first, then its `entities` on the following line.
print(doc, doc.entities, sep='\n')

[
  [
    {
      "id": 1,
      "text": "Barack",
      "lemma": "Barack",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 4,
      "deprel": "nsubj:pass",
      "start_char": 0,
      "end_char": 6,
      "ner": "B-PERSON"
    },
    {
      "id": 2,
      "text": "Obama",
      "lemma": "Obama",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "head": 1,
      "deprel": "flat",
      "start_char": 7,
      "end_char": 12,
      "ner": "E-PERSON"
    },
    {
      "id": 3,
      "text": "was",
      "lemma": "be",
      "upos": "AUX",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "head": 4,
      "deprel": "aux:pass",
      "start_char": 13,
      "end_char": 16,
      "ner": "O"
    },
    {
      "id": 4,
      "text": "born",
      "lemma": "bear",
      "upos": "VERB",
      "xpos": "VBN",
      "feats": "Tense=Past|VerbForm=Part|Voice=Pass",
      "head": 

## Dependency Parsing

In [6]:
# Pipeline limited to the processors needed for dependency parsing.
# NOTE(review): 'mwt' is requested, but the load log below lists no mwt model
# for English in this run — confirm whether it can be dropped.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma,depparse',
)
doc = nlp('Barack Obama was born in Hawaii.')

# One tab-separated line per word. `word.head` is a 1-based index into
# `sent.words`, with 0 standing for the sentence root, hence the `- 1` lookup.
print(
    '\n'.join(
        f'id: {word.id}\tword: {word.text}\thead id: {word.head}\t'
        f'head: {sent.words[word.head-1].text if word.head > 0 else "root"}\t'
        f'deprel: {word.deprel}'
        for sent in doc.sentences
        for word in sent.words
    )
)

2021-08-21 15:40:39 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2021-08-21 15:40:39 INFO: Use device: cpu
2021-08-21 15:40:39 INFO: Loading: tokenize
2021-08-21 15:40:39 INFO: Loading: pos
2021-08-21 15:40:39 INFO: Loading: lemma
2021-08-21 15:40:39 INFO: Loading: depparse
2021-08-21 15:40:40 INFO: Done loading processors!


id: 1	word: Barack	head id: 4	head: born	deprel: nsubj:pass
id: 2	word: Obama	head id: 1	head: Barack	deprel: flat
id: 3	word: was	head id: 4	head: born	deprel: aux:pass
id: 4	word: born	head id: 0	head: root	deprel: root
id: 5	word: in	head id: 6	head: Hawaii	deprel: case
id: 6	word: Hawaii	head id: 4	head: born	deprel: obl
id: 7	word: .	head id: 4	head: born	deprel: punct


In [8]:
from stanza.models.common.doc import Document

# Only the dependency parser is loaded here; the hand-built document below
# already carries lemma / upos / xpos / feats for each word.
# NOTE(review): `depparse_pretagged=True` presumably tells the parser to use
# those supplied tags instead of requiring upstream processors — confirm
# against the Stanza depparse documentation.
nlp = stanza.Pipeline(
    lang='en',
    processors='depparse',
    depparse_pretagged=True,
)

# A single sentence (inner list) of three pre-annotated words.
pretagged_doc = Document([
    [
        {'id': 1, 'text': 'Test', 'lemma': 'Test', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing'},
        {'id': 2, 'text': 'sentence', 'lemma': 'sentence', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing'},
        {'id': 3, 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'xpos': '.'},
    ],
])

doc = nlp(pretagged_doc)
print(doc)

2021-08-21 15:42:00 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| depparse  | combined |

2021-08-21 15:42:00 INFO: Use device: cpu
2021-08-21 15:42:00 INFO: Loading: depparse
2021-08-21 15:42:01 INFO: Done loading processors!


[
  [
    {
      "id": 1,
      "text": "Test",
      "lemma": "Test",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "head": 2,
      "deprel": "compound"
    },
    {
      "id": 2,
      "text": "sentence",
      "lemma": "sentence",
      "upos": "NOUN",
      "xpos": "NN",
      "feats": "Number=Sing",
      "head": 0,
      "deprel": "root"
    },
    {
      "id": 3,
      "text": ".",
      "lemma": ".",
      "upos": "PUNCT",
      "xpos": ".",
      "head": 2,
      "deprel": "punct"
    }
  ]
]


## Named Entity Recognition

In [11]:
# Pipeline limited to tokenization and named-entity recognition.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,ner',
)
doc = nlp("Chris Manning teaches at Stanford University. He lives in the Bay Area.")

# Entity-level view: one line per entity in `doc.ents`, with its text and type.
print('\n'.join(f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents))
print('=========')
# Token-level view: every token of every sentence with its `ner` tag.
print(
    '\n'.join(
        f'token: {token.text}\tner: {token.ner}'
        for sent in doc.sentences
        for token in sent.tokens
    )
)

2021-08-21 15:46:28 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-08-21 15:46:28 INFO: Use device: cpu
2021-08-21 15:46:28 INFO: Loading: tokenize
2021-08-21 15:46:28 INFO: Loading: ner
2021-08-21 15:46:29 INFO: Done loading processors!


entity: Chris Manning	type: PERSON
entity: Stanford University	type: ORG
entity: the Bay Area	type: LOC
=========
token: Chris	ner: B-PERSON
token: Manning	ner: E-PERSON
token: teaches	ner: O
token: at	ner: O
token: Stanford	ner: B-ORG
token: University	ner: E-ORG
token: .	ner: O
token: He	ner: O
token: lives	ner: O
token: in	ner: O
token: the	ner: B-LOC
token: Bay	ner: I-LOC
token: Area	ner: E-LOC
token: .	ner: O
