# Intro Models

## First Execution

In [8]:
import stanza

# Build the English pipeline with its default set of processors.
nlp = stanza.Pipeline('en')

# Annotate a single example sentence.
doc = nlp('Barack Obama was born in Hawaii.')

# Walk every word of every sentence and print one line per word:
# surface text, lemma, and part-of-speech tag, separated by spaces.
for sentence in doc.sentences:
    for word in sentence.words:
        print(f"{word.text} {word.lemma} {word.pos}")


2023-11-28 12:09:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-28 12:09:05 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-11-28 12:09:05 INFO: Using device: cpu
2023-11-28 12:09:05 INFO: Loading: tokenize
2023-11-28 12:09:05 INFO: Loading: pos
2023-11-28 12:09:06 INFO: Loading: lemma
2023-11-28 12:09:06 INFO: Loading: constituency
2023-11-28 12:09:06 INFO: Loading: depparse
2023-11-28 12:09:06 INFO: Loading: sentiment
2023-11-28 12:09:07 INFO: Loading: ner
2023-11-28 12:09:07 INFO: Done loading processors!


Barack Barack PROPN
Obama Obama PROPN
was be AUX
born bear VERB
in in ADP
Hawaii Hawaii PROPN
. . PUNCT


# Redo: Pipeline Restricted to Tokenize + POS

In [6]:
import stanza

# Build the pipeline with only the tokenize and pos processors, and set the
# part-of-speech processor's batch size via pos_batch_size.
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,pos',
    use_gpu=True,
    pos_batch_size=3000,
)

# Run the pipeline on the input text.
doc = nlp("Barack Obama was born in Hawaii.")

# Look at the result: printing the document dumps its per-word annotations.
print(doc)

2023-11-28 12:02:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-28 12:02:41 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2023-11-28 12:02:41 INFO: Using device: cpu
2023-11-28 12:02:41 INFO: Loading: tokenize
2023-11-28 12:02:41 INFO: Loading: pos
2023-11-28 12:02:41 INFO: Done loading processors!


[
  [
    {
      "id": 1,
      "text": "Barack",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "start_char": 0,
      "end_char": 6
    },
    {
      "id": 2,
      "text": "Obama",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
      "start_char": 7,
      "end_char": 12
    },
    {
      "id": 3,
      "text": "was",
      "upos": "AUX",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "start_char": 13,
      "end_char": 16
    },
    {
      "id": 4,
      "text": "born",
      "upos": "VERB",
      "xpos": "VBN",
      "feats": "Tense=Past|VerbForm=Part|Voice=Pass",
      "start_char": 17,
      "end_char": 21
    },
    {
      "id": 5,
      "text": "in",
      "upos": "ADP",
      "xpos": "IN",
      "start_char": 22,
      "end_char": 24
    },
    {
      "id": 6,
      "text": "Hawaii",
      "upos": "PROPN",
      "xpos": "NNP",
      "feats": "Number=Sing",
   

# Multiple Documents

In [7]:
import stanza

# Initialize the default English pipeline.
nlp = stanza.Pipeline(lang="en")

# Raw texts that we are going to process.
documents = [
    "This is a test document.",
    "I wrote another document for fun.",
]

# Wrap each raw text in a stanza.Document object so the pipeline can take
# the whole list in a single call.
in_docs = [
    stanza.Document([], text=raw_text)
    for raw_text in documents
]

# Call the neural pipeline on the list of Document objects.
out_docs = nlp(in_docs)

# The output is also a list of stanza.Document objects, one per input
# Document; show the annotations of the second one.
print(out_docs[1])

2023-11-28 12:04:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-28 12:04:35 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-11-28 12:04:35 INFO: Using device: cpu
2023-11-28 12:04:35 INFO: Loading: tokenize
2023-11-28 12:04:35 INFO: Loading: pos
2023-11-28 12:04:36 INFO: Loading: lemma
2023-11-28 12:04:36 INFO: Loading: constituency
2023-11-28 12:04:36 INFO: Loading: depparse
2023-11-28 12:04:37 INFO: Loading: sentiment
2023-11-28 12:04:37 INFO: Loading: ner
2023-11-28 12:04:37 INFO: Done loading processors!


[
  [
    {
      "id": 1,
      "text": "I",
      "lemma": "I",
      "upos": "PRON",
      "xpos": "PRP",
      "feats": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
      "head": 2,
      "deprel": "nsubj",
      "start_char": 0,
      "end_char": 1,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "wrote",
      "lemma": "write",
      "upos": "VERB",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
      "head": 0,
      "deprel": "root",
      "start_char": 2,
      "end_char": 7,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "another",
      "lemma": "another",
      "upos": "DET",
      "xpos": "DT",
      "head": 4,
      "deprel": "det",
      "start_char": 8,
      "end_char": 15,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "document",
      "lemma": "document",
      

## Bulk Processing

In [10]:
import stanza

# Initialize the default English pipeline.
nlp = stanza.Pipeline(lang="en")

# Raw texts that we are going to process.
documents = [
    "This is a test document.",
    "I wrote another document for fun.",
]

# Hand the raw strings directly to bulk_process; unlike the explicit
# stanza.Document wrapping approach, no manual wrapping is done here.
out_docs = nlp.bulk_process(documents)

# The result is a list with one annotated document per input string;
# show the annotations of the second one.
print(out_docs[1])

2023-11-28 12:10:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-28 12:10:20 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-11-28 12:10:20 INFO: Using device: cpu
2023-11-28 12:10:20 INFO: Loading: tokenize
2023-11-28 12:10:20 INFO: Loading: pos
2023-11-28 12:10:20 INFO: Loading: lemma
2023-11-28 12:10:20 INFO: Loading: constituency
2023-11-28 12:10:21 INFO: Loading: depparse
2023-11-28 12:10:21 INFO: Loading: sentiment
2023-11-28 12:10:21 INFO: Loading: ner
2023-11-28 12:10:22 INFO: Done loading processors!


[
  [
    {
      "id": 1,
      "text": "I",
      "lemma": "I",
      "upos": "PRON",
      "xpos": "PRP",
      "feats": "Case=Nom|Number=Sing|Person=1|PronType=Prs",
      "head": 2,
      "deprel": "nsubj",
      "start_char": 0,
      "end_char": 1,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "wrote",
      "lemma": "write",
      "upos": "VERB",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin",
      "head": 0,
      "deprel": "root",
      "start_char": 2,
      "end_char": 7,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "another",
      "lemma": "another",
      "upos": "DET",
      "xpos": "DT",
      "head": 4,
      "deprel": "det",
      "start_char": 8,
      "end_char": 15,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "document",
      "lemma": "document",
      