In [None]:
#==============================================================================
# CellStrat Hub Pack - Natural Language Processing
# Compatible tier : Free Tier or above 
# Kernel : conda_pytorch_latest_p36 
#==============================================================================

In [None]:
#==============================================================================================
# The named entity recognition (NER) module recognizes mention spans of a particular entity type
# (e.g., Person or Organization) in the input sentence.
# NER is widely used in areas such as information extraction and question answering systems.
# In Stanza, NER is performed by the NERProcessor and can be invoked by the name `ner`.
#==============================================================================================

In [None]:
#==================================================================================
# Stanza is a collection of accurate and efficient tools for the linguistic analysis
# of many human languages.
# From raw text through syntactic analysis to entity recognition,
# Stanza brings state-of-the-art NLP models to languages of your choosing.
#==================================================================================

In [1]:
# -*- coding: utf-8 -*-

#==============================================================================
# Install Stanza
#==============================================================================
!pip install stanza

Collecting stanza
  Downloading stanza-1.2.1-py3-none-any.whl (334 kB)
[K     |████████████████████████████████| 334 kB 15.1 MB/s eta 0:00:01
Collecting tqdm
  Downloading tqdm-4.61.1-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 7.6 MB/s  eta 0:00:01
Installing collected packages: tqdm, stanza
Successfully installed stanza-1.2.1 tqdm-4.61.1


In [2]:
#==============================================================================
# Load the Stanza library and fetch the default English model package.
# This is a network download; the log output reports where the models are saved.
#==============================================================================

import stanza

stanza.download(lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.1.json:   0%|   …

2021-06-29 09:27:13 INFO: Downloading default packages for language: en (English)...


Downloading http://nlp.stanford.edu/software/stanza/1.2.1/en/default.zip:   0%|          | 0.00/412M [00:00<?,…

2021-06-29 09:28:35 INFO: Finished downloading models and saved to /home/ec2-user/stanza_resources.


In [3]:
#==============================================================================
# Build the default English neural pipeline.
# No processors are specified, so Stanza loads its default set for English;
# the NER processor is part of that set (see the load log printed by this cell).
#==============================================================================
nlp = stanza.Pipeline(lang='en')



2021-06-21 07:28:38 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-06-21 07:28:38 INFO: Use device: cpu
2021-06-21 07:28:38 INFO: Loading: tokenize
2021-06-21 07:28:39 INFO: Loading: pos
2021-06-21 07:28:39 INFO: Loading: lemma
2021-06-21 07:28:39 INFO: Loading: depparse
2021-06-21 07:28:40 INFO: Loading: sentiment
2021-06-21 07:28:40 INFO: Loading: ner
2021-06-21 07:28:41 INFO: Done loading processors!


In [5]:
#==============================================================================
# Per-token NER tags in BIOES format for a two-sentence example.
# In the output, 'Barack Obama' is tagged as one PERSON span (B-PERSON on
# 'Barack', E-PERSON on 'Obama'), 'Hawaii' as a single-token GPE (S-GPE) and
# '2008' as a single-token DATE (S-DATE); every other token is tagged O.
#==============================================================================
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

# One line per token, walking the sentences in document order.
print(
    '\n'.join(
        f'token: {tok.text}\tner: {tok.ner}'
        for sentence in doc.sentences
        for tok in sentence.tokens
    )
)

token: Barack	ner: B-PERSON
token: Obama	ner: E-PERSON
token: was	ner: O
token: born	ner: O
token: in	ner: O
token: Hawaii	ner: S-GPE
token: .	ner: O
token: He	ner: O
token: was	ner: O
token: elected	ner: O
token: president	ner: O
token: in	ner: O
token: 2008	ner: S-DATE
token: .	ner: O
