In [1]:
#==============================================================================
# CellStrat Hub Pack - Natural Language Processing
# Compatible tier : Free Tier or above  
# Kernel : conda_pytorch_latest_p36
#==============================================================================

In [2]:
#=====================================================================================================
# In traditional grammar, a part of speech or part-of-speech (abbreviated as POS or PoS)
# is a category of words (or, more generally, of lexical items) that have similar grammatical properties.
# Words that are assigned to the same part of speech generally display similar syntactic behavior
# (they play similar roles within the grammatical structure of sentences) and sometimes similar
# morphology, in that they undergo inflection for similar properties.
#======================================================================================================

In [3]:
#==============================================================================
#Stanza is a collection of accurate and efficient tools for the linguistic analysis
# of many human languages.
#Starting from raw text to syntactic analysis and entity recognition, 
# Stanza brings state-of-the-art NLP models to languages of your choosing.
#==============================================================================

In [4]:
# -*- coding: utf-8 -*-

#==============================================================================
# Install Stanza
#==============================================================================
!pip install stanza

Collecting stanza
  Downloading stanza-1.2.1-py3-none-any.whl (334 kB)
[K     |████████████████████████████████| 334 kB 24.8 MB/s eta 0:00:01
Collecting tqdm
  Downloading tqdm-4.61.1-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 6.9 MB/s  eta 0:00:01
Installing collected packages: tqdm, stanza
Successfully installed stanza-1.2.1 tqdm-4.61.1


In [5]:
#==============================================================================
# Import Stanza and fetch the default English model package.
# This is a network download; the models are stored on local disk.
#==============================================================================
import stanza

stanza.download(lang="en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.1.json:   0%|   …

2021-06-29 14:26:57 INFO: Downloading default packages for language: en (English)...


Downloading http://nlp.stanford.edu/software/stanza/1.2.1/en/default.zip:   0%|          | 0.00/412M [00:00<?,…

2021-06-29 14:28:25 INFO: Finished downloading models and saved to /home/ec2-user/stanza_resources.


In [6]:
#==============================================================================
# The dependency parsing module builds a tree structure of words from the input
# sentence, which represents the syntactic dependency relations between words.
# In Stanza, dependency parsing is performed by the DepparseProcessor;
# it can be invoked with the name depparse.
# (Note: the pipeline built in this notebook only requests tokenize, mwt and pos;
# depparse is not run here.)
#==============================================================================


In [7]:
#=======================================================================================
# Build the English part-of-speech tagging pipeline.
# The POS processor runs on top of tokenization (and multi-word token expansion,
# where a language provides it), so those processors are requested alongside it.
# Once a text has been run through the pipeline, the resulting Document holds a
# list of Sentences, each holding a list of Words; every Word exposes its tags
# through the upos and xpos fields.
# NOTE: in the recorded run the loading table lists only tokenize and pos for 'en'.
#=======================================================================================
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,mwt,pos",
)


2021-06-29 14:28:56 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2021-06-29 14:28:57 INFO: Use device: cpu
2021-06-29 14:28:57 INFO: Loading: tokenize
2021-06-29 14:28:57 INFO: Loading: pos
2021-06-29 14:28:58 INFO: Done loading processors!


In [8]:
#==============================================================================================
# Tag two example sentences and print one line per word with its universal POS tag (upos),
# treebank-specific tag (xpos) and morphological features (feats, "_" when empty).
# In the result, Stanza's analysis marks the word "was" as a third-person auxiliary verb
# in the past tense.
#==============================================================================================
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

# Build the lines lazily, one per word across all sentences, then emit them in a single print.
pos_lines = (
    f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats or "_"}'
    for sent in doc.sentences
    for word in sent.words
)
print('\n'.join(pos_lines))


word: Barack	upos: PROPN	xpos: NNP	feats: Number=Sing
word: Obama	upos: PROPN	xpos: NNP	feats: Number=Sing
word: was	upos: AUX	xpos: VBD	feats: Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
word: born	upos: VERB	xpos: VBN	feats: Tense=Past|VerbForm=Part|Voice=Pass
word: in	upos: ADP	xpos: IN	feats: _
word: Hawaii	upos: PROPN	xpos: NNP	feats: Number=Sing
word: .	upos: PUNCT	xpos: .	feats: _
word: He	upos: PRON	xpos: PRP	feats: Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs
word: was	upos: AUX	xpos: VBD	feats: Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
word: elected	upos: VERB	xpos: VBN	feats: Tense=Past|VerbForm=Part|Voice=Pass
word: president	upos: NOUN	xpos: NN	feats: Number=Sing
word: in	upos: ADP	xpos: IN	feats: _
word: 2008	upos: NUM	xpos: CD	feats: NumForm=Digit|NumType=Card
word: .	upos: PUNCT	xpos: .	feats: _
