# Basic Part of Speech Tagging

Imagine that we have a token stream and we are interested in learning more about the structure of the sentence. For example, assume that we want to identify the various parts of speech in a sentence.

In [2]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
# Sample sentence to analyse.
# NOTE: the name `input` shadows Python's builtin input(); it is kept because
# the spaCy cell further down passes `input` to nlp().
input = (
    "Specifically, we reviewed the AN/ASQ‑235 Airborne Mine Neutralization System (AMNS), "
    "Airborne Laser Mine Detection System (ALMDS), "
    "and Coastal Battlefield Reconnaissance and Analysis (COBRA) Block I systems."
)

# Split the sentence into word tokens with the Treebank tokenizer.
treebank_tokenizer = TreebankWordTokenizer()
tokens = treebank_tokenizer.tokenize(input)
print(tokens)

# Pair every token with a part-of-speech tag: a list of (token, tag) tuples.
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

['Specifically', ',', 'we', 'reviewed', 'the', 'AN/ASQ‑235', 'Airborne', 'Mine', 'Neutralization', 'System', '(', 'AMNS', ')', ',', 'Airborne', 'Laser', 'Mine', 'Detection', 'System', '(', 'ALMDS', ')', ',', 'and', 'Coastal', 'Battlefield', 'Reconnaissance', 'and', 'Analysis', '(', 'COBRA', ')', 'Block', 'I', 'systems', '.']
[('Specifically', 'RB'), (',', ','), ('we', 'PRP'), ('reviewed', 'VBD'), ('the', 'DT'), ('AN/ASQ‑235', 'NNP'), ('Airborne', 'NNP'), ('Mine', 'NNP'), ('Neutralization', 'NNP'), ('System', 'NNP'), ('(', '('), ('AMNS', 'NNP'), (')', ')'), (',', ','), ('Airborne', 'NNP'), ('Laser', 'NNP'), ('Mine', 'NNP'), ('Detection', 'NNP'), ('System', 'NNP'), ('(', '('), ('ALMDS', 'NNP'), (')', ')'), (',', ','), ('and', 'CC'), ('Coastal', 'NNP'), ('Battlefield', 'NNP'), ('Reconnaissance', 'NNP'), ('and', 'CC'), ('Analysis', 'NNP'), ('(', '('), ('COBRA', 'NNP'), (')', ')'), ('Block', 'NNP'), ('I', 'PRP'), ('systems', 'NNS'), ('.', '.')]


Is this useful? Can we do anything with the actual tags?

In [3]:
# Show each (token, tag) pair on its own line.
for word, tag in pos_tags:
    print((word, tag))

('Specifically', 'RB')
(',', ',')
('we', 'PRP')
('reviewed', 'VBD')
('the', 'DT')
('AN/ASQ‑235', 'NNP')
('Airborne', 'NNP')
('Mine', 'NNP')
('Neutralization', 'NNP')
('System', 'NNP')
('(', '(')
('AMNS', 'NNP')
(')', ')')
(',', ',')
('Airborne', 'NNP')
('Laser', 'NNP')
('Mine', 'NNP')
('Detection', 'NNP')
('System', 'NNP')
('(', '(')
('ALMDS', 'NNP')
(')', ')')
(',', ',')
('and', 'CC')
('Coastal', 'NNP')
('Battlefield', 'NNP')
('Reconnaissance', 'NNP')
('and', 'CC')
('Analysis', 'NNP')
('(', '(')
('COBRA', 'NNP')
(')', ')')
('Block', 'NNP')
('I', 'PRP')
('systems', 'NNS')
('.', '.')


In [4]:
# Each entry is a (token, tag) pair of strings; print it as "token -- tag".
for tagged_pair in pos_tags:
    print(" -- ".join(tagged_pair))

Specifically -- RB
, -- ,
we -- PRP
reviewed -- VBD
the -- DT
AN/ASQ‑235 -- NNP
Airborne -- NNP
Mine -- NNP
Neutralization -- NNP
System -- NNP
( -- (
AMNS -- NNP
) -- )
, -- ,
Airborne -- NNP
Laser -- NNP
Mine -- NNP
Detection -- NNP
System -- NNP
( -- (
ALMDS -- NNP
) -- )
, -- ,
and -- CC
Coastal -- NNP
Battlefield -- NNP
Reconnaissance -- NNP
and -- CC
Analysis -- NNP
( -- (
COBRA -- NNP
) -- )
Block -- NNP
I -- PRP
systems -- NNS
. -- .


In [5]:
# Keep only the tokens tagged exactly "NNP". Other noun tags are not matched,
# so e.g. 'systems' (tagged NNS above) is left out of this list.
nouns = [word for word, tag in pos_tags if tag == "NNP"]
print(nouns)

['AN/ASQ‑235', 'Airborne', 'Mine', 'Neutralization', 'System', 'AMNS', 'Airborne', 'Laser', 'Mine', 'Detection', 'System', 'ALMDS', 'Coastal', 'Battlefield', 'Reconnaissance', 'Analysis', 'COBRA', 'Block']


# Attempting the same with spaCy

In [6]:
import spacy
# Load the small English pipeline by its full package name. The 'en' shortcut
# only resolves where a shortcut link has been created, and shortcut links
# were removed in spaCy 3, so the explicit name is the portable spelling.
# NOTE(review): assumes 'en' pointed at en_core_web_sm — confirm for this environment.
nlp = spacy.load('en_core_web_sm')

In [7]:
# The crucial difference is that spaCy marks up the whole document in one call
# and then exposes the annotations as properties on each token. One of those
# token properties is the part of speech (`pos_`).
token_stream = nlp(input)
for token in token_stream:
    # Token text on one line, its part-of-speech label on the next.
    print(token, token.pos_, sep="\n")

Specifically
ADV
,
PUNCT
we
PRON
reviewed
VERB
the
DET
AN
PROPN
/
SYM
ASQ‑235
PROPN
Airborne
PROPN
Mine
PROPN
Neutralization
PROPN
System
PROPN
(
PUNCT
AMNS
PROPN
)
PUNCT
,
PUNCT
Airborne
PROPN
Laser
PROPN
Mine
PROPN
Detection
PROPN
System
PROPN
(
PUNCT
ALMDS
PROPN
)
PUNCT
,
PUNCT
and
CCONJ
Coastal
PROPN
Battlefield
PROPN
Reconnaissance
PROPN
and
CCONJ
Analysis
PROPN
(
PUNCT
COBRA
PROPN
)
PUNCT
Block
PROPN
I
PRON
systems
VERB
.
PUNCT
