In [1]:
# Part-of-speech (POS) tagging - labelling each word in a sentence as a noun, verb, adjective, etc.
# Author: Muhammad Humayun Khan

import spacy

In [9]:
# Load the English pipeline named "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over a sample sentence; the result is kept in `doc`
doc = nlp("The quick brown fox jumps over the lazy dog")

# Display the text of the processed document
doc.text

'The quick brown fox jumps over the lazy dog'

In [10]:
doc[-1].text  # Text of the last token; index -1 counts from the end of the document

'dog'

In [11]:
# Index the document by position; index 2 is the third token ("brown")
doc[2]

brown

In [12]:
# Coarse-grained part-of-speech tag of the token, as a string (here 'ADJ')
doc[2].pos_

'ADJ'

In [13]:

doc[2].tag_  # Fine-grained (detailed) part-of-speech tag, as a string (here 'JJ')

'JJ'

In [14]:
# Look up a short human-readable description of the token's fine-grained tag
spacy.explain(doc[2].tag_)

'adjective (English), other noun-modifier (Chinese)'

In [15]:
# Walk through every token in the document and print, on one line each:
# the token text, its coarse POS tag, its fine-grained tag, and the
# description of that fine-grained tag.
for token in doc:
    tag_description = spacy.explain(token.tag_)
    print(f"{token.text} ------> {token.pos_} {token.tag_} {tag_description}")

The ------> DET DT determiner
quick ------> ADJ JJ adjective (English), other noun-modifier (Chinese)
brown ------> ADJ JJ adjective (English), other noun-modifier (Chinese)
fox ------> NOUN NN noun, singular or mass
jumps ------> VERB VBZ verb, 3rd person singular present
over ------> ADP IN conjunction, subordinating or preposition
the ------> DET DT determiner
lazy ------> ADJ JJ adjective (English), other noun-modifier (Chinese)
dog ------> NOUN NN noun, singular or mass


In [16]:
# displaCy is spaCy's built-in visualizer; use it to draw the analysed sentence
from spacy import displacy

# style="dep" draws the dependency parse: arcs between tokens, with each token's
# part-of-speech tag shown beneath it. jupyter=True is passed so the diagram is
# rendered in the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [18]:
# displaCy's rendering can be customised by passing an `options` dict
options = {
    "distance": 80,    # spacing between words
    "compact": True,   # compact arc layout
    "color": "#fff",   # text colour
    "bg": "#00a65a",   # background colour
}

displacy.render(doc, style="dep", options=options, jupyter=True)

In [3]:
# Next, try removing low-information tokens from a longer passage.
# NOTE: `spacy` is imported and `nlp` is loaded again here so that this cell
# also runs on its own after a kernel restart.
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# The raw text gets its own name so that `welcome_doc` only ever refers to the
# parsed document, never to a plain string. Adjacent string literals inside the
# parentheses are joined by the parser, so no backslash continuations are needed.
welcome_text = (
    "The University of Engineering & Technology Mardan was established in 2018, and is the second Public Sector Engineering university"
    " in Khyber Pakhtunkhwa, with a mission of providing quality education and preparing students to tackle the challenges of the industry."
    " From its inception, UET Mardan has remained dedicated to serving the community by utilizing projects to solve local problems and further the "
    "common good. In a short time, UET Mardan has achieved significant success and continues to innovate through new programs that offer a "
    "contemporary curriculum, research opportunities, student engagement, practical training, faculty development, international collaborations, "
    "progress towards a state-of-the-art campus building, and industry partnerships."
)

# Run the pipeline over the passage; later cells iterate over this document
welcome_doc = nlp(welcome_text)

# Inspect the tagging of the longer passage: for every token print its text,
# coarse POS tag, fine-grained tag, and the description of the fine-grained tag.
for token in welcome_doc:
    tag_description = spacy.explain(token.tag_)
    print(f"{token.text} ------> {token.pos_} {token.tag_} {tag_description}")


The ------> DET DT determiner
University ------> PROPN NNP noun, proper singular
of ------> ADP IN conjunction, subordinating or preposition
Engineering ------> PROPN NNP noun, proper singular
& ------> CCONJ CC conjunction, coordinating
Technology ------> PROPN NNP noun, proper singular
Mardan ------> PROPN NNP noun, proper singular
was ------> AUX VBD verb, past tense
established ------> VERB VBN verb, past participle
in ------> ADP IN conjunction, subordinating or preposition
2018 ------> NUM CD cardinal number
, ------> PUNCT , punctuation mark, comma
and ------> CCONJ CC conjunction, coordinating
is ------> AUX VBZ verb, 3rd person singular present
the ------> DET DT determiner
second ------> ADJ JJ adjective (English), other noun-modifier (Chinese)
Public ------> PROPN NNP noun, proper singular
Sector ------> PROPN NNP noun, proper singular
Engineering ------> PROPN NNP noun, proper singular
university ------> NOUN NN noun, singular or mass
in ------> ADP IN conjunction, subordin

In [4]:
# Keep, in document order, every token whose coarse POS tag is NOT one of:
# punctuation, determiner, coordinating conjunction, adposition, or pronoun.
filtered_tokens = [
    token
    for token in welcome_doc
    if token.pos_ not in ['PUNCT', 'DET', 'CCONJ', 'ADP', 'PRON']
]

In [5]:
# Display every token that survived the filter (the [:] slice shows a shallow copy of the list)
filtered_tokens[:]

[University,
 Engineering,
 Technology,
 Mardan,
 was,
 established,
 2018,
 is,
 second,
 Public,
 Sector,
 Engineering,
 university,
 Khyber,
 Pakhtunkhwa,
 mission,
 providing,
 quality,
 education,
 preparing,
 students,
 to,
 tackle,
 challenges,
 industry,
 inception,
 UET,
 Mardan,
 has,
 remained,
 dedicated,
 serving,
 community,
 utilizing,
 projects,
 to,
 solve,
 local,
 problems,
 further,
 common,
 good,
 short,
 time,
 UET,
 Mardan,
 has,
 achieved,
 significant,
 success,
 continues,
 to,
 innovate,
 new,
 programs,
 offer,
 contemporary,
 curriculum,
 research,
 opportunities,
 student,
 engagement,
 practical,
 training,
 faculty,
 development,
 international,
 collaborations,
 progress,
 state,
 art,
 campus,
 building,
 industry,
 partnerships]