In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [4]:
#here we print out the text in doc
print(doc.text)

The quick brown fox jumped over the lazy dog's back.


In [5]:
#if  I want to pick a particular token position like the word jumped indexing then i do 
#i can also call various part of this token for example i can grab the SPEECH TAG with (doc[4].pos_) wich give out VERB
#or if you want the Fine_grained PArt-of-speech Tags as determined by morphology i type (doc[4].tag_)wich give out VBD
#so VERB + VBD  means verb and past tense. so that means that the word "jumped" is Past Tense.
#if you leave away the underscore it shows then always the numerical ID ( 1710900183.....) for that particular tag.
print(doc[4].tag_)

VBD


In [6]:
#now i can make a little for loop and by saying for token an document
#print out all of this little table with all the information using f string literals. so i can say now :
#print out the token text, token.text , print out token part of speech, token.pos, print out token tag, token.tag 
#print out an explanation, spacy.explain(token.tag_)
#
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag:{10}} {spacy.explain(token.tag_)}")

The        DET        15267657372422890137 determiner
quick      ADJ        10554686591937588953 adjective
brown      ADJ        10554686591937588953 adjective
fox        NOUN       15308085513773655218 noun, singular or mass
jumped     VERB       17109001835818727656 verb, past tense
over       ADP        1292078113972184607 conjunction, subordinating or preposition
the        DET        15267657372422890137 determiner
lazy       ADJ        10554686591937588953 adjective
dog        NOUN       15308085513773655218 noun, singular or mass
's         PART               74 possessive ending
back       NOUN       15308085513773655218 noun, singular or mass
.          PUNCT      12646065887601541794 punctuation mark, sentence closer


so i just made the coarse green part of speech tags as well as the fine grained part of speech


Now in the English language as we mention that same string of characters can have different meanings.


In [7]:
doc = nlp(u"I am a teacher.")

In [8]:
word = doc[1]

In [9]:
word.text

'am'

In [10]:
#Here it shows that the word am is present tense
word = doc[1]
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag:{10}} {spacy.explain(token.tag_)}")

am         VERB       9188597074677201817 verb, non-3rd person singular present


In [11]:
doc = nlp(u"I was a teacher.")

In [12]:
#here it recognizes that the word was is past tense
word = doc[1]
token = word
print(f"{token.text:{10}} {token.pos_:{10}} {token.tag:{10}} {spacy.explain(token.tag_)}")

was        VERB       17109001835818727656 verb, past tense


In [13]:
#so lets do actuall count parts of speech tags. So the document object has a count by a method that accepts a specific 
#token attribute as its argument and then it return a frequency count of the given attribute as a dictionary object.
#here is an example
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

In [14]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [15]:
#thes numbers are actuall part of speech code 90 , 84 etc normaly when we do the underscore example: token.pos_ we will get
# the tag like VERB but without the underscore we will get back the numerical identifier.
#I also see now the counts like 2 , 3, 3, 1 etc..these a re values counts how often it occurs in the sentence
POS_counts

{90: 2, 84: 3, 92: 3, 100: 1, 85: 1, 94: 1, 97: 1}

In [19]:
# So I can always lookup the numerical identifier for.ex. 96, 83, 99, etc simply by calling the documents vocab and then
#passing in whatever number we are interested in, let's say 83 and then ask for the text of that and I'll return back thats
#an adjective again ADJ
doc.vocab[84].text

'ADJ'

In [25]:
#i recall that if we just grab any token off of a document. doc[2].pos_ ADJ, or just doc[2].pos it shows 83 
#thats what a speech of counts doing, its returning back the number and then how many times it showed up
#so there are 3 Adjectives ADJ in this document "the Quick brown fox......."
doc[2].pos

84

NOW IF YOU WANT TO ACTUALLY CREATE A FREQUENCY LIST OF SPEECH TAGS FROM THE ENTIRE DOCUMENT YOU CAN DO THAT BY SIMPLE making a 
FOR LOOP THAT LOOKS SIMILAR LIKE THIS.


In [26]:
#I can say for k,v key value in that we can call sorted sorted() POS counts items wich is essentially a list of tulpes
#i print out with a little formatting here to look nice table we print ut the {k}
for k,v in sorted(POS_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

84. ADJ   3
85. ADP   1
90. DET   2
92. NOUN  3
94. PART  1
97. PUNCT 1
100. VERB  1


Above result gives me the informatino that for example ADJ with the numerical ID 84 occurs 3 times in our Doc sentence.
you can do this also not just for speech tags but for the fine grained tags as well.
this i will show you below


In [27]:
#instead of POS counts we say TAG counts
#and lets create that Tag counts object
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

74. POS   1
1292078113972184607. IN    1
10554686591937588953. JJ    3
12646065887601541794. .     1
15267657372422890137. DT    2
15308085513773655218. NN    3
17109001835818727656. VBD   1


In [28]:
#and then we can do it also for DEP 
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f"{k}. {doc.vocab[k].text:{5}} {v}")

402. amod  3
415. det   2
429. nsubj 1
439. pobj  1
440. poss  1
443. prep  1
445. punct 1
8110129090154140942. case  1
8206900633647566924. ROOT  1
