In [1]:
import spacy
# Load the spaCy pipeline once; later cells call `nlp(...)` on their own text.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [2]:


# Sample English paragraph run through the `nlp` pipeline. The resulting
# `doc1` object is what the token-inspection loops and the `count_by` cells
# further down iterate over.
doc1 = nlp('''Khaled Ashraf, an AI engineer from Egypt, has worked on various projects in the field of machine learning. 
He completed the 'Deep Learning for Computer Vision' course on Udemy and has a deep passion for NLP and AI development. 
Khaled is known for his expertise in computer vision and is actively working on a Kaggle project to improve AI-based systems. 
In his spare time, he enjoys exploring new AI technologies and sharing knowledge with others.''')

# Verbose per-token report: the token text, then its POS, dependency label and
# fine-grained tag, each followed by spaCy's human-readable explanation.
for token in doc1:
    print('Words is   : ', token.text)
    annotations = (
        ('POS', token.pos_),
        ('Dep', token.dep_),
        ('Tag', token.tag_),
    )
    for label, value in annotations:
        print(f'{label} is   : ', value, '===', spacy.explain(value))
    print('-----------------------')


Words is   :  Khaled
POS is   :  PROPN === proper noun
Dep is   :  compound === compound
Tag is   :  NNP === noun, proper singular
-----------------------
Words is   :  Ashraf
POS is   :  PROPN === proper noun
Dep is   :  nsubj === nominal subject
Tag is   :  NNP === noun, proper singular
-----------------------
Words is   :  ,
POS is   :  PUNCT === punctuation
Dep is   :  punct === punctuation
Tag is   :  , === punctuation mark, comma
-----------------------
Words is   :  an
POS is   :  DET === determiner
Dep is   :  det === determiner
Tag is   :  DT === determiner
-----------------------
Words is   :  AI
POS is   :  PROPN === proper noun
Dep is   :  compound === compound
Tag is   :  NNP === noun, proper singular
-----------------------
Words is   :  engineer
POS is   :  NOUN === noun
Dep is   :  appos === appositional modifier
Tag is   :  NN === noun, singular or mass
-----------------------
Words is   :  from
POS is   :  ADP === adposition
Dep is   :  prep === prepositional modifier

In [3]:
# Compact table, one row per token: text, POS, fine-grained tag, and the
# explanation spaCy gives for that tag. Columns are left-padded to fixed widths.
for token in doc1:
    tag_meaning = spacy.explain(token.tag_)
    print(f'{token.text:<10} {token.pos_:<8} {token.tag_:<6} {tag_meaning}')

Khaled     PROPN    NNP    noun, proper singular
Ashraf     PROPN    NNP    noun, proper singular
,          PUNCT    ,      punctuation mark, comma
an         DET      DT     determiner
AI         PROPN    NNP    noun, proper singular
engineer   NOUN     NN     noun, singular or mass
from       ADP      IN     conjunction, subordinating or preposition
Egypt      PROPN    NNP    noun, proper singular
,          PUNCT    ,      punctuation mark, comma
has        AUX      VBZ    verb, 3rd person singular present
worked     VERB     VBN    verb, past participle
on         ADP      IN     conjunction, subordinating or preposition
various    ADJ      JJ     adjective (English), other noun-modifier (Chinese)
projects   NOUN     NNS    noun, plural
in         ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
field      NOUN     NN     noun, singular or mass
of         ADP      IN     conjunction, subordinating or preposition
machine    NOUN     NN

In [4]:
# Inspect the second token ("read") of a short sentence; the recorded output
# for this cell tags it as VBP.
doc = nlp('I read book now.')
r = doc[1]
tag_meaning = spacy.explain(r.tag_)
print(f'{r.text:<10} {r.pos_:<8} {r.tag_:<6} {tag_meaning}')

read       VERB     VBP    verb, non-3rd person singular present


In [5]:
# Same token position in a different sentence; the recorded output for this
# cell tags "read" as VBD, showing the tag depends on the surrounding words.
doc = nlp('I read a book on NLP.')
r = doc[1]
tag_meaning = spacy.explain(r.tag_)
print(f'{r.text:<10} {r.pos_:<8} {r.tag_:<6} {tag_meaning}')

read       VERB     VBD    verb, past tense


In [6]:
# Frequency of each POS attribute ID across the tokens of `doc1`.
POS_counts = doc1.count_by(spacy.attrs.POS)

# Report in ascending ID order, looking each ID up in the vocab to get its label.
for pos_id in sorted(POS_counts):
    pos_label = doc1.vocab[pos_id].text
    print(f'{pos_id}. {pos_label:<5}: {POS_counts[pos_id]}')


84. ADJ  : 4
85. ADP  : 12
86. ADV  : 1
87. AUX  : 3
89. CCONJ: 4
90. DET  : 5
92. NOUN : 17
94. PART : 2
95. PRON : 4
96. PROPN: 15
97. PUNCT: 9
100. VERB : 10
103. SPACE: 3


In [7]:
# Frequency of each fine-grained TAG attribute ID across the tokens of `doc1`.
TAG_counts = doc1.count_by(spacy.attrs.TAG)

# Report in ascending ID order, looking each ID up in the vocab to get its label.
for tag_id in sorted(TAG_counts):
    tag_label = doc1.vocab[tag_id].text
    print(f'{tag_id}. {tag_label:<4}: {TAG_counts[tag_id]}')

74. POS : 1
164681854541413346. RB  : 1
783433942507015291. NNS : 4
1292078113972184607. IN  : 12
1534113631682161808. VBG : 3
2593208677638477497. ,   : 3
3822385049556375858. VBN : 3
4062917326063685704. PRP$: 2
4969857429396651903. ``  : 1
5595707737748328492. TO  : 1
6893682062797376370. _SP : 3
8214596291009089021. HYPH: 1
10554686591937588953. JJ  : 4
12646065887601541794. .   : 4
13656873538139661788. PRP : 2
13927759927860985106. VBZ : 5
14200088355797579614. VB  : 1
15267657372422890137. DT  : 5
15308085513773655218. NN  : 13
15794550382381185553. NNP : 15
17109001835818727656. VBD : 1
17571114184892886314. CC  : 4


In [8]:
# Frequency of each dependency-label (DEP) attribute ID across the tokens of `doc1`.
DEP_counts = doc1.count_by(spacy.attrs.DEP)

# Report in ascending ID order, looking each ID up in the vocab to get its label.
for dep_id in sorted(DEP_counts):
    dep_label = doc1.vocab[dep_id].text
    print(f'{dep_id}. {dep_label:<4}: {DEP_counts[dep_id]}')

399. advcl: 1
400. advmod: 1
402. amod: 5
403. appos: 1
405. aux : 3
406. auxpass: 1
407. cc  : 4
410. conj: 4
414. dep : 3
415. det : 5
416. dobj: 5
428. npadvmod: 1
429. nsubj: 3
430. nsubjpass: 1
439. pobj: 12
440. poss: 3
443. prep: 12
445. punct: 9
450. xcomp: 1
7037928807040764755. compound: 9
8110129090154140942. case: 1
8206900633647566924. ROOT: 4


In [9]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [10]:
text = 'Moses supposes his toeses are roses but moses supposes erroneously'

# Tokenize with NLTK, tag the tokens, then print each word with its tag and
# the explanation `spacy.explain` returns for that tag string.
words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(words)
for word, tag in tagged_words:
    meaning = spacy.explain(tag)
    print(f'word : ({word}), type : ({tag}) , means :  ({meaning})')

word : (Moses), type : (NNS) , means :  (noun, plural)
word : (supposes), type : (VBZ) , means :  (verb, 3rd person singular present)
word : (his), type : (PRP$) , means :  (pronoun, possessive)
word : (toeses), type : (NNS) , means :  (noun, plural)
word : (are), type : (VBP) , means :  (verb, non-3rd person singular present)
word : (roses), type : (NNS) , means :  (noun, plural)
word : (but), type : (CC) , means :  (conjunction, coordinating)
word : (moses), type : (VBZ) , means :  (verb, 3rd person singular present)
word : (supposes), type : (NNS) , means :  (noun, plural)
word : (erroneously), type : (RB) , means :  (adverb)


In [11]:
# Two-paragraph English prose sample. The following cell passes this same
# string to PunktSentenceTokenizer both as training text and as the text to
# split. Note that this rebinds `text`, replacing the one-line sentence
# assigned in the earlier NLTK tagging cell.
text = '''
Thomas Gradgrind, sir.  A man of realities.  A man of facts and calculations.  A man who proceeds upon the principle that
two and two are four, and nothing over, and who is not to be talked into allowing for anything over.  Thomas Gradgrind, 
sir—peremptorily Thomas—Thomas Gradgrind.  With a rule and a pair of scales, and the multiplication table always in his pocket, 
sir, ready to weigh and measure any parcel of human nature, and tell you exactly what it comes to.  It is a mere question of
figures, a case of simple arithmetic.  You might hope to get some other nonsensical belief into the head of George Gradgrind, or Augustus Gradgrind, or John Gradgrind, or Joseph Gradgrind (all supposititious, non-existent persons), but into the head of Thomas Gradgrind—no, sir!

In such terms Mr. Gradgrind always mentally introduced himself, whether to his private circle of acquaintance, or to the public in general.  In such terms, no doubt, substituting the words ‘boys and girls,’ for ‘sir,’ Thomas Gradgrind now presented Thomas Gradgrind to the little pitchers before him, who were to be filled so full of facts.
'''

In [12]:
# Train a Punkt sentence tokenizer on `text`, then split that same `text`
# into sentences with it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=text)
tokenized = custom_sent_tokenizer.tokenize(text)

# Show the first ten sentences as the cell's output.
tokenized[:10]

['\nThomas Gradgrind, sir.',
 'A man of realities.',
 'A man of facts and calculations.',
 'A man who proceeds upon the principle that\ntwo and two are four, and nothing over, and who is not to be talked into allowing for anything over.',
 'Thomas Gradgrind, \nsir—peremptorily Thomas—Thomas Gradgrind.',
 'With a rule and a pair of scales, and the multiplication table always in his pocket, \nsir, ready to weigh and measure any parcel of human nature, and tell you exactly what it comes to.',
 'It is a mere question of\nfigures, a case of simple arithmetic.',
 'You might hope to get some other nonsensical belief into the head of George Gradgrind, or Augustus Gradgrind, or John Gradgrind, or Joseph Gradgrind (all supposititious, non-existent persons), but into the head of Thomas Gradgrind—no, sir!',
 'In such terms Mr. Gradgrind always mentally introduced himself, whether to his private circle of acquaintance, or to the public in general.',
 'In such terms, no doubt, substituting the words

In [13]:
import re           
# Read the raw text of two files from the NLTK `state_union` corpus: the first
# becomes `train_text`, the second `sample_text`.
train_text, sample_text = (
    state_union.raw(fileid)
    for fileid in ("2005-GWBush.txt", "2006-GWBush.txt")
)

In [14]:
# Preview only the beginning of the training text. Displaying the whole
# speech as cell output bloats the saved notebook and buries the narrative;
# a bounded prefix is enough to confirm the file loaded as expected.
train_text[:1000]

'PRESIDENT GEORGE W. BUSH\'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nFebruary 2, 2005\n\n\n9:10 P.M. EST \n\nTHE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: \n\nAs a new Congress gathers, all of us in the elected branches of government share a great privilege: We\'ve been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) \n\nTwo weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. \n\nTonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.

In [15]:
# Train a Punkt sentence tokenizer on `train_text`, then use it to split the
# separate `sample_text` into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

# Show the first five sentences as the cell's output.
tokenized[:5]

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.",
 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.',
 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.',
 '(Applause.)',
 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.']

In [16]:
# Arabic passage run through the same `nlp` pipeline used above.
# NOTE(review): `nlp` was loaded from 'en_core_web_sm' in the first cell, so
# the POS/dep/tag values printed for this text come from that pipeline and
# look unreliable in the recorded output (e.g. the comma '،' is tagged PROPN).
# Confirm whether an Arabic-capable pipeline was intended here.
# NOTE(review): this rebinds `doc1`, which earlier cells use for the English
# paragraph; re-running those cells after this one will analyze this text
# instead.
doc1 = nlp('''ضمت مؤلفات الخوارزمي كتاب الجمع والتفريق
في الحساب الهندي، وكتاب رسم الربع المعمور، وكتاب تقويم البلدان، وكتاب العمل بالأسطرلاب، 
وكتاب "صورة الأرض " الذي اعتمد فيه على كتاب المجسطي لبطليموس مع إضافات وشروح
وتعليقات، وأعاد كتابة كتاب الفلك الهندي المعروف باسم "السند هند الكبير" الذي ترجم إلى اللغة
العربية زمن الخليفة المنصور فأعاد الخوارزمي كتابته وأضاف إليه وسمي كتابه "السند هند الصغير".

وقد عرض في كتاب المختصر في حساب الجبر والمقابلة أول حل منهجي
للمعادلات الخطية والمعادلات التربيعية مستعملا في ذلك الطريقة المعروفة باسم إكمال المربع. ويعتبر مؤسس علم الجبر،
(اللقب الذي يتقاسمه مع ديوفانتوس) في القرن الثاني عشر، ولقد قدمت ترجمات اللاتينية عن حسابه على الأرقام الهندية، 
النظام العشري إلى العالم الغربي. نقح الخوارزمي كتاب الجغرافيا لكلاوديوس بطليموس وكتب في علم الفلك والتنجيم.
''')


# Verbose per-token report. For POS, dependency and tag it prints the numeric
# attribute ID, the string label, and spaCy's explanation of that label.
for token in doc1:
    print('Words is   : ', token.text)
    annotations = (
        ('POS', token.pos, token.pos_),
        ('Dep', token.dep, token.dep_),
        ('Tag', token.tag, token.tag_),
    )
    for label, attr_id, attr_label in annotations:
        print(f'{label} is   : ', attr_id, '===', attr_label, '===', spacy.explain(attr_label))
    print('-----------------------')

Words is   :  ضمت
POS is   :  92 === NOUN === noun
Dep is   :  429 === nsubj === nominal subject
Tag is   :  15308085513773655218 === NN === noun, singular or mass
-----------------------
Words is   :  مؤلفات
POS is   :  96 === PROPN === proper noun
Dep is   :  428 === npadvmod === noun phrase as adverbial modifier
Tag is   :  15794550382381185553 === NNP === noun, proper singular
-----------------------
Words is   :  الخوارزمي
POS is   :  100 === VERB === verb
Dep is   :  8206900633647566924 === ROOT === root
Tag is   :  17109001835818727656 === VBD === verb, past tense
-----------------------
Words is   :  كتاب
POS is   :  96 === PROPN === proper noun
Dep is   :  416 === dobj === direct object
Tag is   :  15794550382381185553 === NNP === noun, proper singular
-----------------------
Words is   :  الجمع
POS is   :  96 === PROPN === proper noun
Dep is   :  7037928807040764755 === compound === compound
Tag is   :  15794550382381185553 === NNP === noun, proper singular
------------------

In [17]:
# Compact table, one row per token of the current `doc1` (last assigned in the
# preceding cell): text, POS, fine-grained tag, and spaCy's tag explanation.
for token in doc1:
    tag_meaning = spacy.explain(token.tag_)
    print(f'{token.text:<10} {token.pos_:<8} {token.tag_:<6} {tag_meaning}')

ضمت        NOUN     NN     noun, singular or mass
مؤلفات     PROPN    NNP    noun, proper singular
الخوارزمي  VERB     VBD    verb, past tense
كتاب       PROPN    NNP    noun, proper singular
الجمع      PROPN    NNP    noun, proper singular
والتفريق   PROPN    NNP    noun, proper singular

          SPACE    _SP    whitespace
في         PROPN    NNP    noun, proper singular
الحساب     PROPN    NNP    noun, proper singular
الهندي     PROPN    NNP    noun, proper singular
،          PROPN    NNP    noun, proper singular
وكتاب      PROPN    NNP    noun, proper singular
رسم        PROPN    NNP    noun, proper singular
الربع      PROPN    NNP    noun, proper singular
المعمور    PROPN    NNP    noun, proper singular
،          PROPN    NNP    noun, proper singular
وكتاب      PROPN    NNP    noun, proper singular
تقويم      PROPN    NNP    noun, proper singular
البلدان    PROPN    NNP    noun, proper singular
،          PROPN    NNP    noun, proper singular
وكتاب      PROPN    NNP    noun, pr