In [1]:
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
doc = Doc(Vocab(), words = [u'Hello', u'World!'])
print(doc)

Hello World! 


In [2]:
print(type(doc))
print(doc.vocab)
for token in doc:
    lexeme = doc.vocab[token.text]
    print(lexeme.text)

<class 'spacy.tokens.doc.Doc'>
<spacy.vocab.Vocab object at 0x7f10ec652280>
Hello
World!


1. The Vocab object belongs to spacy.tokens.doc.Doc class
2. In spaCy, a Lexeme object represents a single entry in the vocabulary and provides access to various linguistic attributes associated with a word.

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I want to learn spaCy.')
token_text1 = [token.text for token in doc]
token_text2 = [doc[i].text for i in range(len(doc))]
print(token_text1)
print(token_text2)

['I', 'want', 'to', 'learn', 'spaCy', '.']
['I', 'want', 'to', 'learn', 'spaCy', '.']


In [4]:
from distutils.sysconfig import get_python_lib
import os.path

# Get the directory containing the en_core_web_sm model
model_directory = os.path.join(get_python_lib(), 'en_core_web_sm')
print(model_directory)

/home/pheonix/University/Semester-7/NLP Tasks/Tasks/.env/lib/python3.8/site-packages/en_core_web_sm


1. en_core_web_sm: en_core_web_sm is a small-sized English model in spaCy, designed for basic natural language processing tasks.
2. 15.3 MB
3. **en_core_web_md:** A medium-sized English model with more linguistic annotations and higher accuracy for tasks compared to en_core_web_sm. **en_core_web_lg:** A large-sized English model with even more linguistic data, word vectors, and higher accuracy than both en_core_web_sm and en_core_web_md.

In [6]:
doc = nlp(u'I want to learn spaCy.')
for i in range(len(doc)):
    print([t for t in doc[i].lefts])

[]
[I]
[]
[to]
[]
[]


In [7]:
doc = nlp(u'I want to learn spaCy.')
for i in range(len(doc)):
    print([t for t in doc[i].rights])
    print([t for t in doc[i].children])

[]
[]
[learn, .]
[I, learn, .]
[]
[]
[spaCy]
[to, spaCy]
[]
[]
[]
[]


In [8]:
from spacy import displacy

In [9]:
displacy.render(doc, style='dep')

In [10]:
displacy.render(doc, style='dep', options={'compact': True, 'distance': 120})

In [11]:
displacy.render(doc, style='dep', options={'compact': True, 'distance': 120, 'arrow_spacing': 20})

In [12]:
# Process the sentence
sentence2 = "I would very much want to eat a hot dinner."
doc2 = nlp(sentence2)

# Visualize left and right dependencies
displacy.render(doc2, style='dep', options={'compact': True, 'distance': 120})


In [14]:
def extract_dependency_grammars(doc):
    dependency_grammars = []
    for token in doc:
        dependency_grammars.append((token.text, token.dep_))
    return dependency_grammars

sentence1_dependency_grammars = extract_dependency_grammars(doc)
sentence2_dependency_grammars = extract_dependency_grammars(doc2)

print("Dependency Grammars for 'I want to learn spaCy.':")
print(sentence1_dependency_grammars)

print("\nDependency Grammars for 'I would very much want to eat a hot dinner.':")
print(sentence2_dependency_grammars)


Dependency Grammars for 'I want to learn spaCy.':
[('I', 'nsubj'), ('want', 'ROOT'), ('to', 'aux'), ('learn', 'xcomp'), ('spaCy', 'dobj'), ('.', 'punct')]

Dependency Grammars for 'I would very much want to eat a hot dinner.':
[('I', 'nsubj'), ('would', 'aux'), ('very', 'advmod'), ('much', 'advmod'), ('want', 'ROOT'), ('to', 'aux'), ('eat', 'xcomp'), ('a', 'det'), ('hot', 'amod'), ('dinner', 'dobj'), ('.', 'punct')]


In [19]:
pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting joblib (from nltk)
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m94.5 kB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Downloading regex-2023.12.25-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (777 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.0/777.0 kB[0m [31m527.3 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hUsing cached joblib-1.3.2-py3-none-any.whl (302 kB)
Installing collected packages: regex, joblib, nltk
Successfully installed joblib-1.3.2 nltk-3.8.1 regex-2023.12.25
Note: you may need to restart the kernel to use updated packages.


In [20]:
import nltk

In [21]:
texts = [u"We are nearing the end of the semester at Peshawar. Final exams of the Fall 2023 semester will start soon."]


In [22]:
for text in texts:
    sentences = nltk.sent_tokenize(text)
    print(sentences)

['We are nearing the end of the semester at Peshawar.', 'Final exams of the Fall 2023 semester will start soon.']


In [26]:
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)
    

['We', 'are', 'nearing', 'the', 'end', 'of', 'the', 'semester', 'at', 'Peshawar', '.']
['Final', 'exams', 'of', 'the', 'Fall', '2023', 'semester', 'will', 'start', 'soon', '.']


In [27]:
    tagged_words = nltk.pos_tag(words)
    print(tagged_words)

[('Final', 'JJ'), ('exams', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Fall', 'NN'), ('2023', 'CD'), ('semester', 'NN'), ('will', 'MD'), ('start', 'VB'), ('soon', 'RB'), ('.', '.')]


In [28]:
    ne_tagged_words = nltk.ne_chunk(tagged_words)
    print(ne_tagged_words)

(S
  Final/JJ
  exams/NN
  of/IN
  the/DT
  Fall/NN
  2023/CD
  semester/NN
  will/MD
  start/VB
  soon/RB
  ./.)


In [29]:
nltk.download('punkt')
# Sentence Tokenize
nltk.download('averaged_perceptron_tagger') # POS Tagging
nltk.download('maxent_ne_chunker') # Named Entity Chunking
nltk.download('words')

[nltk_data] Downloading package punkt to /home/pheonix/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pheonix/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/pheonix/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/pheonix/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [31]:
from spacy import displacy
doc = nlp(u'We are nearing the end of the semester at Peshawar. Final exams of the Fall2023 semester will start soon.')
displacy.render(doc, style='ent')

In [32]:
for ent in doc.ents:
    print(ent.text, ent.label_)

the end of the semester DATE
Peshawar GPE
Fall2023 GPE


1. NLTK doesn't have a pre-trained Named Entity Recognition (NER) model by default, requiring custom training for entity recognition. It doesn't output named entity labels like spaCy.

S
  Final/JJ
  exams/NN
  of/IN
  the/DT
  Fall/NN
  2023/CD
  semester/NN
  will/MD
  start/VB
  soon/RB
  ./.

2. spaCy offers a pre-trained NER model, identifying entities (persons, organizations, dates, locations) and labeling them, providing named entity outputs directly without the need for custom training.

the end of the semester DATE
Peshawar GPE
Fall2023 GPE

In [33]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

1. tok2vec: This component handles tokenization and converts words into vectors. It's responsible for turning raw text into the sequence of tokens that downstream components operate on.

2. tagger: The part-of-speech (POS) tagger assigns grammatical tags (such as noun, verb, adjective) to each token in the text.

3. parser: The dependency parser analyzes the syntactic structure of the sentence, assigning grammatical dependencies between words and creating a parse tree.

4. attribute_ruler: This component sets attributes on tokens, possibly based on the token's text or context.

5. lemmatizer: The lemmatizer reduces words to their base or dictionary form (lemmas). For example, 'running' becomes 'run'.

6. ner: The named entity recognizer identifies and categorizes named entities in the text, such as persons, organizations, dates, and more.



In [34]:
nlp = spacy.load('en_core_web_sm',disable=['parser'])

In [35]:
from spacy.language import Language
@Language.component("my_component")
def my_component(doc):
# Do something to the doc here
    return doc

In [36]:
nlp.add_pipe("my_component")

<function __main__.my_component(doc)>

In [38]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'ner', 'my_component']


In [40]:
# nlp.add_pipe("my_component", first=True)
# nlp.add_pipe("my_component", last=True)
# nlp.add_pipe("my_component", before="parser")
# nlp.add_pipe("my_component", after="parser")

In [41]:
doc = nlp(u'I want to learn spaCy.')
displacy.render(doc, style='dep')

https://spacy.io/usage/models


In [42]:
doc = nlp(u'How do I learn spaCy.')
displacy.render(doc, style='dep')

In [43]:
import spacy
nlp = spacy.load('en_core_web_sm')
def dep_pattern(doc):
    for i in range(len(doc)-1):
        print(doc[i].dep_)
        if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'ROOT' and doc[i+2].dep_ =='acomp':
            return True
    return False
doc = nlp(u'How do I learn spaCy.')
if dep_pattern(doc):
    print('Found')
else:
    print('Not found')

advmod
aux
nsubj
Found
