spaCy is a free, open-source library for Natural Language Processing (NLP) in Python. It features named entity recognition (NER), part-of-speech (POS) tagging, dependency parsing, word vectors and more.

In [2]:
import spacy

### Data types

In [3]:
# language model
# Load the small English pipeline once; the resulting `nlp` object is what
# the following cells call on raw text to obtain processed documents.

nlp = spacy.load("en_core_web_sm")

In [4]:
# spacy.tokens.Doc
# Calling the pipeline on a string returns a Doc. The trailing tuple
# expression makes the notebook display the Doc together with its type.

doc = nlp("Izam is learning spaCy.")
doc, type(doc)

(Izam is learning spaCy., spacy.tokens.doc.Doc)

In [5]:
# spacy.tokens.Token
# Iterating over a Doc yields its tokens; print the surface text, the lemma
# and the part-of-speech tag of each one.

for tok in doc:
    text, lemma, pos = tok.text, tok.lemma_, tok.pos_
    print(f"Token: {text}, Lemma: {lemma}, POS: {pos}")

Token: Izam, Lemma: Izam, POS: PROPN
Token: is, Lemma: be, POS: AUX
Token: learning, Lemma: learn, POS: VERB
Token: spaCy, Lemma: spaCy, POS: VERB
Token: ., Lemma: ., POS: PUNCT


In [6]:
# Named entities
# doc.ents holds the entities found in the text; the printed type shows
# what kind of object each entity is.

for entity in doc.ents:
    entity_type = type(entity)
    print(f"Entity: {entity.text}, Label: {entity.label_}, type: {entity_type}")

Entity: Izam, Label: PERSON, type: <class 'spacy.tokens.span.Span'>


**the overview**

<img src="https://spacy.io/images/architecture.svg" height="600" width="500" alt="spaCy architecture overview">

### Some Attributes

In [15]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Only the first three tokens are shown to keep the output short; slicing
# the Doc replaces the manual counter-and-break.
for token in doc[:3]:
  print(
      f"text - {token.text}\nlemma - {token.lemma_}\npart of speech - {token.pos_}\ntag - {token.tag_}\ndependency - {token.dep_}\n"
      f"shape - {token.shape_}\nis_alpha - {token.is_alpha}\nis_stop - {token.is_stop}\n================\n"
  )

text - Apple
lemma - Apple
part of speech - PROPN
tag - NNP
dependency - nsubj
shape - Xxxxx
is_alpha - True
is_stop - False

text - is
lemma - be
part of speech - AUX
tag - VBZ
dependency - aux
shape - xx
is_alpha - True
is_stop - True

text - looking
lemma - look
part of speech - VERB
tag - VBG
dependency - ROOT
shape - xxxx
is_alpha - True
is_stop - False



In [22]:
# in the case of entities
# For every entity print its text, its character offsets and its label,
# followed by a separator line.

for ent in doc.ents:
  print(
      f"text - {ent.text}\nstart_char - {ent.start_char}\nend_char - {ent.end_char}\nlabel - {ent.label_}",
      "================\n",
      sep="\n",
  )

text - Apple
start_char - 0
end_char - 5
label - ORG

text - U.K.
start_char - 27
end_char - 31
label - GPE

text - $1 billion
start_char - 44
end_char - 54
label - MONEY



So far we have been using the built-in small pipeline (`en_core_web_sm`); now it is time to switch to the medium pipeline (`en_core_web_md`).

### Medium Pipeline

In [23]:
# downloading
# Install the medium English pipeline only when it is missing, so that
# re-running the notebook does not fetch the ~43 MB wheel again.
# spacy.cli.download installs through the interpreter the kernel is running
# on, which a bare `! python ...` shell call does not guarantee.

MEDIUM_MODEL = "en_core_web_md"
if not spacy.util.is_package(MEDIUM_MODEL):
    spacy.cli.download(MEDIUM_MODEL)

2024-01-22 09:45:01.491126: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-22 09:45:01.491186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-22 09:45:01.492678: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed

In [27]:
# medium pipeline
# For every token report whether it has a word vector, the norm of that
# vector (note: the "vector ->" label prints `vector_norm`, not the vector
# itself) and whether the token is out-of-vocabulary.

nlp = spacy.load("en_core_web_md")
tokens = nlp("dog cat banana afskfse")

for word in tokens:
  print(
      f"text -> {word.text}\nis any vector -> {word.has_vector}\nvector -> {word.vector_norm}\nIs it a oov word -> {word.is_oov}",
      "====================\n",
      sep="\n",
  )

text -> dog
is any vector -> True
vector -> 75.25423431396484
Is it a oov word -> False

text -> cat
is any vector -> True
vector -> 63.18849563598633
Is it a oov word -> False

text -> banana
is any vector -> True
vector -> 31.62035369873047
Is it a oov word -> False

text -> afskfse
is any vector -> False
vector -> 0.0
Is it a oov word -> True



In [32]:
# finding the similarity of two documents (whole sentences, not single
# words): Doc.similarity gives one score for the pair

nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

score = doc1.similarity(doc2)
print(doc1, "  <--->  ", doc2, "\n\nsimilarity --> ", score)

I like salty fries and hamburgers.   <--->   Fast food tastes very good. 

similarity -->  0.691649353055761


### Pipelines

In [33]:
# know some more about the pipeline: analyze_pipes() reports, per attribute,
# which components assign or require it, plus any problems per component
from pprint import pprint

pipe_analysis = nlp.analyze_pipes()
pprint(pipe_analysis)

{'attrs': {'doc.ents': {'assigns': ['ner'], 'requires': []},
           'doc.sents': {'assigns': ['parser'], 'requires': []},
           'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},
           'token.dep': {'assigns': ['parser'], 'requires': []},
           'token.ent_iob': {'assigns': ['ner'], 'requires': []},
           'token.ent_type': {'assigns': ['ner'], 'requires': []},
           'token.head': {'assigns': ['parser'], 'requires': []},
           'token.is_sent_start': {'assigns': ['parser'], 'requires': []},
           'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},
           'token.tag': {'assigns': ['tagger'], 'requires': []}},
 'problems': {'attribute_ruler': [],
              'lemmatizer': [],
              'ner': [],
              'parser': [],
              'tagger': [],
              'tok2vec': []},
 'summary': {'attribute_ruler': {'assigns': [],
                                 'requires': [],
                                 'retokenizes': False,

In [36]:
# adding a custom component to the pipeline

from spacy.language import Language

@Language.component("component")
def component_func(doc):
  """Pass-through pipeline component: return the received Doc unchanged."""
  return doc

# add_pipe raises when a component with the same name is already in the
# pipeline, so guard the call to keep this cell safe to re-run on the same
# `nlp` object instead of having to invent a new name each time.
if "custom_compo2" not in nlp.pipe_names:
  nlp.add_pipe("component", name="custom_compo2")

# analyze_pipes() already returns a dict, so no dict() wrapper is needed.
pprint(nlp.analyze_pipes())

{'attrs': {'doc.ents': {'assigns': ['ner'], 'requires': []},
           'doc.sents': {'assigns': ['parser'], 'requires': []},
           'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},
           'token.dep': {'assigns': ['parser'], 'requires': []},
           'token.ent_iob': {'assigns': ['ner'], 'requires': []},
           'token.ent_type': {'assigns': ['ner'], 'requires': []},
           'token.head': {'assigns': ['parser'], 'requires': []},
           'token.is_sent_start': {'assigns': ['parser'], 'requires': []},
           'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},
           'token.tag': {'assigns': ['tagger'], 'requires': []}},
 'problems': {'attribute_ruler': [],
              'custom_compo': [],
              'custom_compo2': [],
              'lemmatizer': [],
              'ner': [],
              'parser': [],
              'tagger': [],
              'tok2vec': []},
 'summary': {'attribute_ruler': {'assigns': [],
                                 '

To be continued ...