# spaCy

In [1]:
! pip install spacy==3.2.3
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.4/777.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m


Installing collected packages: en-core-web-lg
  Attempting uninstall: en-core-web-lg
    Found existing installation: en-core-web-lg 3.4.1
    Uninstalling en-core-web-lg-3.4.1:
      Successfully uninstalled en-core-web-lg-3.4.1
Successfully installed en-core-web-lg-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
import spacy
from spacy import displacy

# Load the en_core_web_lg pipeline; its predictions are exposed as doc.ents.
nlp = spacy.load("en_core_web_lg")
raw_text = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million."

doc = nlp(raw_text)

# Print every recognised entity span followed by its label, one per line.
for ent in doc.ents:
    print(ent.text, ent.label_)

# Render the same entities as highlighted text in the notebook output.
displacy.render(doc, style="ent", jupyter=True)

  from .autonotebook import tqdm as notebook_tqdm


The Mars Orbiter Mission (MOM) PRODUCT
Mangalyaan PERSON
Earth LOC
5 November 2013 DATE
the Indian Space Research Organisation ORG
ISRO ORG
Mars LOC
24 September 2014 DATE
India GPE
first ORDINAL
Mars LOC
first ORDINAL
$74 million MONEY


# Using NLTK

In [3]:
! pip install nltk==3.6.5



In [5]:
from nltk import download
# Data packages required by pos_tag, ne_chunk and the tokenizers in this notebook.
NLTK_RESOURCES = ["averaged_perceptron_tagger", "maxent_ne_chunker", "words", "punkt"]

# download() returns a success flag. A list (not a generator) is used so that
# every package is attempted even if an earlier one fails, and the displayed
# value is True only when all of them succeeded.
all([download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jyotikasingh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/jyotikasingh/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/jyotikasingh/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk import ne_chunk

# Sample passage; the same text is fed to every NER library in this notebook.
raw_text = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million."
# Tokenize the whole passage at once and attach a part-of-speech tag to each token.
# NOTE(review): the next cell re-tags the text sentence by sentence and does not
# appear to read `doc` -- confirm whether this line is still needed.
doc = pos_tag(word_tokenize(raw_text))

In [7]:
# Chunk labels kept as named entities; chunks with any other label are ignored.
NLTK_LABELS = ["PERSON", "ORGANIZATION", "GPE"]

# POS-tag each sentence separately: one list of (token, POS tag) pairs per sentence.
tagged_doc = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(raw_text)]

# Collected (entity text, entity label) pairs, in document order.
entities = []
for tagged_sent in tagged_doc:
    for node in ne_chunk(tagged_sent):
        # Only chunk subtrees expose .label(); plain (token, POS tag) leaves do not.
        if not hasattr(node, "label") or node.label() not in NLTK_LABELS:
            continue
        # Drop noisy tokens: anything that looks like a URL, contains a newline,
        # or is empty/whitespace.
        kept_tokens = [
            token
            for token, _pos_tag in node
            if "http" not in token.lower()
            and "\n" not in token
            and len(token.strip()) > 0
        ]
        # Skip chunks whose tokens were all filtered out, so that no
        # empty-string entity ends up in the result.
        if kept_tokens:
            entities.append((" ".join(kept_tokens), node.label()))
print(entities)

[('Mars', 'ORGANIZATION'), ('MOM', 'ORGANIZATION'), ('Mangalyaan', 'GPE'), ('Earth', 'GPE'), ('Indian', 'GPE'), ('Space Research Organisation', 'ORGANIZATION'), ('ISRO', 'ORGANIZATION'), ('Mars', 'PERSON'), ('India', 'GPE'), ('Mars', 'PERSON')]


# spaCy transformers

In [8]:
! pip install spacy==3.2.3
! pip install spacy-transformers==1.1.5 -f https://download.pytorch.org/whl/torch_stable.html
! python -m spacy download en_core_web_trf

Looking in links: https://download.pytorch.org/whl/torch_stable.html


Collecting en-core-web-trf==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.2.0/en_core_web_trf-3.2.0-py3-none-any.whl (460.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.2/460.2 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [9]:
import spacy
from spacy import displacy

# Load the en_core_web_trf pipeline for this section.
nlp = spacy.load("en_core_web_trf")

# Same sample passage as in the other sections, so results can be compared.
raw_text = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million."
doc = nlp(raw_text)

# Show the predicted entity spans as highlighted text in the notebook output.
displacy.render(doc, style="ent", jupyter=True)

# Transformers

In [10]:
! pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from transformers import pipeline

# aggregation_strategy="simple" is the documented replacement for the
# deprecated grouped_entities=True flag and keeps the same grouping behaviour.
# NOTE(review): the word-level strategies ("first", "average", "max") avoid
# splitting one word into differently-labelled pieces; "simple" is kept here so
# the existing results do not change.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")



Downloading: 100%|██████████████████████████████| 829/829 [00:00<00:00, 439kB/s][A[A


Downloading:   0%|                                   | 0.00/413M [00:00<?, ?B/s][A[A

Downloading:   0%|                           | 175k/413M [00:00<04:38, 1.55MB/s][A[A

Downloading:   0%|                           | 431k/413M [00:00<03:29, 2.07MB/s][A[A

Downloading:   0%|                           | 734k/413M [00:00<03:35, 2.00MB/s][A[A

Downloading:   0%|                           | 932k/413M [00:00<04:46, 1.51MB/s][A[A

Downloading:   0%|                          | 1.10M/413M [00:00<04:26, 1.62MB/s][A[A

Downloading:   0%|                          | 1.39M/413M [00:00<03:45, 1.92MB/s][A[A

Downloading:   0%|                          | 1.97M/413M [00:00<02:22, 3.03MB/s][A[A

Downloading:   1%|▏                         | 2.29M/413M [00:01<02:26, 2.95MB/s][A[A

Downloading:   1%|▏                         | 2.83M/413M [00:01<01:56, 3.69MB/s][A[A

Downloading:   1%|▏          

Downloading:  45%|████████████               | 185M/413M [00:22<00:22, 10.8MB/s][A[A

Downloading:  45%|████████████▏              | 186M/413M [00:22<00:26, 8.97MB/s][A[A

Downloading:  45%|████████████▏              | 187M/413M [00:22<00:26, 8.81MB/s][A[A

Downloading:  46%|████████████▎              | 188M/413M [00:22<00:25, 9.19MB/s][A[A

Downloading:  46%|████████████▎              | 189M/413M [00:22<00:29, 8.01MB/s][A[A

Downloading:  46%|████████████▍              | 191M/413M [00:22<00:26, 8.77MB/s][A[A

Downloading:  46%|████████████▌              | 192M/413M [00:23<00:23, 9.68MB/s][A[A

Downloading:  47%|████████████▌              | 193M/413M [00:23<00:24, 9.59MB/s][A[A

Downloading:  47%|████████████▋              | 194M/413M [00:23<00:25, 9.19MB/s][A[A

Downloading:  47%|████████████▋              | 195M/413M [00:23<00:23, 9.84MB/s][A[A

Downloading:  47%|████████████▊              | 196M/413M [00:23<00:22, 9.98MB/s][A[A

Downloading:  48%|████████████▉ 

Downloading:  95%|█████████████████████████▌ | 391M/413M [00:44<00:02, 10.2MB/s][A[A

Downloading:  95%|█████████████████████████▌ | 392M/413M [00:44<00:02, 10.1MB/s][A[A

Downloading:  95%|█████████████████████████▋ | 393M/413M [00:44<00:02, 9.69MB/s][A[A

Downloading:  95%|█████████████████████████▋ | 394M/413M [00:44<00:02, 8.71MB/s][A[A

Downloading:  96%|█████████████████████████▊ | 395M/413M [00:44<00:01, 9.84MB/s][A[A

Downloading:  96%|█████████████████████████▊ | 396M/413M [00:44<00:01, 9.64MB/s][A[A

Downloading:  96%|█████████████████████████▉ | 397M/413M [00:44<00:01, 9.80MB/s][A[A

Downloading:  96%|█████████████████████████▉ | 398M/413M [00:45<00:01, 8.61MB/s][A[A

Downloading:  96%|██████████████████████████ | 399M/413M [00:45<00:01, 8.84MB/s][A[A

Downloading:  97%|██████████████████████████ | 400M/413M [00:45<00:01, 8.80MB/s][A[A

Downloading:  97%|██████████████████████████▏| 401M/413M [00:45<00:01, 8.75MB/s][A[A

Downloading:  97%|██████████████

In [14]:
# Same sample passage as in the earlier sections, so the outputs are comparable.
raw_text = "The Mars Orbiter Mission (MOM), informally known as Mangalyaan, was launched into Earth orbit on 5 November 2013 by the Indian Space Research Organisation (ISRO) and has entered Mars orbit on 24 September 2014. India thus became the first country to enter Mars orbit on its first attempt. It was completed at a record low cost of $74 million."
# Run the `ner` pipeline created in the previous cell and print its raw result:
# a list of dicts with 'entity_group', 'score', 'word', 'start' and 'end' keys.
print(ner(raw_text))

[{'entity_group': 'MISC', 'score': 0.7344227, 'word': 'Mars Orbiter Mission', 'start': 4, 'end': 24}, {'entity_group': 'MISC', 'score': 0.6008748, 'word': 'MOM', 'start': 26, 'end': 29}, {'entity_group': 'LOC', 'score': 0.43170515, 'word': 'Man', 'start': 52, 'end': 55}, {'entity_group': 'MISC', 'score': 0.5044298, 'word': '##gal', 'start': 55, 'end': 58}, {'entity_group': 'LOC', 'score': 0.47212577, 'word': '##ya', 'start': 58, 'end': 60}, {'entity_group': 'MISC', 'score': 0.48969588, 'word': '##an', 'start': 60, 'end': 62}, {'entity_group': 'LOC', 'score': 0.75420374, 'word': 'Earth', 'start': 82, 'end': 87}, {'entity_group': 'ORG', 'score': 0.99907124, 'word': 'Indian Space Research Organisation', 'start': 120, 'end': 154}, {'entity_group': 'ORG', 'score': 0.9986104, 'word': 'ISRO', 'start': 156, 'end': 160}, {'entity_group': 'LOC', 'score': 0.99694604, 'word': 'Mars', 'start': 178, 'end': 182}, {'entity_group': 'LOC', 'score': 0.99982953, 'word': 'India', 'start': 211, 'end': 216},