In [37]:
import sys

import spacy

In [38]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Common NER Entity Types:
- PERSON: Names of people (e.g., "Barack Obama", "Albert Einstein").
- NORP: Nationalities or religious/political groups (e.g., "American", "Christian", "Democrats").
- FAC: Buildings, airports, highways, bridges, etc. (e.g., "Eiffel Tower", "Berlin Airport").
- ORG: Organizations (e.g., "Google", "United Nations", "Microsoft").
- GPE: Geo-political entities (countries, cities, states) (e.g., "USA", "New York", "India").
- LOC: Non-political locations (mountains, bodies of water, etc.) (e.g., "Mount Everest", "Amazon River").
- PRODUCT: Products such as objects, vehicles, and foods — not services (e.g., "iPhone", "Tesla Model S").
- EVENT: Events, including festivals, wars, sports events, etc. (e.g., "World War II", "Super Bowl").
- WORK_OF_ART: Books, paintings, songs, movies, etc. (e.g., "Mona Lisa", "Harry Potter").
- LAW: Named laws or legal regulations (e.g., "The Constitution", "Antitrust Law").
- LANGUAGE: Languages (e.g., "English", "Spanish", "Mandarin").
- DATE: Absolute or relative dates (e.g., "January 1, 2024", "next week").
- TIME: Time expressions (e.g., "2 PM", "morning").
- PERCENT: Percentage expressions (e.g., "25%", "50 percent").
- MONEY: Monetary values (e.g., "$100", "€50").
- QUANTITY: Quantities (e.g., "100 kilograms", "3 liters").
- ORDINAL: Ordinal numbers (e.g., "first", "second").
- CARDINAL: Cardinal numbers (e.g., "one", "100").

In [39]:
# Look up spaCy's built-in description of the PERSON label.
spacy.explain("PERSON")

'People, including fictional'

In [40]:
# Look up spaCy's built-in description of the EVENT label.
spacy.explain("EVENT")

'Named hurricanes, battles, wars, sports events, etc.'

In [41]:
# Look up spaCy's built-in description of the PRODUCT label.
spacy.explain("PRODUCT")

'Objects, vehicles, foods, etc. (not services)'

### <b>```Apply Named entity recognition using spacy```</b>
 - spaCy's NER generally gives better results than NLTK's.

### 1.

In [42]:
# Load the small English pipeline; `nlp` is reused by every cell below.
nlp = spacy.load("en_core_web_sm")

# Show the names of the components that make up the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [43]:
# List the entity labels known to the pipeline's "ner" component.
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [44]:
# Example 1: run the pipeline on a single sentence. The text is kept exactly as
# originally written so the recorded entity output stays valid.
sent = nlp(
    "Mark Zukerberg will went Aditya Joshi in New York, USA on Monday 21, 2024 4pm for $3 Trillion deal."
)

In [45]:
# All entities detected in the sentence parsed above.
sent.ents

(Mark Zukerberg, Aditya Joshi, New York, USA, Monday 21, 2024 4, $3 Trillion)

In [46]:
# Print every detected entity with its label code (e.g. "GPE", "PERSON") and
# spaCy's human-readable description of that label.
for ent in sent.ents:
    label = ent.label_
    print(ent, '---->', label, ":::", spacy.explain(label))


Mark Zukerberg ----> PERSON ::: People, including fictional
Aditya Joshi ----> PERSON ::: People, including fictional
New York ----> GPE ::: Countries, cities, states
USA ----> GPE ::: Countries, cities, states
Monday 21, 2024 4 ----> DATE ::: Absolute or relative dates or periods
$3 Trillion ----> MONEY ::: Monetary values, including unit


### 2.

In [47]:
# Example 2: a short biographical sentence.
text = "Sachin Tendulkar was born in Mumbai, India on April 24, 1974."
sent = nlp(text)

# Per the original author's note: spaCy also recognizes the date here, which
# the NLTK approach in the previous notebook did not.
for ent in sent.ents:
    label = ent.label_  # entity type code, e.g. "GPE" or "PERSON"
    print(ent, '---->', label, ":::", spacy.explain(label))

Sachin Tendulkar ----> PERSON ::: People, including fictional
Mumbai ----> GPE ::: Countries, cities, states
India ----> GPE ::: Countries, cities, states
April 24, 1974 ----> DATE ::: Absolute or relative dates or periods


### 3.

In [48]:
# Example 3 input: a longer multi-sentence passage about Alaska, mixing place
# names, dates, ordinals and money amounts (the "[7]" citation marker is part of the text).
text = '''Indigenous people have lived in Alaska for thousands of years, and it is widely believed that the region served as the entry point for the initial settlement of North America by way of the Bering land bridge. The Russian Empire was the first to actively colonize the area beginning in the 18th century, eventually establishing Russian America, which spanned most of the current state and promoted and maintained a native Alaskan Creole population.[7] The expense and logistical difficulty of maintaining this distant possession prompted its sale to the U.S. in 1867 for US$7.2 million (equivalent to $157 million in 2023). The area went through several administrative changes before becoming organized as a territory on May 11, 1912. It was admitted as the 49th state of the U.S. on January 3, 1959.'''

In [49]:
# Parse the passage; `sent` is also what the displaCy cell further down renders.
sent = nlp(text)

# Print every detected entity with its label code (e.g. "GPE", "PERSON") and
# spaCy's human-readable description of that label.
for ent in sent.ents:
    label = ent.label_
    print(ent, '---->', label, ":::", spacy.explain(label))

Alaska ----> GPE ::: Countries, cities, states
thousands of years ----> DATE ::: Absolute or relative dates or periods
North America ----> LOC ::: Non-GPE locations, mountain ranges, bodies of water
The Russian Empire ----> GPE ::: Countries, cities, states
first ----> ORDINAL ::: "first", "second", etc.
the 18th century ----> DATE ::: Absolute or relative dates or periods
Russian America ----> LOC ::: Non-GPE locations, mountain ranges, bodies of water
Alaskan ----> NORP ::: Nationalities or religious or political groups
U.S. ----> GPE ::: Countries, cities, states
1867 ----> DATE ::: Absolute or relative dates or periods
US$7.2 million ----> MONEY ::: Monetary values, including unit
$157 million ----> MONEY ::: Monetary values, including unit
2023 ----> DATE ::: Absolute or relative dates or periods
May 11, 1912 ----> DATE ::: Absolute or relative dates or periods
49th ----> ORDINAL ::: "first", "second", etc.
U.S. ----> GPE ::: Countries, cities, states
January 3, 1959 ----> DATE ::

### <b>```Display the NER results in an interactive way```</b>

In [50]:
from spacy import displacy

In [51]:
# Render the most recently parsed text inline in the notebook, using displaCy's
# entity ("ent") visualizer.
displacy.render(sent, style="ent", jupyter=True)