#  <CENTER>CLASSWORK SPACY</CENTER>

# IMPORTING SPACY

SPACY IS AN OPEN-SOURCE NATURAL LANGUAGE PROCESSING LIBRARY FOR PYTHON.LOADING THE ENGLISH LANGUAGE MODEL 'en_core_web_lg' USING SPACY.

In [1]:
#!python -m spacy download en_core_web_lg 
import spacy
nlp = spacy.load("en_core_web_lg")


# SMALL TEXT

TOKENIZATION AND PART-OF-SPEECH TAGGING PERFORMED ON THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY.

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end="| ")


My| best| friend| Ryan| Peters| likes| fancy| adventure| games| .| 

# ATTRIBUTES OF SPACY 

GENERATING A DATAFRAME VISUALIZATION OF TOKENIZED TEXT WITH LEMMATIZATION, PART-OF-SPEECH TAGGING, AND ENTITY RECOGNITION USING SPACY.

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# REMOVING STOP WORDS USIN SPACY

FILTERING OUT STOP WORDS AND PUNCTUATION SYMBOLS FROM THE INPUT TEXT "DEAR RYAN, WE NEED TO SIT DOWN AND TALK. REGARDS, PETE" USING SPACY.

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


# FINDING NOUNS USING SPACY

EXTRACTING NOUNS AND PROPER NOUNS FROM THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY.

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)


[friend, Ryan, Peters, adventure, games]


# NAMED ENTITY RECOGNITION

IDENTIFYING AND PRINTING NAMED ENTITIES ALONG WITH THEIR LABELS IN THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY.

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(Ryan Peters, PERSON) 

# NAMED ENTITY RECOGNITION

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# VISUALIZING NMR

VISUALIZING THE NAMED ENTITIES IN THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY'S `displacy.render()` FUNCTION IN A JUPYTER NOTEBOOK ENVIRONMENT.

In [8]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)


# TRYING ON DATASET

THIS CODE DEFINES A FUNCTION `URL_TO_STRING` THAT EXTRACTS TEXT CONTENT FROM A GIVEN URL USING BEAUTIFULSOUP AND REGULAR EXPRESSIONS, EXCLUDING CERTAIN ELEMENTS LIKE SCRIPTS, STYLES, AND ASIDES. THEN IT RETRIEVES THE TEXT FROM THE NDTV NEWS ARTICLE URL PROVIDED, PROCESSES IT USING SPACY, AND COUNTS THE NUMBER OF NAMED ENTITIES FOUND IN THE ARTICLE.

In [9]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string("https://www.ndtv.com/india-news/sadhguru-undergoes-surgery-for-chronic-brain-bleed-at-delhi-indraprastha-apollo-hospitals-5276777#trendingnow")
article = nlp(ny_bb)
len(article.ents)


62

# NERS

In [10]:
displacy.render(article, style='ent', jupyter=True)



# NER TYPE

In [11]:
from collections import Counter

# Get labels of named entities
labels = [x.label_ for x in article.ents]

# Count occurrences of each label
label_counts = Counter(labels)

# Print label counts
print(label_counts)


Counter({'ORG': 21, 'GPE': 13, 'PERSON': 13, 'DATE': 6, 'TIME': 4, 'CARDINAL': 3, 'FAC': 1, 'LAW': 1})


# MOST POPULAR NER OF DATASET

In [12]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('Sadhguru', 7),
 ('Delhi', 5),
 ('the last four weeks', 2),
 ('Isha Foundation', 2),
 ('ScheduleTrainsSadhguru', 1)]

# SENTENCE TO ANALYSE

In [13]:
sentences = [x for x in article.sents]
print(sentences[0])


Sadhguru Undergoes Surgery For Chronic Brain Bleed


# NER TAGS

RENDERS THE NAMED ENTITIES IN THE FIRST SENTENCE PARSED BY SPACY USING THE 'ENT' STYLE IN A JUPYTER NOTEBOOK.

In [14]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')


# TYPE OF WORDS IN SENTENCE

GENERATES A LIST COMPREHENSION THAT ITERATES OVER TOKENS IN THE FIRST SENTENCE PARSED BY SPACY, EXCLUDING STOP WORDS AND PUNCTUATION, AND EXTRACTS THEIR ORTHOGRAPHIC FORMS, PARTS OF SPEECH, AND LEMMATIZED FORMS, RETURNING THEM AS TUPLES.

In [15]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('Sadhguru', 'PROPN', 'Sadhguru'),
 ('Undergoes', 'PROPN', 'Undergoes'),
 ('Surgery', 'PROPN', 'Surgery'),
 ('Chronic', 'PROPN', 'Chronic'),
 ('Brain', 'PROPN', 'Brain'),
 ('Bleed', 'PROPN', 'Bleed')]

# SENTENCE DEPENDENCY TREE

VISUALIZES THE DEPENDENCY TREE OF A SENTENCE USING SPACY'S 'DISPLACY' RENDERER IN A JUPYTER NOTEBOOK WITH CUSTOMIZED DISTANCE BETWEEN NODES.

In [16]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})
