#  <CENTER>ASSIGNMENT SPACY</CENTER>

# IMPORTING SPACY

SPACY IS AN OPEN-SOURCE NATURAL LANGUAGE PROCESSING LIBRARY FOR PYTHON.LOADING THE ENGLISH LANGUAGE MODEL 'en_core_web_lg' USING SPACY.

In [1]:
#!python -m spacy download en_core_web_lg 
import spacy
nlp = spacy.load("en_core_web_lg")


# SMALL TEXT

TOKENIZATION AND PART-OF-SPEECH TAGGING PERFORMED ON THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY.

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end="| ")


My| best| friend| Ryan| Peters| likes| fancy| adventure| games| .| 

# ATTRIBUTES OF SPACY 

GENERATING A DATAFRAME VISUALIZATION OF TOKENIZED TEXT WITH LEMMATIZATION, PART-OF-SPEECH TAGGING, AND ENTITY RECOGNITION USING SPACY.

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# REMOVING STOP WORDS USING SPACY

FILTERING OUT STOP WORDS AND PUNCTUATION SYMBOLS FROM THE INPUT TEXT "DEAR RYAN, WE NEED TO SIT DOWN AND TALK. REGARDS, PETE" USING SPACY.

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


# FINDING NOUNS USING SPACY

EXTRACTING NOUNS AND PROPER NOUNS FROM THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY.

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)


[friend, Ryan, Peters, adventure, games]


# NAMED ENTITY RECOGNITION

IDENTIFYING AND PRINTING NAMED ENTITIES ALONG WITH THEIR LABELS IN THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY.

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(Ryan Peters, PERSON) 

# NAMED ENTITY RECOGNITION

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# VISUALIZING NMR

VISUALIZING THE NAMED ENTITIES IN THE INPUT TEXT "MY BEST FRIEND RYAN PETERS LIKES FANCY ADVENTURE GAMES" USING SPACY'S `displacy.render()` FUNCTION IN A JUPYTER NOTEBOOK ENVIRONMENT.

In [8]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)


# TOPIC -1
# CNN NEWS ARTICLE

THIS CODE DEFINES A FUNCTION `URL_TO_STRING` THAT EXTRACTS TEXT CONTENT FROM A GIVEN URL USING BEAUTIFULSOUP AND REGULAR EXPRESSIONS, EXCLUDING CERTAIN ELEMENTS LIKE SCRIPTS, STYLES, AND ASIDES. THEN IT RETRIEVES THE TEXT FROM THE NDTV NEWS ARTICLE URL PROVIDED, PROCESSES IT USING SPACY, AND COUNTS THE NUMBER OF NAMED ENTITIES FOUND IN THE ARTICLE.

In [19]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string("https://www.cnn.com/world/egypt-new-administrative-capital-spc-intl/index.html")
article = nlp(ny_bb)
len(article.ents)


243

# NERS

In [20]:
displacy.render(article, style='ent', jupyter=True)



# NER TYPE

In [21]:
from collections import Counter

# Get labels of named entities
labels = [x.label_ for x in article.ents]

# Count occurrences of each label
label_counts = Counter(labels)

# Print label counts
print(label_counts)


Counter({'ORG': 54, 'GPE': 46, 'PERSON': 25, 'CARDINAL': 22, 'DATE': 19, 'LOC': 18, 'EVENT': 16, 'WORK_OF_ART': 12, 'QUANTITY': 9, 'MONEY': 9, 'NORP': 7, 'PERCENT': 2, 'ORDINAL': 2, 'TIME': 1, 'FAC': 1})


# MOST POPULAR NER OF DATASET

In [22]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('CNN', 19), ('Cairo', 15), ('Egypt', 10), ('Abbas', 7), ('ACUD', 6)]

# SENTENCE TO ANALYSE

In [23]:
sentences = [x for x in article.sents]
print(sentences[0])


                              Egypt’s New Administrative Capital: A new city is rising, but is it what the country needs?


# NER TAGS

RENDERS THE NAMED ENTITIES IN THE FIRST SENTENCE PARSED BY SPACY USING THE 'ENT' STYLE IN A JUPYTER NOTEBOOK.

In [24]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')


# TYPE OF WORDS IN SENTENCE

GENERATES A LIST COMPREHENSION THAT ITERATES OVER TOKENS IN THE FIRST SENTENCE PARSED BY SPACY, EXCLUDING STOP WORDS AND PUNCTUATION, AND EXTRACTS THEIR ORTHOGRAPHIC FORMS, PARTS OF SPEECH, AND LEMMATIZED FORMS, RETURNING THEM AS TUPLES.

In [25]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('                              ', 'SPACE', '                              '),
 ('Egypt', 'PROPN', 'Egypt'),
 ('New', 'PROPN', 'New'),
 ('Administrative', 'PROPN', 'Administrative'),
 ('Capital', 'PROPN', 'Capital'),
 ('new', 'ADJ', 'new'),
 ('city', 'NOUN', 'city'),
 ('rising', 'VERB', 'rise'),
 ('country', 'NOUN', 'country'),
 ('needs', 'VERB', 'need')]

# SENTENCE DEPENDENCY TREE

VISUALIZES THE DEPENDENCY TREE OF A SENTENCE USING SPACY'S 'DISPLACY' RENDERER IN A JUPYTER NOTEBOOK WITH CUSTOMIZED DISTANCE BETWEEN NODES.

In [26]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})


# TOPIC 2
# CHINESE NEWS ARTICLE

In [33]:
import spacy

# Load the Chinese language model
nlp = spacy.load("zh_core_web_lg")

from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# URL of the CNN article
cnn_article_url = "https://www.bjnews.com.cn/detail/1710951770168244.html"

# Get the text content from the URL
article_text = url_to_string(cnn_article_url)

# Process the text using the Chinese language model
article = nlp(article_text)

In [34]:
# Count the number of named entities in the article
num_entities = len(article.ents)
print("Number of named entities in the article:", num_entities)

Number of named entities in the article: 215


In [35]:
displacy.render(article, style='ent', jupyter=True)



In [36]:
from collections import Counter

# Get labels of named entities
labels = [x.label_ for x in article.ents]

# Count occurrences of each label
label_counts = Counter(labels)

# Print label counts
print(label_counts)


Counter({'CARDINAL': 52, 'GPE': 51, 'DATE': 41, 'ORG': 31, 'PERSON': 18, 'LAW': 5, 'PERCENT': 3, 'MONEY': 3, 'WORK_OF_ART': 2, 'ORDINAL': 2, 'TIME': 2, 'QUANTITY': 2, 'FAC': 1, 'LOC': 1, 'NORP': 1})


In [37]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('石家庄', 22), ('3', 12), ('惠州', 11), ('LPR', 7), ('新京报', 5)]

In [40]:
sentences = [x for x in article.sents]
print(sentences[0])


﻿          新京报 - 好新闻，无止境                                                                                                                                                   新京号                                  电子报                                  千龙网                                  贝壳财经                                  北京BEIJING                                                   新京雅集                                   专题                                  商务合作                                  爱心模式                                                                                                                                                                                                                                                               客户端                                                                                                                                                                                     微博                                                              

In [41]:
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')


In [42]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[0])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('\ufeff', 'NOUN', ''),
 ('         ', 'SPACE', ''),
 ('新京报', 'PROPN', ''),
 ('新闻', 'NOUN', ''),
 ('无止', 'ADJ', ''),
 ('境', 'NOUN', ''),
 ('                                                                                                                                                  ',
  'SPACE',
  ''),
 ('新京号', 'PROPN', ''),
 ('                                 ', 'SPACE', ''),
 ('电子报', 'NOUN', ''),
 ('                                 ', 'SPACE', ''),
 ('千龙网', 'PROPN', ''),
 ('                                 ', 'SPACE', ''),
 ('贝壳', 'NOUN', ''),
 ('财经', 'NOUN', ''),
 ('                                 ', 'SPACE', ''),
 ('北京', 'PROPN', ''),
 ('BEI', 'NOUN', ''),
 ('JIN', 'PROPN', ''),
 ('G', 'NOUN', ''),
 ('                                                  ', 'SPACE', ''),
 ('新京', 'PROPN', ''),
 ('雅集', 'NOUN', ''),
 ('                                  ', 'SPACE', ''),
 ('专题', 'NOUN', ''),
 ('                                 ', 'SPACE', ''),
 ('商务', 'NOUN', ''),
 ('合作', 'NOUN', ''),


In [43]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})


# TOPIC 3
# CNN NEWS ON POLITICS

In [45]:

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string("https://www.cnn.com/2024/03/10/politics/us-not-anticipating-israel-rafah-ramadan/index.html?iid=cnn_buildContentRecirc_end_recirc")
article = nlp(ny_bb)
len(article.ents)


156

In [46]:
displacy.render(article, style='ent', jupyter=True)


In [47]:
labels = [x.label_ for x in article.ents]
Counter(labels)


Counter({'ORG': 72,
         'PERSON': 57,
         'GPE': 10,
         'CARDINAL': 5,
         'DATE': 5,
         'FAC': 4,
         'TIME': 2,
         'QUANTITY': 1})

In [48]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('CNN', 28),
 ('Rafah', 11),
 ('sraeli', 8),
 ('anticipating', 3),
 ('Ramadan', 3)]

In [49]:
sentences = [x for x in article.sents]
print(sentences[20])


outlet Bild that the operation would not last more than two months but did not provide specifics on the timeline.                       


In [50]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')


In [51]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[20])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('outlet', 'ADJ', ''),
 ('Bild', 'PROPN', ''),
 ('that', 'PRON', ''),
 ('the', 'DET', ''),
 ('operation', 'VERB', ''),
 ('would', 'ADP', ''),
 ('not', 'ADP', ''),
 ('last', 'PROPN', ''),
 ('more', 'ADJ', ''),
 ('than', 'ADJ', ''),
 ('two', 'NOUN', ''),
 ('months', 'NOUN', ''),
 ('but', 'CCONJ', ''),
 ('did', 'PROPN', ''),
 ('not', 'ADP', ''),
 ('provide', 'VERB', ''),
 ('specifics', 'VERB', ''),
 ('on', 'ADP', ''),
 ('the', 'DET', ''),
 ('timeline', 'NOUN', ''),
 ('                      ', 'SPACE', '')]

In [52]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})
