<a href="https://colab.research.google.com/github/krishnapriyababu07/nlp/blob/main/BBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import requests

def fetch_news_article(api_key):
    url = ('https://newsapi.org/v2/top-headlines?'
           'country=us&'
           'apiKey={}'.format(api_key))
    response = requests.get(url)
    data = response.json()
    if data['status'] == 'ok':
        # Take the first article for simplicity
        article = data['articles'][0]
        title = article['title']
        content = article['content']
        return title, content
    else:
        raise Exception('Failed to fetch news articles')

api_key = 'f29e3591bd0f4a6f815d0a110131649a'
title, content = fetch_news_article(api_key)
print("Title:", title)
print("Content:", content)


Title: Perspective | Gross and embarrassing — teen girls’ misconceptions about their periods - The Washington Post
Content: When a 15-year-old girl I see in psychotherapy ended up in the ER with second-degree burns, she was embarrassed to tell me why. Eventually she disclosed having crouched over boiling herb-infused wate… [+132 chars]


In [43]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_entities_nltk(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    tree = ne_chunk(pos_tags, binary=False)
    named_entities = []

    for subtree in tree:
        if isinstance(subtree, nltk.Tree):
            entity_name = " ".join([word for word, pos in subtree.leaves()])
            entity_type = subtree.label()
            named_entities.append((entity_name, entity_type))

    return named_entities

entities_nltk = extract_entities_nltk(content)
print("Entities extracted by NLTK:")
for entity in entities_nltk:
    print(entity)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Entities extracted by NLTK:
('ER', 'ORGANIZATION')


In [44]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_entities_spacy(text):
    doc = nlp(text)
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    return named_entities

entities_spacy = extract_entities_spacy(content)
print("Entities extracted by spaCy:")
for entity in entities_spacy:
    print(entity)


Entities extracted by spaCy:
('15-year-old', 'DATE')
('ER', 'ORG')
('second', 'ORDINAL')


In [45]:
def compare_entities(nltk_entities, spacy_entities):
    nltk_set = set(nltk_entities)
    spacy_set = set(spacy_entities)

    print("\nEntities found by both NLTK and spaCy:")
    for entity in nltk_set & spacy_set:
        print(entity)

    print("\nEntities found only by NLTK:")
    for entity in nltk_set - spacy_set:
        print(entity)

    print("\nEntities found only by spaCy:")
    for entity in spacy_set - nltk_set:
        print(entity)

compare_entities(entities_nltk, entities_spacy)



Entities found by both NLTK and spaCy:

Entities found only by NLTK:
('ER', 'ORGANIZATION')

Entities found only by spaCy:
('second', 'ORDINAL')
('15-year-old', 'DATE')
('ER', 'ORG')
