In [None]:

import os, sys
from google.colab import drive
# Project folder on the mounted Drive; used both as the working directory and
# as an extra import location below.
NLP_DIR = "/content/drive/My Drive/Colab Notebooks/NLP"

# Mount Google Drive so the folder above becomes reachable.
drive.mount('/content/drive')
os.chdir(NLP_DIR)
# Guard the append so re-running this cell does not pile up duplicate entries.
if NLP_DIR not in sys.path:
    sys.path.append(NLP_DIR)

Mounted at /content/drive


In [None]:
import nltk
# Download the NLTK resources into the local nltk_data directory.
# Tokenizer models (unpacked under tokenizers/punkt).
nltk.download('punkt')
# Named-entity chunker model (unpacked under chunkers/maxent_ne_chunker).
nltk.download('maxent_ne_chunker')
# Word-list corpus (unpacked under corpora/words).
nltk.download('words')
# POS tagger model (unpacked under taggers/averaged_perceptron_tagger).
# This call is the cell's last expression, so its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import requests

def fetch_news_article(api_key):
    """Fetch the first US top headline from NewsAPI as one text string.

    Args:
        api_key: NewsAPI key sent with the request.

    Returns:
        The first article's title and description joined by a single space
        (a missing or null field contributes an empty string), or ``None``
        when the response carries no articles.

    Raises:
        requests.exceptions.RequestException: If the request times out, the
            connection fails, or the server answers with an HTTP error status.
    """
    url = "https://newsapi.org/v2/top-headlines"
    # Let requests build the query string so the key is URL-encoded correctly.
    params = {"country": "us", "apiKey": api_key}
    # requests applies no timeout by default; without one a stalled connection
    # would block the cell indefinitely.
    response = requests.get(url, params=params, timeout=10)
    # Surface HTTP errors (bad key, rate limit, ...) instead of silently
    # reporting them as "no articles".
    response.raise_for_status()

    articles = response.json().get('articles') or []
    if not articles:
        return None

    first = articles[0]
    # `or ""` also covers fields that are present but null in the payload.
    title = first.get('title') or ""
    description = first.get('description') or ""
    return title + " " + description

import os
from getpass import getpass

# Keep the NewsAPI key out of the notebook: read it from the NEWSAPI_KEY
# environment variable, falling back to an interactive, non-echoing prompt.
api_key = os.environ.get('NEWSAPI_KEY') or getpass('NewsAPI key: ')
article = fetch_news_article(api_key)
print(article)


Trump congratulates Putin over deal that brought Evan Gershkovich home - The Washington Post 


In [None]:

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

def nltk_named_entity_recognition(text):
    """Extract named entities from ``text`` via NLTK tokenizing, tagging and chunking.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in the order the
        chunker yields them; the tokens of an entity are joined by single spaces.
    """
    chunk_tree = ne_chunk(pos_tag(word_tokenize(text)))
    # Children of the chunk tree that are Tree instances are kept as entities;
    # every other child is skipped.
    return [
        (" ".join(tok[0] for tok in node.leaves()), node.label())
        for node in chunk_tree
        if isinstance(node, nltk.Tree)
    ]

# Run the NLTK extractor on the fetched article text and print the resulting
# (entity text, label) pairs.
nltk_entities = nltk_named_entity_recognition(article)
print(nltk_entities)


[('Trump', 'GPE'), ('Putin', 'PERSON'), ('Evan Gershkovich', 'PERSON'), ('Washington Post', 'ORGANIZATION')]


In [None]:

import spacy

# Loaded spaCy pipelines keyed by model name, so the model is read once
# rather than on every call.
_SPACY_PIPELINES = {}


def spacy_named_entity_recognition(text, model_name='en_core_web_sm'):
    """Extract named entities from ``text`` with a spaCy pipeline.

    Args:
        text: Raw text to analyse.
        model_name: Name of the spaCy model to load. Defaults to
            ``'en_core_web_sm'``, the model this function always used before
            the parameter existed.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry of
        ``doc.ents``, in document order.
    """
    nlp = _SPACY_PIPELINES.get(model_name)
    if nlp is None:
        # First use of this model: load it and remember the pipeline.
        nlp = _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Run the spaCy extractor on the same article text and print the resulting
# (entity text, label) pairs.
spacy_entities = spacy_named_entity_recognition(article)
print(spacy_entities)


[('Trump', 'ORG'), ('Putin', 'PERSON'), ('Evan Gershkovich', 'PERSON'), ('The Washington Post', 'ORG')]


In [None]:

def compare_entities(nltk_entities, spacy_entities, label_map=None):
    """Compare two collections of ``(entity_text, label)`` pairs as sets.

    Args:
        nltk_entities: Iterable of ``(text, label)`` tuples from the NLTK extractor.
        spacy_entities: Iterable of ``(text, label)`` tuples from the spaCy extractor.
        label_map: Optional mapping applied to the labels of both inputs before
            comparing, e.g. ``{'ORGANIZATION': 'ORG'}``. Labels absent from the
            mapping are kept as they are. The two libraries name their labels
            differently, so without a mapping an entity both sides classify as
            an organisation can never appear in the intersection. With the
            default ``None`` the pairs are compared exactly as given.

    Returns:
        A dict with three sets of ``(text, label)`` tuples:
        ``"common_entities"``, ``"unique_nltk_entities"`` and
        ``"unique_spacy_entities"``.
    """
    def _as_set(entities):
        if not label_map:
            return set(entities)
        return {(text, label_map.get(label, label)) for text, label in entities}

    nltk_set = _as_set(nltk_entities)
    spacy_set = _as_set(spacy_entities)

    return {
        "common_entities": nltk_set & spacy_set,
        "unique_nltk_entities": nltk_set - spacy_set,
        "unique_spacy_entities": spacy_set - nltk_set,
    }

# Compare the two extractors' (text, label) pairs and print the shared set
# followed by the pairs found by only one of them.
comparison = compare_entities(nltk_entities, spacy_entities)
print("Common Entities:", comparison["common_entities"])
print("Unique NLTK Entities:", comparison["unique_nltk_entities"])
print("Unique SpaCy Entities:", comparison["unique_spacy_entities"])


Common Entities: {('Evan Gershkovich', 'PERSON'), ('Putin', 'PERSON')}
Unique NLTK Entities: {('Washington Post', 'ORGANIZATION'), ('Trump', 'GPE')}
Unique SpaCy Entities: {('Trump', 'ORG'), ('The Washington Post', 'ORG')}


In [None]:

def summarize_results(article, comparison):
    """Render the article text and the entity comparison as a plain-text report.

    Args:
        article: The article text shown on the "News Article" line.
        comparison: Dict providing the keys ``'common_entities'``,
            ``'unique_nltk_entities'`` and ``'unique_spacy_entities'``.

    Returns:
        A multi-line string: the article, the three entity collections, then a
        fixed block of observations and conclusions that does not depend on
        the inputs.
    """
    shared = comparison['common_entities']
    nltk_only = comparison['unique_nltk_entities']
    spacy_only = comparison['unique_spacy_entities']

    return f"""
    News Article: {article}

    Comparison of Named Entity Recognition:

    Common Entities:
    {shared}

    Unique Entities (NLTK):
    {nltk_only}

    Unique Entities (SpaCy):
    {spacy_only}

    Observations:
    - SpaCy, being a machine learning-based approach, tends to identify entities more accurately and can recognize a wider range of entity types.
    - NLTK, being a rule-based approach, might miss some entities or classify them incorrectly, especially in complex or ambiguous cases.
    - SpaCy can be more robust in recognizing entities in different contexts, while NLTK may require more fine-tuning for different types of texts.

    Conclusions:
    - The choice between SpaCy and NLTK for named entity recognition depends on the specific requirements of the task. For general purposes and higher accuracy, SpaCy is recommended. For simple tasks or when rule-based precision is needed, NLTK can be used.
    """

# Build the plain-text report from the article and the comparison, then print it.
summary = summarize_results(article, comparison)
print(summary)


    News Article: Trump congratulates Putin over deal that brought Evan Gershkovich home - The Washington Post 

    Comparison of Named Entity Recognition:

    Common Entities:
    {('Evan Gershkovich', 'PERSON'), ('Putin', 'PERSON')}

    Unique Entities (NLTK):
    {('Washington Post', 'ORGANIZATION'), ('Trump', 'GPE')}

    Unique Entities (SpaCy):
    {('Trump', 'ORG'), ('The Washington Post', 'ORG')}

    Observations:
    - SpaCy, being a machine learning-based approach, tends to identify entities more accurately and can recognize a wider range of entity types.
    - NLTK, being a rule-based approach, might miss some entities or classify them incorrectly, especially in complex or ambiguous cases.
    - SpaCy can be more robust in recognizing entities in different contexts, while NLTK may require more fine-tuning for different types of texts.

    Conclusions:
    - The choice between SpaCy and NLTK for named entity recognition depends on the specific requirements of the task. 