<a href="https://colab.research.google.com/github/kimanirobbi/wk-3-ai/blob/main/Task_3_NLP_(spaCy_NER_%26_Rule_based_Sentiment).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
import re
from typing import List, Dict, Any

# -----------------------------------------------------------
# 1. Setup and Data
# -----------------------------------------------------------
print("--- Task 3: NLP with spaCy (NER & Sentiment) ---")

# Load the small English model
# NOTE: User must run 'python -m spacy download en_core_web_sm' once before execution
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model 'en_core_web_sm' loaded successfully.")
except OSError:
    print("ERROR: spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    # Exit or use dummy data if model loading fails
    nlp = None

# Simulated user reviews data
amazon_reviews = [
    "The new Echo Dot is absolutely amazing. Great sound quality and seamless integration with my 'Amazon' smart home devices.",
    "I bought the 'Sony' WH-1000XM5 headphones. The noise cancellation is top-tier, but the price is too high.",
    "This budget laptop from 'Acer' overheats quickly. Terrible performance for gaming. I regret this purchase.",
    "The 'Samsung' QLED TV arrived damaged. Poor packaging. Will be returning it immediately.",
    "My 'Apple' iPhone 15 Pro Max is the best phone I've ever owned. The camera quality is fantastic!"
]

# -----------------------------------------------------------
# 2. Named Entity Recognition (NER)
# -----------------------------------------------------------

def extract_entities(text: str, nlp_pipeline) -> List[Dict[str, str]]:
    """Performs NER to extract entities that might represent product names or brands."""
    if not nlp_pipeline: return []
    doc = nlp_pipeline(text)
    entities = []

    # We look for common entity types that usually include product/brand names:
    # ORG (Organizations), PRODUCT (Products), GPE (Geo-Political Entities, sometimes includes brands)
    # The 'Amazon', 'Sony', 'Acer', 'Samsung', 'Apple' examples usually fall under ORG.
    for ent in doc.ents:
        if ent.label_ in ['ORG', 'PRODUCT', 'GPE']:
            entities.append({"text": ent.text, "label": ent.label_})

    return entities

print("\n--- Named Entity Recognition (NER) Results ---")
ner_results = []
for i, review in enumerate(amazon_reviews):
    entities = extract_entities(review, nlp)
    ner_results.append({"review_id": i + 1, "entities": entities})

# Displaying the first few results
for result in ner_results:
    print(f"Review {result['review_id']}:")
    if result['entities']:
        for ent in result['entities']:
            print(f"  -> Entity: '{ent['text']}', Type: {ent['label']}")
    else:
        print("  -> No product/brand entities found.")


# -----------------------------------------------------------
# 3. Rule-Based Sentiment Analysis
# -----------------------------------------------------------

def analyze_sentiment(text: str) -> str:
    """Analyzes sentiment using a simple rule-based approach based on keywords."""

    # Define sets of positive and negative keywords
    positive_words = {'amazing', 'great', 'seamless', 'top-tier', 'fantastic', 'best', 'excellent'}
    negative_words = {'overheats', 'terrible', 'regret', 'poor', 'damaged', 'too high', 'bad'}

    # Convert text to lowercase and tokenize (split into words)
    words = set(re.findall(r'\b\w+\b', text.lower()))

    # Count positive and negative keyword overlaps
    pos_count = len(words.intersection(positive_words))
    neg_count = len(words.intersection(negative_words))

    if pos_count > neg_count:
        return f"POSITIVE (Score: +{pos_count})"
    elif neg_count > pos_count:
        return f"NEGATIVE (Score: -{neg_count})"
    else:
        return f"NEUTRAL/MIXED (Pos: {pos_count}, Neg: {neg_count})"

print("\n--- Rule-Based Sentiment Analysis Results ---")
for i, review in enumerate(amazon_reviews):
    sentiment = analyze_sentiment(review)
    print(f"Review {i+1} | Sentiment: {sentiment}")
    print(f"  Review Text: {review[:50]}...")

--- Task 3: NLP with spaCy (NER & Sentiment) ---
spaCy model 'en_core_web_sm' loaded successfully.

--- Named Entity Recognition (NER) Results ---
Review 1:
  -> Entity: 'Echo Dot', Type: PRODUCT
  -> Entity: 'Amazon', Type: ORG
Review 2:
  -> Entity: 'Sony', Type: ORG
Review 3:
  -> Entity: 'Acer', Type: ORG
Review 4:
  -> No product/brand entities found.
Review 5:
  -> No product/brand entities found.

--- Rule-Based Sentiment Analysis Results ---
Review 1 | Sentiment: POSITIVE (Score: +3)
  Review Text: The new Echo Dot is absolutely amazing. Great soun...
Review 2 | Sentiment: NEUTRAL/MIXED (Pos: 0, Neg: 0)
  Review Text: I bought the 'Sony' WH-1000XM5 headphones. The noi...
Review 3 | Sentiment: NEGATIVE (Score: -3)
  Review Text: This budget laptop from 'Acer' overheats quickly. ...
Review 4 | Sentiment: NEGATIVE (Score: -2)
  Review Text: The 'Samsung' QLED TV arrived damaged. Poor packag...
Review 5 | Sentiment: POSITIVE (Score: +2)
  Review Text: My 'Apple' iPhone 15 Pro Max i