In [1]:
# spaCy_ner_sentiment.py
# Requirements: spacy
# pip install spacy
# python -m spacy download en_core_web_sm

import spacy
# Load the en_core_web_sm pipeline once at module level; the same `nlp` object
# is reused by the NER loop and by extract_brand_product() further down.
nlp = spacy.load("en_core_web_sm")

# Example reviews (replace with your Amazon reviews dataset).
# Each entry is one free-text review string. The list is iterated twice in
# this script: once for spaCy NER and once for the rule-based brand/product
# extraction and sentiment pass.
reviews = [
    "I bought the Acme Coffee Maker last month. The Acme product works great—brews quickly. Highly recommend!",
    "The BrewMaster 2000 is a poor build. Brand: BrewMaster. Stopped working in two weeks. Terrible customer service.",
    "I love the sound quality of the SoundX headphones. SoundX has nailed the bass and comfort.",
    "The FitMax running shoe by FitCo is comfortable but the sizing is off. Good product but shipping was slow.",
    "FakeBrand's charger exploded after one day. Very dangerous. Not worth it."
]

# 1. Named Entity Recognition using spaCy
# Run the spaCy pipeline over every review and print the entities it found as
# (surface text, label) pairs, with a 60-dash rule after each review.
for review in reviews:
    doc = nlp(review)
    ents = [(span.text, span.label_) for span in doc.ents]
    print(f"REVIEW: {review}")
    print(f"ENTITIES: {ents}")
    print(60 * "-")

# 2. Rule-based extraction for product names & brands (fallback)
# Heuristic: regexes capture the name that follows a 'Brand:' label or the word 'by';
# independently, runs of consecutive proper nouns (PROPN) are collected as product-name candidates.
import re

# Regexes whose group 1 captures a brand name.
#  - "Brand: X" / "brand: X": an explicit label. The leading \b keeps longer
#    words that merely end in "brand" (e.g. "Subbrand:") from matching.
#  - "by X": X must start with an uppercase letter. The leading \b stops the
#    tail of words such as "nearby" or "Standby" from being read as "by".
brand_patterns = [r'\b[Bb]rand:\s*([A-Za-z0-9\-]+)', r'\bby\s+([A-Z][a-zA-Z0-9]+)']
# Lowercase cue words/phrases used by rule_sentiment() to score a review.
pos_word_list = {"love","great","excellent","good","recommend","perfect","amazing","fast","easy"}
neg_word_list = {"poor","terrible","bad","slow","exploded","dangerous","stopped","not worth","problem","disappointed"}

def extract_brand_product(text):
    """Extract brand names and product-name candidates from one review.

    Brands are taken from group 1 of every match of the module-level
    ``brand_patterns`` regexes. Product candidates are runs of consecutive
    tokens that the module-level spaCy pipeline ``nlp`` tags as ``PROPN``,
    joined with single spaces.

    Args:
        text: The review text to analyse.

    Returns:
        A ``(brands, prod_candidates)`` tuple of two lists of strings.
        ``brands`` contains each distinct captured brand once, ordered by
        pattern and then by position in the text. ``prod_candidates``
        contains the proper-noun runs in document order (duplicates kept).
    """
    from itertools import groupby  # local import: not imported at file level

    brands = []
    for pat in brand_patterns:
        # finditer rather than search, so a second "by X" / "Brand: X"
        # mention in the same review is not silently dropped.
        for match in re.finditer(pat, text):
            brand = match.group(1)
            # The same brand can be hit by several patterns; report it once.
            if brand not in brands:
                brands.append(brand)

    # Fallback: group adjacent tokens by "is proper noun" and keep the
    # proper-noun runs as multi-word product-name candidates.
    doc = nlp(text)
    prod_candidates = [
        " ".join(tok.text for tok in run)
        for is_propn, run in groupby(doc, key=lambda tok: tok.pos_ == "PROPN")
        if is_propn
    ]
    return brands, prod_candidates

# 3. Rule-based sentiment analysis (very simple)
def rule_sentiment(text, pos_words=None, neg_words=None):
    """Classify *text* as "positive", "negative" or "neutral" from cue words.

    Every lexicon entry (a single word, or a phrase such as "not worth") is
    counted at most once, and only when it occurs as a whole word/phrase.
    Plain substring tests would also fire on "breakfast" (fast), "glove"
    (love) or "badge" (bad), which skews the score.

    NOTE: negation is not modelled; "not good" still counts "good" as a
    positive cue unless the phrase itself is listed in the negative lexicon.

    Args:
        text: Review text. ``None`` or an empty string yields "neutral".
        pos_words: Optional iterable of positive cue words/phrases. Defaults
            to the module-level ``pos_word_list``.
        neg_words: Optional iterable of negative cue words/phrases. Defaults
            to the module-level ``neg_word_list``.

    Returns:
        "positive" if more positive than negative entries are present,
        "negative" for the opposite, and "neutral" on a tie.
    """
    if pos_words is None:
        pos_words = pos_word_list
    if neg_words is None:
        neg_words = neg_word_list

    text_lower = (text or "").lower()
    pos = _count_cue_hits(text_lower, pos_words)
    neg = _count_cue_hits(text_lower, neg_words)
    if pos > neg:
        return "positive"
    if neg > pos:
        return "negative"
    return "neutral"


def _count_cue_hits(text_lower, lexicon):
    """Return how many entries of *lexicon* occur in *text_lower* as whole words/phrases."""
    hits = 0
    for entry in lexicon:
        words = entry.lower().split()
        if not words:
            continue  # ignore empty / whitespace-only entries
        # Lookarounds instead of \b so entries that start or end with a
        # non-word character still anchor correctly; \s+ lets a phrase match
        # across any run of whitespace.
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(w) for w in words) + r"(?!\w)"
        if re.search(pattern, text_lower):
            hits += 1
    return hits

# For each example review, report the regex-derived brands, the proper-noun
# product candidates and the rule-based sentiment label, followed by a
# 60-dash separator.
for r in reviews:
    brands, prods = extract_brand_product(r)
    print(f"Review: {r}")
    print(f"Brands found (regex): {brands}")
    print(f"Proper-noun product candidates: {prods}")
    print(f"Sentiment (rule): {rule_sentiment(r)}")
    print(60 * "-")


REVIEW: I bought the Acme Coffee Maker last month. The Acme product works great—brews quickly. Highly recommend!
ENTITIES: [('last month', 'DATE'), ('Acme', 'ORG')]
------------------------------------------------------------
REVIEW: The BrewMaster 2000 is a poor build. Brand: BrewMaster. Stopped working in two weeks. Terrible customer service.
ENTITIES: [('2000', 'DATE'), ('BrewMaster', 'ORG'), ('two weeks', 'DATE')]
------------------------------------------------------------
REVIEW: I love the sound quality of the SoundX headphones. SoundX has nailed the bass and comfort.
ENTITIES: []
------------------------------------------------------------
REVIEW: The FitMax running shoe by FitCo is comfortable but the sizing is off. Good product but shipping was slow.
ENTITIES: [('FitMax', 'ORG'), ('FitCo', 'ORG')]
------------------------------------------------------------
REVIEW: FakeBrand's charger exploded after one day. Very dangerous. Not worth it.
ENTITIES: [('FakeBrand', 'ORG'), ('one