In [1]:
import pandas as pd
import nltk
from nltk import bigrams, trigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from collections import Counter
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [3]:
df = pd.read_csv('/kaggle/input/sar-preprocess/processed_reviews.csv')

In [4]:
stop_words = set(stopwords.words('english')).union({
    '', ' ', 'this', 'that', 'would', 'also', 'like', 'one', 'get', 'got', 'use', 'used',
    'make', 'made', 'even', 'though', 'first', 'second', 'time', 'times', 'several',
    'every', 'go', 'went', 'really', 'actually', 'still', 'always', 'never', 'ever',
    'much', 'many', 'little', 'bit', 'lot', 'lots', 'well', 'good', 'bad', 'new',
    'old', 'big', 'small', 'large', 'happy', 'overall', 'pretty', 'quite', 'fairly',
    'long', 'short', 'nice', 'super', 'quick', 'story', 'way', 'thing', 'report',
    'camper', 'bonus', 'surprise', 'job', 'addition', 'results', 'performance'
})
custom_stoplist = {
    'amazon', 'com', 'http', 'www', 'product', 'item', 'update', 'asin', 'edit',
    'review', 'reviews', 'buy', 'bought', 'purchase', 'purchased', 'order', 'ordered',
    'kindle', 'fire', 'blu', 'ray', 'ipad', 'mini', 'ipod', 'touch', 'macbook', 'pro',
    'nintendo', 'xl', 'tom', 'blackberry', 'playbook', 'netgear', 'router', 'western',
    'digital', 'best', 'e', 'mail', 'tcp', 'cc', 'cmts', 'mac', 'na', 'gon', 'ver',
    'star', 'stars', 'rating', 'point', 'years', 'ago', 'year', 'week', 'weeks',
    'month', 'months', 'day', 'days', 'third', 'party', 'guess', 'reason', 'provide',
    'pay', 'knock', 'ones', 'dvd', 'worth', 'looking', 'unit', 'case'
}
electronics_nouns = {
    'battery', 'screen', 'sound', 'speaker', 'quality', 'setup', 'installation',
    'cord', 'cable', 'charger', 'port', 'usb', 'hdmi', 'display', 'picture',
    'image', 'resolution', 'performance', 'speed', 'connectivity', 'wifi', 'bluetooth',
    'keyboard', 'mouse', 'trackpad', 'camera', 'lens', 'flash', 'memory', 'card',
    'storage', 'drive', 'disk', 'processor', 'graphics', 'fan', 'cooling', 'design',
    'build', 'material', 'case', 'headphones', 'earbuds', 'mic', 'microphone',
    'remote', 'control', 'button', 'touchscreen', 'software', 'firmware', 'interface',
    'router', 'keys', 'pixel', 'sensor', 'webcam', 'adapter', 'monitor', 'projector', 'printer'
}
positive_modifiers = {
    'great', 'excellent', 'awesome', 'fantastic', 'amazing', 'reliable', 'easy',
    'fast', 'clear', 'smooth', 'perfect', 'crisp', 'sharp', 'loud', 'comfortable',
    'durable', 'sturdy', 'seamless', 'intuitive', 'responsive', 'stable', 'quick'
}
negative_modifiers = {
    'poor', 'terrible', 'awful', 'horrible', 'slow', 'difficult', 'cheap', 'flimsy',
    'broken', 'faulty', 'unreliable', 'disappointing', 'weak', 'bad', 'low', 'short',
    'unresponsive', 'fragile', 'shoddy', 'defective', 'inconsistent', 'unstable'
}

In [5]:
def clean_text(text):
    # Remove URLs, special characters, and extra spaces
    text = re.sub(r'http\S+|www\S+|[^\w\s]', ' ', text.lower())
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [6]:
def tokenize(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words and t not in custom_stoplist]
    return tokens

In [7]:
df['cleaned_review'] = df['reviewText'].apply(clean_text)
df['tokens'] = df['cleaned_review'].apply(tokenize)

In [8]:
def extract_sentiment_phrases(tokens, sentiment):
    tagged = pos_tag(tokens)
    phrases = []
    for i in range(len(tagged) - 1):
        word1, tag1 = tagged[i]
        word2, tag2 = tagged[i + 1]
        # Adjective-noun or adverb-noun patterns
        if (tag1.startswith('JJ') or tag1.startswith('RB')) and tag2.startswith('NN'):
            if word2 in electronics_nouns and (
                (sentiment == 'Positive' and word1 in positive_modifiers) or
                (sentiment == 'Negative' and word1 in negative_modifiers)
            ):
                phrases.append(f"{word1} {word2}")

    # Include bigrams and trigrams with electronics nouns and sentiment modifiers
    for bg in bigrams(tokens):
        phrase = ' '.join(bg)
        if any(w in electronics_nouns for w in bg) and (
            (sentiment == 'Positive' and any(w in positive_modifiers for w in bg)) or
            (sentiment == 'Negative' and any(w in negative_modifiers for w in bg))
        ):
            phrases.append(phrase)

    for tg in trigrams(tokens):
        phrase = ' '.join(tg)
        if any(w in electronics_nouns for w in tg) and (
            (sentiment == 'Positive' and any(w in positive_modifiers for w in tg)) or
            (sentiment == 'Negative' and any(w in negative_modifiers for w in tg))
        ):
            phrases.append(phrase)

    return phrases        

In [9]:
positive_phrases = Counter()
negative_phrases = Counter()

for idx, row in df.iterrows():
    tokens = row['tokens']
    sentiment = row['sentiment']
    if sentiment == 'Positive':
        positive_phrases.update(extract_sentiment_phrases(tokens, 'Positive'))
    elif sentiment == 'Negative':
        negative_phrases.update(extract_sentiment_phrases(tokens, 'Negative'))

In [10]:
# Consolidate similar phrases
def consolidated_phrases(phrase_counts):
    phrases = list(phrase_counts.keys())
    if not phrases:
        return phrase_counts

    # Compute TF-IDF vectors for similarity
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(phrases)
        similarities = cosine_similarity(tfidf_matrix)
    except ValueError:
        print("Error")
        return phrase_counts

    # Group similar phrases (cosine similarity > 0.7)
    clusters = []
    used = set()
    for i, phrase in enumerate(phrases):
        if phrase in used:
            continue
        cluster = [phrase]
        used.add(phrase)
        for j, other_phrase in enumerate(phrases[i + 1:], start=i + 1):
            if other_phrase not in used and similarities[i][j] > 0.95:
                cluster.append(other_phrase)
                used.add(other_phrase)
        clusters.append(cluster) 

    # Select representative phrase (most frequent)
    consolidated = Counter()
    for cluster in clusters:
        if len(cluster) == 1:
            consolidated[cluster[0]] = phrase_counts[cluster[0]]
        else:
            counts = [(p, phrase_counts[p]) for p in cluster]
            representative = max(counts, key=lambda x: x[1])[0]
            consolidated[representative] = sum(c[1] for c in counts)

    return consolidated

positive_phrases = consolidated_phrases(positive_phrases)
negative_phrases = consolidated_phrases(negative_phrases)

In [11]:
def compute_pmi(phrase_counts, word_counts, total_count, min_freq=10):
    pmi_scores = {}
    for phrase, count in phrase_counts.items():
        if count < min_freq:
            continue
        words = phrase.split()
        if len(words) < 2:
            continue
        p_w1 = word_counts[words[0]] / total_count if words[0] in word_counts else 1e-6
        p_w2 = word_counts[words[1]] / total_count if words[1] in word_counts else 1e-6
        p_w1_w2 = count / total_count
        pmi = np.log2(p_w1_w2 / (p_w1 * p_w2)) if p_w1_w2 > 0 else 0
        # Weight for sentiment modifier presence
        modifier_score = 1.5 if any(w in positive_modifiers.union(negative_modifiers) for w in words) else 1.0
        # Boost for electronics nouns
        electronics_score = 2.0 if any(w in electronics_nouns for w in words) else 1.0
        pmi_scores[phrase] = pmi * count * modifier_score * electronics_score
    return pmi_scores

In [12]:
word_counts = Counter()
for tokens in df['tokens']:
    word_counts.update(tokens)
total_count = sum(word_counts.values())

positive_pmi = compute_pmi(positive_phrases, word_counts, total_count, min_freq=20)
negative_pmi = compute_pmi(negative_phrases, word_counts, total_count, min_freq=5)

In [13]:
def filter_phrases(pos_pmi, neg_pmi, min_pmi=3.0, max_phrases=100):
    # Remove phrases that appear in both lists
    pos_phrases = set(pos_pmi.keys())
    neg_phrases = set(neg_pmi.keys())
    common_phrases = pos_phrases.intersection(neg_phrases)
    filtered_pos = {k: v for k, v in pos_pmi.items() if k not in common_phrases}
    filtered_neg = {k: v for k, v in neg_pmi.items() if k not in common_phrases}
    
    # Additional filtering: high PMI, no numbers, no stoplist terms
    def is_valid_phrase(phrase, pmi):
        words = phrase.split()
        return (
            pmi > min_pmi and
            re.search(r'\d', phrase) is None and
            not any(w in custom_stoplist for w in words) and
            any(w in electronics_nouns for w in words) and
            any(w in positive_modifiers.union(negative_modifiers) for w in words)
        )

    filtered_pos = {k: v for k, v in filtered_pos.items() if is_valid_phrase(k, v)}
    filtered_neg = {k: v for k, v in filtered_neg.items() if is_valid_phrase(k, v)}

    return(
        sorted(filtered_pos.items(), key=lambda x: x[1], reverse=True)[:max_phrases],
        sorted(filtered_neg.items(), key=lambda x: x[1], reverse=True)[:max_phrases]
    )

top_positive, top_negative = filter_phrases(positive_pmi, negative_pmi)

In [14]:
top_negative

[('poor quality', 2141.346787766827),
 ('poor design', 1079.4512545667849),
 ('low quality', 458.03149624014543),
 ('sound horrible', 177.94120443894386),
 ('poor picture', 116.74217125815758),
 ('low battery', 109.40721407489202),
 ('quality horrible', 108.28263509740222),
 ('quality terrible', 106.14843083010946),
 ('poor battery', 100.26266011409649),
 ('sound terrible', 95.59878489173073),
 ('terrible microphone', 84.15697824319457),
 ('horrible software', 73.27776000669203),
 ('terrible interface', 72.34657431102693),
 ('cheap material', 70.80425268664722),
 ('cheap earbuds', 54.33131379513118),
 ('slow speed', 51.296727702397284),
 ('flimsy cord', 48.6064640978477),
 ('poor image', 44.47446607274586),
 ('poor image quality', 44.47446607274586),
 ('cheap quality', 42.30681235195836),
 ('sound weak', 38.47432121969938),
 ('cheap cord', 30.30886769803375),
 ('slow drive', 26.978696357575352),
 ('defective cable', 24.820365406168694),
 ('broken screen', 23.488033869459805),
 ('sound 

In [15]:
with open('/kaggle/working/key_phrases.txt', 'w') as f:
    f.write("Positive Phrases:\n")
    for phrase, score in top_positive:
        f.write(f"{phrase}: {score:.4f}\n")
    f.write("Negative Phrases:\n")
    for phrase, score in top_negative:
        f.write(f"{phrase}: {score:.4f}\n")