In [91]:
import pandas as pd
import numpy as np
import spacy
from transformers import pipeline, BertTokenizer
import torch

df = pd.read_csv("Filtered Reviews2.csv")
nlp = spacy.load("en_core_web_sm")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [92]:
low_conf = {}

# Function to analyze sentiment and compare with rating
def compare_rating_and_sentiment(review, rating):
    # Analyze review sentiment
    sentiment_result = sentiment_pipeline(review[:512])  # Truncate long reviews
    sentiment = sentiment_result[0]['label']  # POSITIVE or NEGATIVE
    if sentiment_result[0]['score'] < 0.8: 
        low_conf["low confidence count"] = low_conf.get("low confidence count", 0) + 1
        
    # Compare with rating (4*/5* are positive, 1*/2* are negative) and return if inconsistent
    if sentiment == "POSITIVE" and rating < 3:
        return sentiment
    elif sentiment == "NEGATIVE" and rating > 3:
        return sentiment
    return None  # Return None if there's no inconsistency

# Apply the function to each review in the DataFrame
df['Inconsistent Review'] = df.apply(lambda row: compare_rating_and_sentiment(row['Review'], row['Rating']), axis=1)

# Filter out rows where the result is None (i.e., no inconsistency)
inconsistent_reviews = df[df['Inconsistent Review'].notna()]

print(low_conf)

print(len(inconsistent_reviews))
print(len(df))

{'low confidence count': 31}
81
623


In [93]:
pros_dict = {}
cons_dict = {}

def analyze_review(review):    
    # Analyze review sentiment
    sentiment_result = sentiment_pipeline(review[:512])
    sentiment = sentiment_result[0]['label']
    confidence = sentiment_result[0]['score']
    
    # Extract noun chunks (possible pros/cons)
    doc = nlp(review)
    excluded_words = ['i', 'my', 'you', 'they', 'it', 'these', 'them', 'we', 'me', 'that', 
                      'this', 'she', 'he', 'him', 'his', 'her', 'amazon','what', 'who', 'which', 
                      'us', 'all', 'something']
    aspects = [chunk.text.lower() for chunk in doc.noun_chunks if not any(token.pos_ == "PRP" for token in chunk) and
                                                                  not any(token.text.lower() in excluded_words for token in chunk)]
    if sentiment == "POSITIVE":
        for aspect in aspects:
            pros_dict[aspect] = pros_dict.get(aspect, 0) + 1
    elif sentiment == "NEGATIVE":
        for aspect in aspects:
            cons_dict[aspect] = cons_dict.get(aspect, 0) + 1
    
    return sentiment, confidence, aspects

# Apply the function to each review
df[['Sentiment', 'Confidence', 'Extracted Aspects']] = df['Review'].apply(lambda review: pd.Series(analyze_review(review)))

In [94]:
print("Top 5 Pros:", dict(sorted(pros_dict.items(), key=lambda item: item[1], reverse=True)[:10]))
print("Top 5 Cons:", dict(sorted(cons_dict.items(), key=lambda item: item[1], reverse=True)[:10]))
print("Top 5 Pros:", dict(sorted(pros_dict.items(), key=lambda item: item[1], reverse=True)[1:10]))
print("Top 5 Cons:", dict(sorted(cons_dict.items(), key=lambda item: item[1], reverse=True)[1:10]))

Top 5 Pros: {'coconut oil': 196, 'cooking': 50, 'skin': 45, 'the product': 42, 'the coconut oil': 42, 'the smell': 41, 'coconut': 40, 'hair': 38, 'the oil': 37, 'the jar': 35}
Top 5 Cons: {'coconut oil': 54, 'the coconut oil': 27, 'the oil': 22, 'hair': 16, 'your hair': 13, 'the product': 11, 'cooking': 11, 'the jar': 10, 'skin': 10, 'some': 10}
Top 5 Pros: {'cooking': 50, 'skin': 45, 'the product': 42, 'the coconut oil': 42, 'the smell': 41, 'coconut': 40, 'hair': 38, 'the oil': 37, 'the jar': 35}
Top 5 Cons: {'the coconut oil': 27, 'the oil': 22, 'hair': 16, 'your hair': 13, 'the product': 11, 'cooking': 11, 'the jar': 10, 'skin': 10, 'some': 10}


In [None]:
# Test

def classify_aspect_sentiment(aspect, review):
    sentiment_result = sentiment_pipeline(f"{aspect} {review}")  # Combine aspect with review context for sentiment analysis
    sentiment = sentiment_result[0]['label']
    return sentiment

def extract_aspects(review):
    doc = nlp(review)
    return [chunk.text.lower() for chunk in doc.noun_chunks]

reviews = [
    "The battery life of this phone is amazing. It lasts all day!",
    "The camera quality is terrible, I am very disappointed.",
    "I love the design of the phone, but it’s a bit heavy.",
    "The screen resolution is beautiful, but the battery drains quickly.",
]

p_dict = {}
c_dict = {}

for review in reviews:
    aspects = extract_aspects(review)  # Extract aspects from the review
    for aspect in aspects:
        sentiment = classify_aspect_sentiment(aspect, review)  # Classify sentiment for each aspect

    if sentiment == "POSITIVE":
        for aspect in aspects:
            p_dict[aspect] = pros_dict.get(aspect, 0) + 1
    elif sentiment == "NEGATIVE":
        for aspect in aspects:
            c_dict[aspect] = cons_dict.get(aspect, 0) + 1

print("Top 5 Pros:", dict(sorted(p_dict.items(), key=lambda item: item[1], reverse=True)[:10]))
print("Top 5 Cons:", dict(sorted(c_dict.items(), key=lambda item: item[1], reverse=True)[:10]))
print("Top 5 Pros:", dict(sorted(p_dict.items(), key=lambda item: item[1], reverse=True)[1:10]))
print("Top 5 Cons:", dict(sorted(c_dict.items(), key=lambda item: item[1], reverse=True)[1:10]))

Top 5 Pros: {'the battery life': 1, 'this phone': 1, 'it': 1}
Top 5 Cons: {'the camera quality': 1, 'i': 1, 'the design': 1, 'the phone': 1, 'it': 1, 'the screen resolution': 1, 'the battery': 1}
Top 5 Pros: {'this phone': 1, 'it': 1}
Top 5 Cons: {'i': 1, 'the design': 1, 'the phone': 1, 'it': 1, 'the screen resolution': 1, 'the battery': 1}


In [None]:
print(df[['Sentiment', 'Extracted Aspects']])