## Pros & Cons

#### Read & Clean data

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tabulate import tabulate

# Fetch the NLTK resources used below: the stop-word corpus and the VADER lexicon
for nltk_resource in ('stopwords', 'vader_lexicon'):
    nltk.download(nltk_resource)

# Read the review dataset into a DataFrame
df = pd.read_csv('2-dataset_7(senti)_vader.csv')

# Extra words to filter out on top of NLTK's English stop-word list
additional_stopwords = {
    'also', 'would', 'could', 'like', 'make', 'sure', 'go',
    'get', 'got', 'recommend', 'highly', 'review', 'want', 'hard',
}

# Final stop-word set: NLTK's English list merged with the extras above
stop_words = set(stopwords.words('english')) | additional_stopwords

def preprocess(text, stopword_set=None):
    """Normalize one review for bigram extraction.

    Strips punctuation and digits, lowercases the text, and drops stop words.

    Args:
        text: Raw review text. Any non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as an empty
            review instead of raising ``TypeError`` inside ``re.sub``.
        stopword_set: Optional collection of lowercase words to remove.
            When omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces; ``''`` when nothing
        is left or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Remove punctuation, numbers, and convert to lowercase.
    # NOTE(review): apostrophes are stripped before the stop-word comparison,
    # so a contraction such as "don't" is compared as "dont" - confirm this
    # is the intended matching against the stop-word list.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text).lower()

    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Build the cleaned-text column by running every review through preprocess
df['cleaned_review'] = df['Review'].map(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: './review_data/dataset_7.csv'

#### Analyze the data

In [2]:
# Single VADER analyzer instance, created once and reused for every scored phrase
sid = SentimentIntensityAnalyzer()

def get_sentiment(bigram):
    """Return the 'compound' entry of the analyzer's polarity scores for `bigram`."""
    scores = sid.polarity_scores(bigram)
    return scores['compound']

def analyze_reviews(reviews, top_n=5, min_df=2):
    """Find the most frequent positive and negative bigrams in a set of reviews.

    Args:
        reviews: Iterable of cleaned review strings (one per document).
        top_n: Maximum number of bigrams to return per polarity (default 5).
        min_df: Minimum number of documents a bigram must appear in to be
            counted (default 2); passed to ``CountVectorizer``.

    Returns:
        A ``(top_pros, top_cons)`` tuple of DataFrames with the columns
        ``bigram``, ``count`` and ``sentiment``, ordered by descending count.
        ``top_pros`` holds bigrams whose sentiment score is > 0, ``top_cons``
        those with a score < 0. Either frame may be empty.
    """
    result_columns = ['bigram', 'count', 'sentiment']

    # Extract bigrams that occur in at least `min_df` documents
    vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=min_df)
    try:
        X = vectorizer.fit_transform(reviews)
    except ValueError:
        # scikit-learn raises ValueError when no bigram survives (empty
        # vocabulary, everything pruned by min_df, or fewer documents than
        # min_df). Report "nothing found" for this group rather than
        # aborting the analysis of every other group.
        empty = pd.DataFrame(columns=result_columns)
        return empty, empty.copy()

    # Sum up the counts of each bigram across all documents
    bigram_counts = X.sum(axis=0).A1
    bigram_features = vectorizer.get_feature_names_out()

    # Most frequent first; a stable sort keeps tied bigrams in vocabulary
    # order so the selected top-N is deterministic.
    bigram_df = pd.DataFrame(
        {'bigram': bigram_features, 'count': bigram_counts}
    ).sort_values(by='count', ascending=False, kind='stable')

    # Score each bigram's sentiment
    bigram_df['sentiment'] = bigram_df['bigram'].apply(get_sentiment)

    # Split by polarity; bigrams scoring exactly 0 are discarded.
    # No extra "exactly two words" filter is needed: ngram_range=(2, 2)
    # already yields only two-token features.
    positive_bigrams = bigram_df[bigram_df['sentiment'] > 0]
    negative_bigrams = bigram_df[bigram_df['sentiment'] < 0]

    return positive_bigrams.head(top_n), negative_bigrams.head(top_n)

# Per-product results: product name -> DataFrame of top bigrams
product_pros = {}
product_cons = {}

# Group reviews by product and analyze each group
for product, group in df.groupby('Product'):
    top_5_pros, top_5_cons = analyze_reviews(group['cleaned_review'])
    product_pros[product] = top_5_pros
    product_cons[product] = top_5_cons

# Display the results in a structured format
for product in product_pros:
    print(f"Product: {product}")
    print("Top 5 Positive:")
    # Guard the positive table the same way as the negative one: a product
    # may have no positively scored bigrams at all.
    if not product_pros[product].empty:
        print(tabulate(product_pros[product], headers='keys', tablefmt='psql'))
    else:
        print("No significant positive bigrams found.")
    print("\nTop 5 Negative:")
    if not product_cons[product].empty:
        print(tabulate(product_cons[product], headers='keys', tablefmt='psql'))
    else:
        print("No significant negative bigrams found.")
    print("\n" + "="*50 + "\n")

Product: intel-i3-10100
Top 5 Pros:
+-----+--------------+---------+-------------+
|     | bigram       |   count |   sentiment |
|-----+--------------+---------+-------------|
|  59 | good price   |       6 |      0.4404 |
| 131 | run cool     |       5 |      0.3182 |
|  36 | easy install |       4 |      0.4404 |
|  65 | great cpu    |       4 |      0.6249 |
|  82 | im happy     |       4 |      0.5719 |
+-----+--------------+---------+-------------+

Top 5 Cons:
+-----+----------------+---------+-------------+
|     | bigram         |   count |   sentiment |
|-----+----------------+---------+-------------|
|  97 | low power      |       4 |     -0.2732 |
|  98 | low setting    |       2 |     -0.2732 |
| 119 | price terrible |       2 |     -0.4767 |
+-----+----------------+---------+-------------+


Product: intel-i3-12100f
Top 5 Pros:
+-----+----------------+---------+-------------+
|     | bigram         |   count |   sentiment |
|-----+----------------+---------+-------------|