In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
import torch
from tqdm import tqdm
import re
from sklearn.metrics import classification_report
import warnings
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt')
nltk.download('stopwords')

os.makedirs('images', exist_ok=True)

plt.style.use('ggplot')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print("Loading data...")
df = pd.read_csv('amazon`.csv', index_col=0)
print(f"Original shape: {df.shape}")

print("Cleaning data...")
df = df[df['reviewText'].notna() & (df['reviewText'].str.strip() != '')]
print(f"Shape after removing empty reviews: {df.shape}")

df = df.sample(n=min(500, len(df)), random_state=42)
print(f"Sample shape: {df.shape}")

Loading data...
Original shape: (524, 16)
Cleaning data...
Shape after removing empty reviews: (524, 16)
Sample shape: (500, 16)


In [6]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s.,!?]', '', text)
    
    # Tokenize and remove stopwords
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back into text
    text = ' '.join(tokens)
    
    return text.strip()

# Apply preprocessing
print("Preprocessing text...")
df['cleaned_text'] = df['reviewText'].apply(preprocess_text)
df = df[df['cleaned_text'].str.len() > 0]
print(f"Final shape after preprocessing: {df.shape}")

Preprocessing text...
Final shape after preprocessing: (500, 16)


In [7]:
print("\nInitializing sentiment analyzer...")
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=0 if torch.cuda.is_available() else -1
)


Initializing sentiment analyzer...


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.6.0+cpu)
    Python  3.10.11 (you have 3.10.16)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [8]:
print("\nAnalyzing sentiment...")
results = []
for text in tqdm(df['cleaned_text']):
    try:
        result = sentiment_analyzer(text[:512])[0]
        # Convert 5-star rating to sentiment
        rating = int(result['label'].split()[0])
        if rating >= 4:
            sentiment = 'POSITIVE'
        else:
            sentiment = 'NEGATIVE'
        results.append({
            'sentiment': sentiment,
            'score': result['score']
        })
    except Exception as e:
        print(f"Error analyzing text: {str(e)}")
        results.append({'sentiment': 'NEGATIVE', 'score': 0.5})

df['predicted_sentiment'] = [r['sentiment'] for r in results]
df['sentiment_score'] = [r['score'] for r in results]

df['true_sentiment'] = df['overall'].apply(lambda x: 'POSITIVE' if x >= 4 else 'NEGATIVE')


Analyzing sentiment...


100%|██████████| 500/500 [00:45<00:00, 11.02it/s]


In [9]:
# Print results
print("\nSentiment Distribution:")
print(df['predicted_sentiment'].value_counts())

print("\nClassification Report:")
print(classification_report(
    df['true_sentiment'],
    df['predicted_sentiment'],
    target_names=['NEGATIVE', 'POSITIVE']
))


Sentiment Distribution:
predicted_sentiment
POSITIVE    402
NEGATIVE     98
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.78      0.95      0.85        80
    POSITIVE       0.99      0.95      0.97       420

    accuracy                           0.95       500
   macro avg       0.88      0.95      0.91       500
weighted avg       0.96      0.95      0.95       500



In [10]:
# Plot sentiment distribution
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='predicted_sentiment')
plt.title('Sentiment Distribution')
plt.savefig('images/sentiment_distribution_v1.png')
plt.close()

# Plot sentiment scores
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='sentiment_score', hue='predicted_sentiment', bins=20)
plt.title('Sentiment Score Distribution')
plt.savefig('images/sentiment_scores_v1.png')
plt.close()

print("\nAnalysis complete! Check the generated plots and classification report.")


Analysis complete! Check the generated plots and classification report.
