In [2]:
# 📦 Step 1: Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# 📥 Step 2: Load dataset
df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
df = df[['reviews.text', 'reviews.rating']].dropna()
df = df.rename(columns={'reviews.text': 'review', 'reviews.rating': 'rating'})
df = df[df['rating'].isin([1, 2, 3, 4, 5])]  # Filter only valid ratings

# 🏷️ Step 3: Create sentiment labels from rating
def label_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating <= 2:
        return 'negative'
    else:
        return 'neutral'

df['label'] = df['rating'].apply(label_sentiment)

# 🧹 Step 4: Text preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z]", " ", text)  # remove non-letter characters
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['cleaned_review'] = df['review'].apply(clean_text)

# ✂️ Step 5: Prepare features and labels
X = df['cleaned_review']
y = df['label']

# 🔤 Step 6: Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_vect = vectorizer.fit_transform(X)

# 🎯 Step 7: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

# 🤖 Step 8: Train a Logistic Regression classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# 🧪 Step 9: Evaluate the model
y_pred = model.predict(X_test)
print("📊 Classification Report:\n", classification_report(y_test, y_pred))
print("🧾 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 💾 Step 10: Save the model and vectorizer (optional)
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


  df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


📊 Classification Report:
               precision    recall  f1-score   support

    negative       0.50      0.11      0.19       157
     neutral       0.50      0.06      0.11       278
    positive       0.94      1.00      0.97      6491

    accuracy                           0.94      6926
   macro avg       0.65      0.39      0.42      6926
weighted avg       0.92      0.94      0.92      6926

🧾 Confusion Matrix:
 [[  18    7  132]
 [   8   18  252]
 [  10   11 6470]]


['tfidf_vectorizer.pkl']