# Flipkart Product Reviews — Sentiment Analysis (VADER)

This notebook loads Flipkart product reviews, preprocesses the review text, computes VADER sentiment scores, trains a simple Multinomial Naive Bayes classifier on bag-of-words features, and evaluates the results. Before running, set `DATA_PATH` in the data-loading cell to the location of your `FlipkartData.csv` file.

In [None]:
import nltk

# Fetch the NLTK resources requested by this notebook, one after another:
# the stop-word corpus, the punkt tokenizer data and the VADER lexicon.
for resource in ('stopwords', 'punkt', 'vader_lexicon'):
    nltk.download(resource)
print('NLTK downloads complete')

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Show up to 100 characters of each text cell when a DataFrame is displayed.
# The fully-qualified option name is used on purpose: abbreviated names are
# resolved by pattern matching and can become ambiguous if pandas later adds
# another option whose name also contains 'max_colwidth'.
pd.set_option('display.max_colwidth', 100)
print('Libraries imported')

In [None]:
# Location of the reviews CSV. In a raw string a single backslash is already
# literal, so the separators must NOT be doubled (the previous value contained
# two backslashes between every path component).
DATA_PATH = r'D:\projects\FlipkartData.csv'  # <-- change if needed

# The file is decoded as ISO-8859-1; low_memory=False makes pandas read the
# file in one pass instead of in chunks when inferring column dtypes.
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1', low_memory=False)
print('Loaded shape:', df.shape)
df.head()

In [None]:
# Fail fast if the columns the rest of the notebook relies on are missing.
required_cols = ['Summary', 'Sentiment']
for c in required_cols:
    if c not in df.columns:
        raise ValueError(f"Dataset must contain column: {c}")

# Normalize Sentiment: cast to str, trim surrounding whitespace and lowercase
# so that variants such as ' Positive' and 'positive' are counted together.
# Note that astype(str) turns missing values into the literal string 'nan'.
df['Sentiment'] = df['Sentiment'].astype(str).str.strip().str.lower()
print('Sentiment value counts:')
print(df['Sentiment'].value_counts())

In [None]:
ps = PorterStemmer()
all_stop = set(stopwords.words('english'))
# Negation words carry sentiment, so they are excluded from stop-word removal.
neg_keep = {"no","not","nor","don't","didn't","doesn't","isn't","wasn't","weren't","won't","can't","couldn't","shouldn't","wouldn't","cannot"}
stopwords_to_remove = all_stop - neg_keep

def preprocess(text):
    """Return a cleaned, stemmed, space-joined version of ``text``.

    Steps:
      1. Cast to ``str`` and replace every character that is not an ASCII
         letter or an apostrophe with a space, then lowercase.
      2. Split on whitespace and strip apostrophes from the ends of tokens
         (so quoted words like ``'good'`` become ``good``).
      3. Drop tokens found in ``stopwords_to_remove``.
      4. Remove remaining inner apostrophes and Porter-stem each token.

    Apostrophes are kept until after stop-word filtering on purpose: if they
    are stripped first, "don't" is split into "don" and "t", neither of which
    matches the contractions listed in ``neg_keep``, and both fragments are
    then removed as stop words -- silently discarding the negation.

    Parameters
    ----------
    text : any
        Raw review text; converted with ``str()`` before processing.

    Returns
    -------
    str
        The processed tokens joined by single spaces (may be empty).
    """
    text = str(text)
    text = re.sub(r"[^a-zA-Z']", ' ', text)
    tokens = (t.strip("'") for t in text.lower().split())
    kept = [t for t in tokens if t and t not in stopwords_to_remove]
    # "don't" -> "dont": one apostrophe-free token per kept contraction.
    return ' '.join(ps.stem(t.replace("'", '')) for t in kept)

# Drop rows whose Summary is missing (NaN), then add the cleaned text column.
df = df[df['Summary'].notna()].copy()
df['clean'] = df['Summary'].apply(preprocess)
df[['Summary','clean']].head()

In [None]:
sia = SentimentIntensityAnalyzer()

# Use a clean 0..n-1 index so the expanded score columns line up row by row.
df = df.reset_index(drop=True)

# Score the ORIGINAL review text, not the stemmed 'clean' column: VADER is a
# lexicon/rule-based analyzer that looks up unstemmed words and also uses
# punctuation and capitalisation as intensity cues, all of which the
# preprocessing step removes.
df['vader_scores'] = df['Summary'].astype(str).map(sia.polarity_scores)

# Expand the per-row score dicts into one column per score.
vader_df = pd.DataFrame(df['vader_scores'].tolist(), index=df.index)
# Drop any score columns left over from a previous run of this cell before
# joining, so re-running the cell does not create duplicate columns.
df = df.drop(columns=vader_df.columns, errors='ignore').join(vader_df)
df[['Summary','clean','compound','pos','neu','neg']].head()

In [None]:
# Keep only the two classes the binary classifier is trained on and encode
# them as integers (positive -> 1, negative -> 0).
df = df[df['Sentiment'].isin(['positive','negative'])].copy()
df['label'] = df['Sentiment'].map({'positive':1,'negative':0})

# Bag-of-words features: the 3000 most frequent unigrams and bigrams.
# NOTE: the vocabulary is learned from all rows, before any train/test split.
cv = CountVectorizer(max_features=3000, ngram_range=(1,2))
# Keep the result as the sparse matrix CountVectorizer returns. Converting it
# with .toarray() would allocate a dense n_rows x 3000 array, while both
# train_test_split and MultinomialNB accept sparse input directly.
X = cv.fit_transform(df['clean'])
y = df['label'].values
print('X shape:', X.shape, 'y shape:', y.shape)

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Multinomial Naive Bayes with alpha=1.0 smoothing; fit() returns the fitted
# estimator, so construction and training are chained.
clf = MultinomialNB(alpha=1.0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Compute the evaluation metrics on the held-out predictions.
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print('Accuracy:', acc)
print('\nClassification report:\n', report)
print('Confusion matrix:\n', cm)

# Draw the confusion matrix as an annotated heatmap (rows = actual class,
# columns = predicted class).
fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['pred_neg','pred_pos'],
    yticklabels=['true_neg','true_pos'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Write the processed frame to a CSV in the current working directory,
# leaving out the row index.
out_name = 'flipkart_with_vader_and_clean.csv'
df.to_csv(out_name, index=False)
print('Saved', out_name)