# Product Review Sentiment Analysis

In [1]:
#Importing required packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [94]:
# Loading the data
df = pd.read_csv('Train.csv')
df.head()

Unnamed: 0,Product,Review,Sentiment
0,Product 1,the speaker voice quality is terrible compare ...,0
1,Product 1,This kindle is light and easy to use especiall...,1
2,Product 1,Didnt know how much i'd use a kindle so went f...,1
3,Product 1,just okay! Product was for my girlfriend. Does...,0
4,Product 1,I do like the portability of my Tap. You do lo...,1


# Data Preprocessing

In [95]:
# Downloading stopwords and lemmatizer data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kkere\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kkere\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kkere\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kkere\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [96]:
# Initializing stopwords and lemmatizer objects
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [97]:
# Defining a function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text into individual words
    words = nltk.word_tokenize(text)
    
    # Remove stop words and lemmatize the remaining words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    # Join the words back into a string
    text = ' '.join(words)
    
    return text

In [98]:
# Preprocessing the dataset for the review column 
preprocessed_df = df['Review'].apply(preprocess_text)

# Feature extraction (Bag-of-words)

In [118]:
# Initialize the CountVectorizer object
vectorizer = CountVectorizer()

# Convert the preprocessed text into a bag-of-words matrix
X = vectorizer.fit_transform(preprocessed_df)

In [119]:
#Checking the vocabulary of the vectorizer
print(vectorizer.vocabulary_)

{'speaker': 1464, 'voice': 1696, 'quality': 1227, 'terrible': 1570, 'compare': 340, 'similar': 1411, 'size': 1421, 'logitech': 912, 'ue': 1651, 'boomthe': 224, 'price': 1188, 'high': 731, 'even': 539, 'got': 682, 'promotion': 1207, '79': 41, 'kindle': 839, 'light': 884, 'easy': 493, 'use': 1672, 'especially': 537, 'beach': 190, 'didnt': 438, 'know': 845, 'much': 1002, 'id': 762, 'went': 1729, 'lower': 934, 'end': 515, 'im': 768, 'happy': 706, 'little': 901, 'dark': 405, 'okay': 1061, 'product': 1202, 'girlfriend': 665, 'doesnt': 459, 'appear': 127, 'us': 1670, 'except': 550, 'steam': 1489, 'music': 1005, 'question': 1230, 'ask': 141, 'get': 659, 'nothing': 1041, 'like': 890, 'siri': 1416, 'better': 205, 'portability': 1172, 'tap': 1555, 'lose': 923, 'feature': 588, 'echo': 496, 'speaking': 1465, 'across': 55, 'room': 1331, 'good': 677, 'sound': 1459, 'great': 688, 'take': 1548, 'pool': 1167, 'hard': 708, 'support': 1525, 'guam': 693, 'online': 1067, 'instruction': 797, 'clear': 313, 't

In [101]:
# Getting the list of words
feature_names = vectorizer.get_feature_names()

# Converting the bag-of-words matrix to a pandas DataFrame
X_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Printing the rows of the DataFrame
print(X_df.head())

   10  100  101  10th  11  11yr  120  12999  139  15  ...  youtube  yr  zero  \
0   0    0    0     0   0     0    0      0    0   0  ...        0   0     0   
1   0    0    0     0   0     0    0      0    0   0  ...        0   0     0   
2   0    0    0     0   0     0    0      0    0   0  ...        0   0     0   
3   0    0    0     0   0     0    0      0    0   0  ...        0   0     0   
4   0    0    0     0   0     0    0      0    0   0  ...        0   0     0   

   zinio  zoom  äîand  äôm  äôt  äù  äúdualbattery  
0      0     0      0    0    0   0              0  
1      0     0      0    0    0   0              0  
2      0     0      0    0    0   0              0  
3      0     0      0    0    0   0              0  
4      0     0      0    0    0   0              0  

[5 rows x 1795 columns]




# Building the Model

### Naive Bayes

In [120]:
# Assigning the label column to y
y = df['Sentiment']

# Split the data into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state = 100)

In [121]:
# Fitting the model
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [122]:
# Predictions on the validation set
y_pred = classifier.predict(X_valid)

In [123]:
# Checking the accuracy of the model
accuracy = accuracy_score(y_valid, y_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.8625


In [106]:
#Checking the inaccurate predictions

# Identify incorrectly predicted reviews
incorrect_rows = X_valid[y_pred != y_valid]
incorrect_y_pred = y_pred[y_pred != y_valid]
incorrect_y_valid = y_valid[y_pred != y_valid]

# Print incorrectly predicted reviews
for i, row in enumerate(incorrect_rows):
    print(f"Incorrect prediction {i+1}:")
    print(f"  Actual label: {incorrect_y_valid.iloc[i]}")
    print(f"  Predicted label: {incorrect_y_pred[i]}")
    print(f"         Review: {vectorizer.inverse_transform(row)[0]}\n")

Incorrect prediction 1:
  Actual label: 0
  Predicted label: 1
         Review: ['anything' 'buy' 'gon' 'like' 'na']

Incorrect prediction 2:
  Actual label: 0
  Predicted label: 1
         Review: ['amazon' 'best' 'big' 'build' 'fan' 'good' 'great' 'im' 'image'
 'mediocre' 'ok' 'price' 'product' 'quality' 'software' 'tablet' 'worth']

Incorrect prediction 3:
  Actual label: 0
  Predicted label: 1
         Review: ['worth']

Incorrect prediction 4:
  Actual label: 0
  Predicted label: 1
         Review: ['like']

Incorrect prediction 5:
  Actual label: 0
  Predicted label: 1
         Review: ['bass' 'better' 'concept' 'echo' 'execution' 'good' 'great' 'however'
 'love' 'much' 'one' 'poor' 'portable' 'pretty' 'scratchy' 'sound' 'tap'
 'terrible']

Incorrect prediction 6:
  Actual label: 1
  Predicted label: 0
         Review: ['anyone' 'entertaining' 'product' 'recommend' 'useful' 'would']

Incorrect prediction 7:
  Actual label: 0
  Predicted label: 1
         Review: ['boy' 'friend' '

### Test Data

In [124]:
# Loading the data
test_df = pd.read_csv('Test.csv')

# Preprocessing the test data
preprocessed_test = test_df['Review'].apply(preprocess_text)

# Convert the preprocessed text into a bag-of-words matrix
X_test = vectorizer.transform(preprocessed_test)

In [125]:
# Assigning the label column to y
y_test = test_df['Sentiment']

# Predictions on the test set
y_test_pred = classifier.predict(X_test)

In [127]:
# Checking the accuracy of the model
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.745


### Testing Using Manual Input

In [148]:
# Preprocess the input text
input_text = "I hate this product."
preprocessed_text = preprocess_text(input_text)

# Convert the preprocessed text into a bag-of-words matrix
X_input = vectorizer.transform([preprocessed_text])

# Get the predicted sentiment of the input text
predicted_sentiment = classifier.predict(X_input)

if predicted_sentiment == 1:
    print("Positive review")
else:
    print("Negative review")

Negative review
