In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Dropout, Embedding, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('Amazon-Product-Review-Sentiment-Analysis-using-RNN-Dataset.csv')
data.head(3)

Unnamed: 0,Review,Sentiment
0,Fast shipping but this product is very cheaply...,1
1,This case takes so long to ship and it's not e...,1
2,Good for not droids. Not good for iPhones. You...,1


In [3]:
data.shape

(25000, 2)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     24999 non-null  object
 1   Sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [5]:
data.isna().sum()

Review       1
Sentiment    0
dtype: int64

In [6]:
data.dropna(inplace=True)

In [7]:
data['Sentiment'].value_counts()

Sentiment
1    5000
2    5000
3    5000
4    5000
5    4999
Name: count, dtype: int64

In [8]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\BK
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\BK
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
stop_words=set(stopwords.words('english'))
def clean_reviews(text):
    # Correct regex pattern for HTML tags
    regex = re.compile('<.*?>')
    text = re.sub(regex, '', text)
    
    # Remove digits
    pattern = re.compile('\d+')
    text = re.sub(pattern, '', text)
    
    # Remove stopwords
    text = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(text)

# Applying the function to your 'Review' column
data['Review'] = data['Review'].apply(clean_reviews)

In [12]:
tokenizer=Tokenizer()
review_to_list=data['Review'].tolist()
tokenizer.fit_on_texts(review_to_list)

text_sequence=tokenizer.texts_to_sequences(review_to_list)
max_words=500
padded_sequence=pad_sequences(text_sequence,maxlen=max_words)
text_sequence=pad_sequences(padded_sequence)

data=pd.get_dummies(data, columns=['Sentiment'])

x=pad_sequences(text_sequence,maxlen=max_words)
y=data[['Sentiment_1', 'Sentiment_2', 'Sentiment_3', 'Sentiment_4', 'Sentiment_5']]


In [13]:
print(x.shape,y.shape)

(24999, 500) (24999, 5)


In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=43)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(19999, 500) (5000, 500) (19999, 5) (5000, 5)


In [19]:
rnn = Sequential(name="Simple_RNN")
rnn.add(Embedding(len(tokenizer.word_index)+1,max_words,input_length=max_words))
rnn.add(SimpleRNN(128,activation='relu',return_sequences=True))
rnn.add(SimpleRNN(64,activation='relu',return_sequences=False))
rnn.add(Dense(5, activation='softmax'))
# printing model summary
print(rnn.summary())

None


In [21]:
rnn.compile(
    loss="categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy']
)

history=rnn.fit(x_train,y_train,batch_size=64,epochs=2,verbose=1,validation_data=(x_test, y_test))

Epoch 1/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 731ms/step - accuracy: 0.2709 - loss: 4.4905 - val_accuracy: 0.4312 - val_loss: 1.2973
Epoch 2/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 734ms/step - accuracy: 0.5300 - loss: 1.0987 - val_accuracy: 0.4324 - val_loss: 1.2902


In [22]:
print("RNN score -> ",rnn.evaluate(x_test,y_test,verbose=1))

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 95ms/step - accuracy: 0.4278 - loss: 1.2945
RNN score ->  [1.2902271747589111, 0.4323999881744385]
