In [1]:
import gensim 
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_json('D:\ML\Cell_Phones_and_Accessories_5.json' , lines = True)

# Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [3]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [4]:
df.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [5]:
# feature -  review text , label - overall rating and train our RNN i=over this daatset for NLP  task . 

text = df.reviewText[0]
print(text)

They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again


In [6]:
# Use gensim.utils.simple_preprocess() -> to preprocess a text and drop trailing spaces , puncuation marks and redundant words 

# (like am , a , was , the , etc...) for better training for RNN similar to LSTM /GRU 

# Here , we are actually Tokenizing a sentence 

# Tokenization - Breaking a sentence into its constituent Tokens without puncuation / trailing spaces for better training 
# of RNN as words are passed one by one after going through Word2Vec model . 

print(gensim.utils.simple_preprocess(text))

['they', 'look', 'good', 'and', 'stick', 'good', 'just', 'don', 'like', 'the', 'rounded', 'shape', 'because', 'was', 'always', 'bumping', 'it', 'and', 'siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating', 'just', 'won', 'buy', 'product', 'like', 'this', 'again']


In [7]:
# apply tokenization function to whole col of pd DataFrame 

review_text_preprocess = df.reviewText.apply(gensim.utils.simple_preprocess)

In [8]:
review_text_preprocess 

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [9]:
# To convert Word2Vec and get meanigful Word embeddings so as to show relation b/w words 
# we solve a FAKE PROBLEM of 'Fill in the Blanks' using given dataset (sliding window used) and by several epochs 
# we get mcorrect weights (features values) of words in vocab. (word embedding  -> Word2Vec successful ) (ytrue = one hot enc.d)
# to solve this RNN task , we use gensim.models.Word2Vec (Vec is label and weights are feaures) 

rnn_word2vec = gensim.models.Word2Vec(
    
    window = 10 , # sliding window size 
    min_count = 2 , # min no. of words in sentence 
    workers  = 4  # CPU threads used for training 
    
)


In [10]:
# build a vocab of words 

vocab = rnn_word2vec.build_vocab(review_text_preprocess , progress_per = 1000)

In [11]:
rnn_word2vec.epochs

5

In [12]:
rnn_word2vec.train(review_text_preprocess  , total_examples = rnn_word2vec.corpus_count , epochs = rnn_word2vec.epochs )

(61508618, 83868975)

In [13]:
# save the model and use it later (.model extension)

rnn_word2vec.save("./amazon_reviews.model")

In [14]:
# we get predictions as Vector form of words ( simialr words have similar vwctor values in word embeddings )

rnn_word2vec.wv.most_similar('good') # now , it started learning our English language 

[('decent', 0.8175770044326782),
 ('great', 0.7860682606697083),
 ('nice', 0.7026644349098206),
 ('fantastic', 0.6905131936073303),
 ('excellent', 0.6373535990715027),
 ('outstanding', 0.6171596050262451),
 ('superb', 0.6157050728797913),
 ('awesome', 0.6103843450546265),
 ('exceptional', 0.6082326173782349),
 ('terrific', 0.5919394493103027)]

In [15]:
# to get similarity score between words 

rnn_word2vec.wv.similarity(w1 = 'good' , w2 = 'product')   # positive similarity - means highly correlated 

-0.03976939

In [16]:
word2vec_embeddings = rnn_word2vec

In [17]:
X = review_text_preprocess
len(X)

194439

In [18]:
y = df.overall


In [19]:
from sklearn.preprocessing import LabelEncoder 
labels = LabelEncoder()
y_enc = labels.fit_transform(y)

In [20]:
y_enc


array([3, 4, 4, ..., 4, 4, 4], dtype=int64)

In [21]:
# word2vec_embeddings = Word2Vec.load("your_word2vec_model_path")  # Replace with your actual path

# Map words to Word2Vec embeddings
embedding_dim = word2vec_embeddings.vector_size
X_word2vec = np.array([
    [word2vec_embeddings.wv[word] if word in word2vec_embeddings.wv else np.zeros(embedding_dim) for word in sequence]
    for sequence in X
])

  X_word2vec = np.array([


In [22]:
X_word2vec.shape
embedding_dim

100

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [24]:
X_padded = pad_sequences(X_word2vec[:12000], maxlen= 30, padding='post', truncating='post')

In [25]:
len(X_padded)

12000

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_enc[:12000], test_size=0.2)
                                                   

In [27]:
y.unique

<bound method Series.unique of 0         4
1         5
2         5
3         4
4         5
         ..
194434    5
194435    5
194436    5
194437    5
194438    5
Name: overall, Length: 194439, dtype: int64>

In [28]:
import tensorflow as tf 
from tensorflow import keras 

In [38]:
from tensorflow.keras.layers import Embedding , LSTM , Dense 
# Many to One RNN
model = tf.keras.models.Sequential()

model.add(LSTM(units = 128 , input_shape = (30 , embedding_dim) ))  # using LSTM  RNN model
model.add(Dense(units = 64 , activation = 'softmax') )
model.add(Dense(units = 5 , activation = 'softmax') )

In [39]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs= 20, batch_size=32, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

In [36]:
loss, accuracy = model.evaluate(X_test, y_test)



In [37]:
accuracy

0.5141666531562805

In [34]:
def sentence_to_vectors(sentence, word2vec_model):
    words = sentence.split()
    word_vectors = [word2vec_model.wv[word] if word in word2vec_model.wv else np.zeros(100) for word in words]
    return np.array(word_vectors)


def predict_sentiment(sentence, model, word2vec_model, max_sequence_length):
    # Convert sentence to Word2Vec vectors
    input_sequence = sentence_to_vectors(sentence, word2vec_model)

    # Pad the input sequence
    padded_input_sequence = pad_sequences([input_sequence], maxlen=max_sequence_length, padding='post', truncating='post')

    # Make predictions
    predictions = model.predict(padded_input_sequence)

    # Get the predicted class
    predicted_class = np.argmax(predictions, axis=1)[0]

    return predicted_class, predictions[0]

In [35]:

out1 , out2  = predict_sentiment('terrible product', model,rnn_word2vec , 30)

# out2 is correct output 
index = out1

print(f"rating = {index}")

rating = 0
