In [None]:
# importing libraries
import numpy as np
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import one_hot

In [None]:
import json

# Load the reviews from a JSON-lines file: one JSON object per line.
# The context manager guarantees the file handle is closed, even if a line
# fails to parse; the original bare open() left the handle dangling.
reviews = []
with open('/content/Musical_Instruments_5.json', 'r', encoding='utf-8') as review_file:
    for line in review_file:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
        if line.strip():
            reviews.append(json.loads(line))

In [None]:
# checking the first entry in the dataset
reviews[:1]

[{'asin': '1384719342',
  'helpful': [0, 0],
  'overall': 5.0,
  'reviewText': "Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,",
  'reviewTime': '02 28, 2014',
  'reviewerID': 'A2IBPI20UZIR0U',
  'reviewerName': 'cassandra tu "Yeah, well, that\'s just like, u...',
  'summary': 'good',
  'unixReviewTime': 1393545600}]

In [None]:
# pull the free-text body out of every review record
review_text = [entry['reviewText'] for entry in reviews]

# sanity check: show the first two texts
review_text[:2]

["Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,",
 "The product does exactly as it should and is quite affordable.I did not realized it was double screened until it arrived, so it was even better than I had expected.As an added bonus, one of the screens carries a small hint of the smell of an old grape candy I used to buy, so for reminiscent's sake, I cannot stop putting the pop filter next to my nose and smelling it after recording. :DIf you needed a pop filter, this will work just as well as the expensive ones, and it may even come with a pleasing aroma like mine did!Buy this product! :]"]

In [None]:
# pull the star rating out of every review record, coerced to float
ratings = [float(entry['overall']) for entry in reviews]

# sanity check: show the first two ratings
ratings[:2]

[5.0, 5.0]

In [None]:
# distinct rating values present in the data
set(ratings)

{1.0, 2.0, 3.0, 4.0, 5.0}

From the above, we note that the ratings column has 5 unique values. Let us frame a rule: if the rating is more than 2.0, the sentiment is deemed positive; otherwise it is deemed negative.

In [None]:
# binary sentiment label per review: 1 (positive) when the rating exceeds 2, else 0 (negative)
sentiment = [int(rating > 2) for rating in ratings]

In [None]:
# flatten the whole corpus into one list of space-separated tokens
word_dict = [token for text in review_text for token in text.split(' ')]

# the number of distinct tokens serves as the dictionary (vocabulary) size
# NOTE(review): tokens are split on single spaces only, so case and punctuation
# variants count as distinct words; the Keras encoder used later presumably
# normalises these, which would make this an over-estimate - confirm.
dict_size = len(set(word_dict))
dict_size

57715

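Next, we will encode each word in every review with the Keras `one_hot` function. Despite its name, it does not build one-hot vectors: it encodes a text into a list of integer word indexes, each drawn from a vocabulary of the given size (here, the dictionary size).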

In [None]:
# Hash every review into a list of integer word indexes bounded by dict_size.
# one_hot() relies on Python's built-in hash(), which is salted per process,
# so the same word would receive a different index after a kernel restart and
# the encoding (and any saved model) would not be reproducible.
# hashing_trick() applies the same tokenisation, but with the stable 'md5'
# hash it yields identical indexes on every run.
# Distinct words can still collide on the same index, as with one_hot().
encodsent = [hashing_trick(sent, dict_size, hash_function='md5') for sent in review_text]

In [None]:
# length (in encoded words) of the longest review; used as the common padded length
max_len_sentence = max(map(len, encodsent))
max_len_sentence

2059

We will set the length of every review to the maximum review length and pad the shorter reviews with zeros at the front, so that all reviews have the same shape and can be fed into our model.

In [None]:
# left-pad every encoded review with zeros up to the longest review's length
padmysent = pad_sequences(
    encodsent,
    maxlen=max_len_sentence,
    padding='pre',
)
print(padmysent)

[[    0     0     0 ... 12803  6611 52210]
 [    0     0     0 ... 41378  3644 38023]
 [    0     0     0 ... 34239 52947 39297]
 ...
 [    0     0     0 ... 52947 30942 32089]
 [    0     0     0 ... 51034 53601 15784]
 [    0     0     0 ... 18973 18973 56253]]


In [None]:
# train-test split: the first 80% of the rows train the model, the rest test it
# (rows are kept in their original order; nothing is shuffled)

train_size = int(0.8 * len(padmysent))
X_train, X_test = np.split(padmysent, [train_size])
y_train, y_test = np.split(np.array(sentiment), [train_size])

In [None]:
# model building: the layers are passed to the Sequential constructor in order
mymodel = Sequential([
    # turns each positive integer word index into a dense vector of size 10
    Embedding(dict_size, 10, input_length=max_len_sentence),
    # flattens the per-word vectors of a review into one long feature vector
    Flatten(),
    # single sigmoid unit producing the output score
    Dense(1, activation='sigmoid'),
])

In [None]:
# configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
mymodel.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Here we have set the loss to `binary_crossentropy` (which computes the cross-entropy loss between the true labels and the predicted labels) and the metric to `accuracy`.

In [None]:
# checking the model summary (layers, output shapes and parameter counts)
mymodel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2059, 10)          577150    
                                                                 
 flatten (Flatten)           (None, 20590)             0         
                                                                 
 dense (Dense)               (None, 1)                 20591     
                                                                 
Total params: 597,741
Trainable params: 597,741
Non-trainable params: 0
_________________________________________________________________


In [None]:
# train on the training split for 5 epochs
mymodel.fit(x=X_train, y=y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe126dbc090>

In [None]:
# run the trained model on the held-out test split
pred = mymodel.predict(x=X_test)

In [None]:
# first five entries of the prediction array
pred[:5]

array([[0.9788058],
       [0.9960004],
       [0.9993549],
       [0.9978851],
       [0.9938103]], dtype=float32)

Here the outputs are probabilities, so we need to convert them into hard labels to check the metric values. Let us set a rule wherein the class is 1 if the probability is more than 0.5 and 0 otherwise. We can, of course, change this threshold to achieve a higher metric value.

In [None]:
# decision threshold: scores strictly above it are labelled 1, everything else 0
thres = 0.5
pred_label = [int(row[0] > thres) for row in pred]

In [None]:
# checking the different metric values on the test split
from sklearn.metrics import confusion_matrix, accuracy_score

# confusion matrix of true vs. predicted labels
# (scikit-learn convention: rows are true labels, columns are predicted labels)
print(confusion_matrix(y_test, pred_label))
# fraction of test reviews whose predicted label matches the true label
print(accuracy_score(y_test, pred_label))

[[   8  101]
 [   0 1944]]
0.950803701899659


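**Conclusion:** From the above, we conclude that we can train a basic sentiment analysis model with a small number of training epochs and a reliable text corpus. Note, however, that the test set is heavily imbalanced (1,944 of its 2,053 reviews are positive), so always predicting "positive" would already score about 94.7% accuracy. The model misclassifies 101 of the 109 negative reviews, so its 95.1% accuracy should be interpreted with caution.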