In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Using TensorFlow backend.


In [37]:
pos_rev = pd.read_csv("Data\pos.txt", sep = "\n", header = None, encoding = 'latin-1')
pos_rev = pd.concat([pos_rev,pd.Series(np.ones(pos_rev.shape[0]))], ignore_index=True, axis =1)

In [38]:
pos_rev.columns = ['review', 'mood']
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1.0
1,"the gorgeously elaborate continuation of "" the...",1.0
2,effective but too-tepid biopic,1.0
3,if you sometimes like to go to the movies to h...,1.0
4,"emerges as something rare , an issue movie tha...",1.0


In [40]:
neg_rev = pd.read_csv("Data//negative.txt", sep = "\n", header = None, encoding = 'latin-1')
neg_rev = pd.concat([neg_rev,pd.Series(np.zeros(pos_rev.shape[0]))], ignore_index=True, axis =1)
neg_rev.columns = ['review', 'mood']
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0.0
1,"it's so laddish and juvenile , only teenage bo...",0.0
2,exploitative and largely devoid of the depth o...,0.0
3,[garbus] discards the potential for pathologic...,0.0
4,a visually flashy but narratively opaque and e...,0.0


## Performing preprocessing on positive reviews

In [44]:
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply(lambda x: x.lower())
pos_rev.loc[:, 'review'] = pos_rev.loc[:, 'review'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

## Performing preprocessing on Negative reviews

In [45]:
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply(lambda x: x.lower())
neg_rev.loc[:, 'review'] = neg_rev.loc[:, 'review'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

### Combining the two for tokenization

In [51]:
com_rev = pd.concat([pos_rev, neg_rev], axis =0).reset_index()


In [53]:
com_rev.head()

Unnamed: 0,index,review,mood
0,0,the rock is destined to be the 21st centurys n...,1.0
1,1,the gorgeously elaborate continuation of the ...,1.0
2,2,effective but tootepid biopic,1.0
3,3,if you sometimes like to go to the movies to h...,1.0
4,4,emerges as something rare an issue movie that...,1.0


### Tokenization

In [55]:
max_features = 2000
tokenizer = Tokenizer(num_words = max_features, split=' ')
tokenizer.fit_on_texts(com_rev.loc[:, 'review'].values)
X = tokenizer.texts_to_sequences(com_rev.loc[:, 'review'].values)
X = pad_sequences(X)


## Composing the LSTM Network

In [57]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 46, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 46, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [59]:
Y = pd.get_dummies(com_rev['mood']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7143, 46) (7143, 2)
(3519, 46) (3519, 2)


# batch size and epoch
The batch size is a hyperparameter that defines the number of samples to work through before updating the internal model parameters.

Think of a batch as a for-loop iterating over one or more samples and making predictions. At the end of the batch, the predictions are compared to the expected output variables and an error is calculated. From this error, the update algorithm is used to improve the model, e.g. move down along the error gradient.

### Batch Gradient Descent. Batch Size = Size of Training Set
### Stochastic Gradient Descent. Batch Size = 1
### Mini-Batch Gradient Descent. 1 < Batch Size < Size of Training Set

In [61]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 2)

Epoch 1/15
 - 22s - loss: 0.2399 - accuracy: 0.8925
Epoch 2/15
 - 23s - loss: 0.2109 - accuracy: 0.9126
Epoch 3/15
 - 23s - loss: 0.1866 - accuracy: 0.9213
Epoch 4/15
 - 23s - loss: 0.1744 - accuracy: 0.9282
Epoch 5/15
 - 24s - loss: 0.1519 - accuracy: 0.9364
Epoch 6/15
 - 24s - loss: 0.1346 - accuracy: 0.9476
Epoch 7/15
 - 24s - loss: 0.1287 - accuracy: 0.9489
Epoch 8/15
 - 24s - loss: 0.1202 - accuracy: 0.9506
Epoch 9/15
 - 24s - loss: 0.1135 - accuracy: 0.9546
Epoch 10/15
 - 27s - loss: 0.1104 - accuracy: 0.9559
Epoch 11/15
 - 25s - loss: 0.0978 - accuracy: 0.9604
Epoch 12/15
 - 25s - loss: 0.0872 - accuracy: 0.9651
Epoch 13/15
 - 24s - loss: 0.0798 - accuracy: 0.9696
Epoch 14/15
 - 24s - loss: 0.0758 - accuracy: 0.9702
Epoch 15/15
 - 24s - loss: 0.0769 - accuracy: 0.9665


<keras.callbacks.callbacks.History at 0x15457410470>

In [62]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 1.47
acc: 0.70


In [63]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 66.71018276762402 %
neg_acc 77.9291553133515 %


# let's test the model for a review

In [66]:
Review= "the film is a hoot and is just as good if not better than   much of whats on saturday morning tv especially the pseudo educational stuff we all cant stand"
rev = tokenizer.texts_to_sequences(Review)
rev = pad_sequences(rev, maxlen=46, dtype='int32', value =0)
print(rev)
sentiment = model.predict(rev,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[   0    0    0 ...    0    0 1056]
 [   0    0    0 ...    0    0 1970]
 [   0    0    0 ...    0    0  804]
 ...
 [   0    0    0 ...    0    0    2]
 [   0    0    0 ...    0    0    0]
 [   0    0    0 ...    0    0    0]]
positive


In [67]:
print(sentiment)

[0.00691895 0.99308103]
