In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
vocab_size = 10000
embedding_dim = 16
max_length = 500
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 35000

In [2]:
reviews = pd.read_csv('IMDB Dataset.csv')
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [74]:
sentences = reviews['review']
labels = reviews['sentiment']

In [7]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english')) 

In [18]:
filtered_sentences = []
for s in sentences:
    word_tokens = word_tokenize(s) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentences.append(" ".join(filtered_sentence))

In [21]:
print(len(filtered_sentences[0]))

1339


In [20]:
len(filtered_sentences)

50000

In [75]:
labels

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [76]:
def sentiment(word):
    if word == 'positive':
        return 1
    else:
        return 0
labels = labels.apply(sentiment)

In [77]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [78]:
training_labels.value_counts()

0    17510
1    17490
Name: sentiment, dtype: int64

In [30]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [79]:
print(len(training_sequences))
print(training_padded)

35000
[[ 28   5   2 ...   0   0   0]
 [  4 388 119 ...   0   0   0]
 [ 11 190  12 ...   0   0   0]
 ...
 [ 10 811  21 ...   0   0   0]
 [  1 125  10 ...   0   0   0]
 [ 12  18  91 ...   0   0   0]]


In [80]:
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [81]:
print(len(training_labels[training_labels==0]))

17510


In [82]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [83]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 16)           160000    
_________________________________________________________________
global_average_pooling1d_3 ( (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [84]:
num_epochs = 10
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Train on 35000 samples, validate on 15000 samples
Epoch 1/10
 - 6s - loss: 0.4860 - acc: 0.7806 - val_loss: 0.3185 - val_acc: 0.8733
Epoch 2/10
 - 5s - loss: 0.2624 - acc: 0.8986 - val_loss: 0.2661 - val_acc: 0.8980
Epoch 3/10
 - 6s - loss: 0.2159 - acc: 0.9190 - val_loss: 0.2604 - val_acc: 0.8968
Epoch 4/10
 - 6s - loss: 0.1901 - acc: 0.9286 - val_loss: 0.2616 - val_acc: 0.8999
Epoch 5/10
 - 6s - loss: 0.1714 - acc: 0.9376 - val_loss: 0.2760 - val_acc: 0.8945
Epoch 6/10
 - 6s - loss: 0.1590 - acc: 0.9412 - val_loss: 0.2751 - val_acc: 0.8970
Epoch 7/10
 - 6s - loss: 0.1452 - acc: 0.9478 - val_loss: 0.2906 - val_acc: 0.8927
Epoch 8/10
 - 6s - loss: 0.1351 - acc: 0.9523 - val_loss: 0.3038 - val_acc: 0.8917
Epoch 9/10
 - 6s - loss: 0.1263 - acc: 0.9563 - val_loss: 0.3302 - val_acc: 0.8857
Epoch 10/10
 - 6s - loss: 0.1173 - acc: 0.9605 - val_loss: 0.3330 - val_acc: 0.8881


In [112]:
def predict_sentence(sentence):
    sentence = [sentence]
    sequences = tokenizer.texts_to_sequences(sentence)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    sentiment = model.predict(padded)
    return ("positive",float(sentiment)) if sentiment > 0.5 else ("negative", float(sentiment))

# TESTING ON DARK SERIES REVIEWS

In [116]:
predict_sentence("""While Season One was OK, Season Two proved the law of diminishing returns with an increasingly 
convoluted plot and no significant new ideas. Season Three has become a dull, hyper-complex mess, where the characters 
mostly stare a) open mouthed or b) teary eyed (sometimes both!) at some 'revelation' or other in lieu of an actual plot 
or anything even remotely exciting or interesting. To distract from this narrative vacuum, a parallel world and even more 
time travel destinations, more characters, older/younger selves and parallel selves is introduced with the effect of 
creating an utterly boring and confusing shambles. The actors aren't given a chance to show their skill as they are only 
allowed to either utter some semi philosophical nonsense or revert to expressions a) and b) above. Sadly Ben Frost's 
soundtrack and the D.O.P. are wasted on this.""")

('negative', 0.017044639214873314)

In [117]:
predict_sentence("""The show wants to be a serious mystery. It want's to be grim. There is a constant synth sound in 
nearly every scene foreshadowing grim things are coming,even on the most banal scenes.It feels like someone in post 
production wanted to play around with Dolby Atmos as much as possible instead of doing a proper sound mix that supports 
the action on screen. From a technical aspect the sound is well engineered, but it doesn't do anything for the show.
And you get a perfect exercise in overacting by the semi talented cast. Every line is delivered frowned with meaning, 
swollen as if there were an underlying meaning to every word, glimpse or motion.""")

('negative', 0.06525209546089172)

In [118]:
predict_sentence("""Netflix list this as a "teen" series. I think any discerning watcher of any age will get a lot
out of this dark, compelling mystery. Set in a small German town where weird things are happening, it's a mind bending,
time shifting saga with each character given weight and importance. The acting is first rate, especially the young actors 
portraying the kids. This is not "Stranger Things" Euro style - its nothing like it. Eerie and intelligent, you have to 
really concentrate on the narrative to get all the nuances of the plot and characters. Very, very good. A solid 8 from me
and I'm really harsh with my ratings! I'm two episodes into Season 2, and it's as good as Season 1, so far. More mysteries
unfolding and characters you really care about. Recommended.""")

('positive', 0.999691367149353)