In [4]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd

In [21]:
data = keras.datasets.imdb

In [22]:
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=10000)
# only words that appera over 10000 times

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [23]:
word_index = data.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [24]:
word_index = {k:(v+3) for k,v in word_index.items()}

In [25]:
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<INUSED>"] = 3

In [26]:
reverse_word_index = dict([(value, key) for (key,value) in word_index.items()])

In [9]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value = word_index["<PAD>"], padding="post", maxlen = 250)
test_data  = keras.preprocessing.sequence.pad_sequences(test_data, value = word_index["<PAD>"], padding="post", maxlen = 250)
# data must have the same len

In [10]:
print(len(test_data[0]),len(test_data[3]))

250 250


In [11]:
def decode_review(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [12]:
print(decode_review(test_data[0]))

<START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PA

In [13]:
# now defining model
model = keras.Sequential()
model.add(keras.layers.Embedding(88000, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1,  activation="sigmoid"))

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          1408000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,408,289
Trainable params: 1,408,289
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
x_val = train_data[:10000]
x_train = train_data[10000:]

y_val = train_labels[:10000]
y_train = train_labels[10000:]

In [16]:
fitModel = model.fit(x_train, y_train, epochs = 35, batch_size = 512, validation_data = (x_val, y_val), verbose = 1)

Train on 15000 samples, validate on 10000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


In [17]:
result = model.evaluate(test_data, test_labels)
print(result)

[0.3151730310535431, 0.87368]


In [18]:
test_review = test_data[0]
predict = model.predict([test_review])
print("Review: ")
print(decode_review(test_review))
print("Prediction: " + str(predict[0]))
print("Accual: " + str(test_labels[0]))
print(result)

Review: 
<START> please give this one a miss br br <UNK> <UNK> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite <UNK> so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [5]:
def  review_encode(s):
    encoded = [1]
    for word in s:
        if word.lower() in word_index:
            encoded.append(word_index[word.lower()])
        else:
            encoded.append(2)
    return encoded

In [20]:
model.save("model.h5")

In [6]:
model = keras.models.load_model("model.h5")

In [22]:
with open("test.txt") as infile, open('output.txt', 'w') as outfile:
    outfile.write(infile.read().replace(',', '').replace(',', '').replace('.', '')
                  .replace('(', '').replace(')', '').replace('\"', '').replace(':', '')
                  .replace('\n', ' ').replace('\r', ''))
    

In [239]:
movies_reviews = pd.read_csv('movies_reviews.csv')

In [240]:
movies_reviews

Unnamed: 0,Title,Reviews
0,The Irishman,"""When I love a movie as much as I loved The Ir..."
1,The Irishman,", The last 5 minutes of the movie explain ever..."
2,The Irishman,", I was able to attend the NYC premiere this a..."
3,Frozen II,"""(This has a post-credit scene in case youre w..."
4,Frozen II,"This is a beautiful movie, and Elsas journey ..."
5,Frozen II,Its not hard to imagine how big this movie wi...
6,Knives Out,"""Best movie of the year, love everything about..."
7,Knives Out,Reading through the reviews (and assuming tha...
8,Knives Out,", What an excellent film by Rian Johnson; defi..."


In [241]:
def format_review(review_string):
    '''Formating reviews string and separating them to list elements.'''
    review_string.replace(',', '').replace(',', '').replace('.', '').replace('(', '').replace(')', '').replace('\"', '').replace(':', '').replace('\n', ' ').replace('\r', '')
    
    return review_string

In [242]:
movies_reviews.loc[:,'Reviews'] = movies_reviews.loc[:,'Reviews'].apply(format_review)

In [243]:
movies_reviews.loc[:,'Encoded_Reviews'] = movies_reviews.loc[:,'Reviews'].apply(review_encode)
movies_reviews

Unnamed: 0,Title,Reviews,Encoded_Reviews
0,The Irishman,"""When I love a movie as much as I loved The Ir...","[1, 2, 1992, 2023, 963, 3363, 2, 13, 2, 2014, ..."
1,The Irishman,", The last 5 minutes of the movie explain ever...","[1, 2, 2, 830, 2023, 963, 2, 2014, 6, 590, 830..."
2,The Irishman,", I was able to attend the NYC premiere this a...","[1, 2, 2, 13, 2, 1992, 6, 590, 2, 6, 503, 2014..."
3,Frozen II,"""(This has a post-credit scene in case youre w...","[1, 2, 2, 830, 2023, 13, 590, 2, 2023, 6, 590,..."
4,Frozen II,"This is a beautiful movie, and Elsas journey ...","[1, 2, 830, 2023, 13, 590, 2, 13, 590, 2, 6, 2..."
5,Frozen II,Its not hard to imagine how big this movie wi...,"[1, 2, 13, 830, 590, 2, 3363, 1604, 830, 2, 20..."
6,Knives Out,"""Best movie of the year, love everything about...","[1, 2, 503, 963, 590, 830, 2, 1983, 1604, 1964..."
7,Knives Out,Reading through the reviews (and assuming tha...,"[1, 2, 1479, 963, 6, 1095, 13, 3363, 1331, 2, ..."
8,Knives Out,", What an excellent film by Rian Johnson; defi...","[1, 2, 2, 1992, 2023, 6, 830, 2, 6, 3363, 2, 9..."


In [244]:

movies_reviews.loc[:,'Preprocessed_Reviews'] = [keras.preprocessing.sequence.pad_sequences( k,  value = word_index["<PAD>"], padding="post", maxlen = 250) for index, k in movies_reviews[['Encoded_Reviews']].iterrows()]

In [245]:
movies_reviews.loc[:,'Preprocessed_Reviews'] = [keras.preprocessing.sequence.pad_sequences( k,  value = word_index["<PAD>"],) for index, k in movies_reviews[['Encoded_Reviews']].iterrows()]

In [246]:
def prediction(m):
    b = model.predict(m)
    return b

In [247]:
movies_reviews.loc[:,'Preprocessed_Reviews'] = movies_reviews.loc[:,'Preprocessed_Reviews'].apply(model.predict)

In [248]:
movies_reviews.loc[:,'Preprocessed_Reviews']

0     [[0.0011626381]]
1     [[0.0029579871]]
2     [[0.0015028614]]
3     [[0.0018451641]]
4    [[0.00078531343]]
5     [[0.0010952073]]
6     [[0.0024196669]]
7      [[0.004572668]]
8     [[0.0014141949]]
Name: Preprocessed_Reviews, dtype: object

In [249]:
movies_reviews

Unnamed: 0,Title,Reviews,Encoded_Reviews,Preprocessed_Reviews
0,The Irishman,"""When I love a movie as much as I loved The Ir...","[1, 2, 1992, 2023, 963, 3363, 2, 13, 2, 2014, ...",[[0.0011626381]]
1,The Irishman,", The last 5 minutes of the movie explain ever...","[1, 2, 2, 830, 2023, 963, 2, 2014, 6, 590, 830...",[[0.0029579871]]
2,The Irishman,", I was able to attend the NYC premiere this a...","[1, 2, 2, 13, 2, 1992, 6, 590, 2, 6, 503, 2014...",[[0.0015028614]]
3,Frozen II,"""(This has a post-credit scene in case youre w...","[1, 2, 2, 830, 2023, 13, 590, 2, 2023, 6, 590,...",[[0.0018451641]]
4,Frozen II,"This is a beautiful movie, and Elsas journey ...","[1, 2, 830, 2023, 13, 590, 2, 13, 590, 2, 6, 2...",[[0.00078531343]]
5,Frozen II,Its not hard to imagine how big this movie wi...,"[1, 2, 13, 830, 590, 2, 3363, 1604, 830, 2, 20...",[[0.0010952073]]
6,Knives Out,"""Best movie of the year, love everything about...","[1, 2, 503, 963, 590, 830, 2, 1983, 1604, 1964...",[[0.0024196669]]
7,Knives Out,Reading through the reviews (and assuming tha...,"[1, 2, 1479, 963, 6, 1095, 13, 3363, 1331, 2, ...",[[0.004572668]]
8,Knives Out,", What an excellent film by Rian Johnson; defi...","[1, 2, 2, 1992, 2023, 6, 830, 2, 6, 3363, 2, 9...",[[0.0014141949]]
