# Strings to numbers

In [2]:
vocab = {} #maps words to integer representing it
word_encoding = 1
def bag_of_words(text):
    global word_encoding

    words = text.lower().split(" ") #create a list of all of the words in the text
    bag = {} #stores all the encoding and their frequency
    
    for word in words:
        
        if word in vocab:
            encoding = vocab[word] #get encoding from vocab
        
        else:
            vocab[word] = word_encoding
            encoding = word_encoding
            word_encoding += 1
        
        if encoding in bag:
            bag[encoding] += 1
        else:
            bag[encoding] = 1
    return bag

text  = 'this is simple text that i have created created in a a day'
bag = bag_of_words(text)
print(bag)
print(vocab)

{1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 1, 10: 2, 11: 1}
{'this': 1, 'is': 2, 'simple': 3, 'text': 4, 'that': 5, 'i': 6, 'have': 7, 'created': 8, 'in': 9, 'a': 10, 'day': 11}


# Sentimental Analysis for Movies review

In [3]:
from keras.datasets import imdb
from keras.utils.data_utils import pad_sequences
import tensorflow as tf
import os
import numpy as np
import keras

VOCAB_SIZE = 88584
MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_lables) = imdb.load_data(num_words = VOCAB_SIZE)

In [7]:
train_data[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     1,    14,    22,    16,
          43,   530,   973,  1622,  1385,    65,   458,  4468,    66,
        3941,     4,   173,    36,   256,     5,    25,   100,    43,
         838,   112,    50,   670, 22665,     9,    35,   480,   284,
           5,   150,     4,   172,   112,   167, 21631,   336,   385,
          39,     4,   172,  4536,  1111,    17,   546,    38,    13,
         447,     4,   192,    50,    16,     6,   147,  2025,    19,
          14,    22,     4,  1920,  4613,   469,     4,    22,    71,
          87,    12,    16,    43,   530,    38,    76,    15,    13,
        1247,     4,    22,    17,   515,    17,    12,    16,   626,
          18, 19193,     5,    62,   386,    12,     8,   316,     8,
         106,     5,

***More Preprocessing***
If we have a look at some of our loaded in reviews we'll notice that they are different lengths. This is an issue.We cannot pass different length
data into our neural network.Therefore we must make each review the same length.To do this we will follow the procedure below:
• if the review is greater than 250 words then trim off the extra words
• if the review is less than 250 words add the necessary amount of's to make it equal to 250.
Luckily for us keras has a function that can do this for us:

In [5]:
train_data = pad_sequences(train_data, MAXLEN)
test_data = pad_sequences(test_data, MAXLEN)
print(train_data)

[[    0     0     0 ...    19   178    32]
 [    0     0     0 ...    16   145    95]
 [    0     0     0 ...     7   129   113]
 ...
 [    0     0     0 ...     4  3586 22459]
 [    0     0     0 ...    12     9    23]
 [    0     0     0 ...   204   131     9]]


# Creating the Model

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 32),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# Training

In [16]:
model.compile(loss="binary_crossentropy", optimizer="rmsprop", metrics=['acc'])

history = model.fit(train_data, train_labels, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Making Predictions

In [19]:
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [20]:
# while were at it lets make a decode function

reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    return text[:-1]
print(decode_integers(encoded))

that movie was just amazing so amazing


In [21]:
# Now its time to make a prediction

def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1, 250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])
    

positive_review = "That movie was awesome! I really loved it and would watch it again because it was amazingly great"
predict(positive_review)

negative_review = "That movie was sucked. I hated it and wouldn't watch it again. Was one of the worst things I've ever watched"
predict(negative_review)

[0.8197192]
[0.3272635]
