<a href="https://colab.research.google.com/github/justinpatel/Text-Classification-ML/blob/master/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import tensorflow as tf
import numpy as np

# Vocabulary size: used as num_words when loading IMDB and as the Embedding input dimension
VOCAB_SIZE = 88584
# Target sequence length passed to pad_sequences for every review
MAXLEN = 250
# Mini-batch size passed to model.fit
BATCH_SIZE = 64

In [2]:
# Load the IMDB train/test splits (integer-encoded reviews plus labels), limited to VOCAB_SIZE words
(train_data, train_label), (test_data, test_label) = tf.keras.datasets.imdb.load_data(num_words=VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
# Shape before padding: one entry per review
train_data.shape

(25000,)

In [4]:
# Shape of the test split before padding
test_data.shape

(25000,)

In [5]:
# Bring every review in both splits to exactly MAXLEN tokens so they stack into 2-D arrays
train_data, test_data = (
    tf.keras.preprocessing.sequence.pad_sequences(split, maxlen=MAXLEN)
    for split in (train_data, test_data)
)

In [6]:
# Shape after padding: (number of reviews, MAXLEN)
train_data.shape

(25000, 250)

In [7]:
# Binary sentiment classifier: word embedding -> LSTM -> single sigmoid unit
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, 32))  # 32-dimensional vector per word id
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [8]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2834688   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked under the name 'acc'
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['acc'])

In [10]:
# Train for 10 epochs, holding out 20% of the training arrays for validation; keep the returned history
history = model.fit(train_data, train_label, batch_size=BATCH_SIZE, validation_split=0.2, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# Evaluate the trained model on the padded test split
result = model.evaluate(test_data, test_label)



In [12]:
# Show the evaluation result: the loss followed by the compiled 'acc' metric
print(result)

[0.5505431890487671, 0.8456400036811829]


Prediction

In [13]:
# Fetch the IMDB vocabulary mapping (word -> integer id) used to encode new text
word_index = tf.keras.datasets.imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
# Show the vocabulary size and a handful of entries instead of dumping the whole dict as cell output
len(word_index), dict(list(word_index.items())[:10])

In [16]:
def encode_text(text):
  """Convert raw text into a fixed-length sequence of vocabulary ids.

  The text is split into lowercase word tokens, each token is looked up in
  `word_index`, and the resulting id list is padded/truncated to MAXLEN.

  Args:
    text: The raw string to encode.

  Returns:
    A 1-D array of length MAXLEN containing word ids; words that are not in
    `word_index` are encoded as 0.
  """
  tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
  # Membership must be tested against the vocabulary, not the token list itself:
  # `word in tokens` is always true and made unknown words raise KeyError.
  ids = [word_index.get(word, 0) for word in tokens]
  # NOTE(review): imdb.load_data shifts word ids by its `index_from` default while
  # get_word_index() is unshifted, so these ids may not line up with the training
  # data. Confirm before changing; decode_integers relies on the unshifted ids.
  return tf.keras.preprocessing.sequence.pad_sequences([ids], maxlen=MAXLEN)[0]

In [21]:
# Encode a sample sentence and inspect the padded id sequence
text = "that movie was just amazing!"
encoded = encode_text(text)
print(encoded)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0  12  1

In [27]:
# Inverse vocabulary lookup: integer id -> word
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

def decode_integers(integers):
  """Turn a sequence of word ids back into a space-separated string.

  Args:
    integers: Iterable of word ids; entries equal to 0 are treated as padding
      and skipped.

  Returns:
    The decoded words joined by single spaces ("" when nothing remains).

  Raises:
    KeyError: If a non-padding id is missing from `reverse_word_index`.
  """
  pad = 0
  # str.join avoids repeated string concatenation and the trailing-space slice
  return " ".join(reverse_word_index[num] for num in integers if num != pad)

In [28]:
# Round-trip check: decoding the encoded sample should give back its words
print(decode_integers(encoded))

that movie was just amazing


In [35]:
def predict(text):
  """Encode `text`, run it through `model`, and print the first prediction row.

  Args:
    text: Raw string to score.
  """
  # Single-row float batch of shape (1, MAXLEN) holding the encoded text
  batch = np.asarray(encode_text(text), dtype=np.float64).reshape(1, MAXLEN)
  scores = model.predict(batch)
  print(scores[0])

In [41]:
# Score a sample review that mixes negative and positive wording
positive_review = "I thought that movie sucks, but it was really awesome and great"
predict(positive_review)

[0.6631945]


In [42]:
# Persist the trained model to text_classification_v1.h5 in the working directory
model.save("text_classification_v1.h5")