<a href="https://colab.research.google.com/github/keivanipchihagh/Intro_To_MachineLearning/blob/master/Models/Movie_Classification_with_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Classification with IMDB

##### Imports

In [1]:
import numpy as np                      # numpy
import pandas as pd                     # Pandas
from keras.datasets import imdb         # IMDB Dataset
from tensorflow import keras            # Keras
from matplotlib import pyplot as plt    # Matplotlib

##### Loading Data

In [2]:
# Load the IMDB reviews as integer sequences, restricted to the 10,000 most frequent words
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = 10000)

# Peek at the first training sample and the size of the training split
first_review, first_label = train_data[0], train_labels[0]
print('Training data [0]:', first_review)
print('Training Label [0]:', first_label)
print('Legnth:', len(train_data))

Training data [0]: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Training Label [0]: 1
Legnth: 25000


##### Get the feel of the data

In [3]:
def decode(index, sequences = None, word_index = None):  # Decoding the sequential integers into the corresponding words
  """Turn one integer-encoded review back into a space-separated string of words.

  Args:
    index: Position of the review inside `sequences`.
    sequences: Integer-encoded reviews to read from. Defaults to the global
      `train_data`, which is only meaningful while that name still holds the
      raw integer sequences (a later cell rebinds it to the vectorized matrix).
    word_index: Mapping of word -> integer index. Defaults to
      `imdb.get_word_index()`.

  Returns:
    The decoded review; any index without a matching word is rendered as '?'.
  """
  if sequences is None:
    sequences = train_data
  if word_index is None:
    word_index = imdb.get_word_index()
  reverse_word_index = {value: key for key, value in word_index.items()}
  # Encoded reviews are offset by 3 relative to the word index: with the
  # imdb.load_data defaults, the lowest indices are reserved markers
  # (start-of-sequence, out-of-vocabulary), so they fall through to '?'.
  return ' '.join(reverse_word_index.get(i - 3, '?') for i in sequences[index])

# Show the first training review as text (must run while train_data still holds the raw integer sequences)
first_review_text = decode(0)
print('Training data [0]:', first_review_text)

Training data [0]: ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have

##### Data Prep (One-Hot Encoding)

In [4]:
def vectorize_sequences(sequences, dimension = 10000):
  """Encode integer sequences as a binary (len(sequences), dimension) matrix.

  Row r has a 1.0 in every column whose index occurs in sequences[r] and 0.0
  everywhere else; repeated indices and their order are not recorded.
  """
  encoded = np.zeros((len(sequences), dimension))  # Start from an all-zero matrix
  for row, word_ids in zip(encoded, sequences):
    # `row` is a view into `encoded`, so this writes into the matrix itself
    row[word_ids] = 1.
  return encoded

# Replace both splits with their binary matrices.
# NOTE: train_data / test_data are rebound here, so this cell must run exactly
# once per kernel session (they no longer hold integer sequences afterwards).
train_data = vectorize_sequences(train_data)
test_data = vectorize_sequences(test_data)

# Cast the labels to float32 arrays
train_labels = np.asarray(train_labels).astype(np.float32)
test_labels = np.asarray(test_labels).astype(np.float32)

print("Vectorized training data: ", train_data, sep = '\n')

Vectorized training data: 
[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]


##### Building the model

In [5]:
# Two 16-unit ReLU hidden layers on top of the 10,000-wide input, then one sigmoid output unit
model = keras.models.Sequential([
  keras.layers.Dense(units = 16, activation = 'relu', input_shape = (10000,)),
  keras.layers.Dense(units = 16, activation = 'relu'),
  keras.layers.Dense(1, activation = 'sigmoid'),
])

# Both metric entries (and their order) are relied on later: the evaluation cell
# unpacks three values and the plotting cell reads history.history['acc'].
model.compile(
  optimizer = keras.optimizers.RMSprop(0.001),
  loss = keras.losses.binary_crossentropy,
  metrics = [keras.metrics.binary_accuracy, 'acc'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                160016    
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 160,305
Trainable params: 160,305
Non-trainable params: 0
_________________________________________________________________


##### Training the model

In [6]:
# Hold out the first VAL_SIZE training samples for validation and fit on the rest.
# The partial split gets its own names instead of overwriting train_data /
# train_labels, so re-running this cell does not shrink the training set again
# (or silently change which samples end up in the validation set).
VAL_SIZE = 10000

x_val = train_data[:VAL_SIZE]
partial_train_data = train_data[VAL_SIZE:]

y_val = train_labels[:VAL_SIZE]
partial_train_labels = train_labels[VAL_SIZE:]

# verbose = 0 keeps the cell output silent; the per-epoch metrics are kept in `history`
history = model.fit(partial_train_data, partial_train_labels, batch_size = 512, epochs = 5, verbose = 0, validation_data = (x_val, y_val))

##### Evaluating The Model

In [None]:
# evaluate() returns the loss followed by the compiled metrics, in compile order
test_loss, test_binary_acc, test_acc = model.evaluate(test_data, test_labels)

# Report the loss as-is and both accuracies as percentages
for label, value in (('Loss:', test_loss), ('Binary Accuracy:', test_binary_acc * 100), ('Accuracy:', test_acc * 100)):
  print(label, value)

##### Statistics

In [None]:
# Draw the statistics for the training & validation process

epochs = range(1, len(history.history['acc']) + 1)

def plot_metric(key, short_name, long_name):
  """Plot the training and validation curves of one metric recorded in `history`.

  Args:
    key: Name of the metric in history.history; its validation counterpart is
      read from 'val_' + key.
    short_name: Text used in the legend entries.
    long_name: Text used in the title and (capitalized) as the y-axis label.
  """
  plt.figure()  # Start each metric on a fresh figure
  plt.plot(epochs, history.history[key], 'b', label = 'Training ' + short_name)
  plt.plot(epochs, history.history['val_' + key], 'r', label = 'Validation ' + short_name)
  plt.title('Training and validation ' + long_name)
  plt.xlabel('Epochs')
  plt.ylabel(long_name.capitalize())  # 'Loss' / 'Accuracy' (the accuracy plot used to be labelled 'Loss')
  plt.legend()
  plt.show()

plot_metric('loss', 'loss', 'loss')
plot_metric('acc', 'acc', 'accuracy')