In [0]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import os
import re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every review file in `directory` into a DataFrame.

  File names are expected to look like ``<id>_<rating>.txt``. Directory
  entries whose name does not match that pattern (hidden files, stray
  folders, ...) are skipped instead of raising.

  Args:
    directory: path of a local directory containing the review files.

  Returns:
    A DataFrame with two columns: "sentence" (the file contents) and
    "sentiment" (the rating digits taken from the file name, as a string).
    The frame is empty, but still has both columns, when nothing matches.
  """
  data = {"sentence": [], "sentiment": []}
  # Sorted so the row order does not depend on the filesystem listing order.
  for file_name in sorted(os.listdir(directory)):
    # Raw string: "\d" in a normal string literal is an invalid escape.
    match = re.match(r"\d+_(\d+)\.txt", file_name)
    if match is None:
      continue
    # os.listdir already limits us to local paths, so the builtin open() is
    # enough; tf.gfile is not available in TensorFlow 2.x.
    # newline="" returns the text without line-ending translation.
    with open(os.path.join(directory, file_name), "r",
              encoding="utf-8", newline="") as f:
      data["sentence"].append(f.read())
    data["sentiment"].append(match.group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
  """Load the "pos" and "neg" sub-directories into one shuffled DataFrame.

  Args:
    directory: directory that contains the "pos" and "neg" sub-directories.
    random_state: optional seed forwarded to `DataFrame.sample` so the
      shuffle can be reproduced. The default (None) keeps the shuffle
      unseeded, as before.

  Returns:
    A DataFrame with the columns produced by `load_directory_data` plus a
    "polarity" column (1 for rows from "pos", 0 for rows from "neg"), with
    rows shuffled and the index reset to 0..n-1.
  """
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  combined = pd.concat([pos_df, neg_df])
  # frac=1 samples every row once, i.e. a full shuffle.
  shuffled = combined.sample(frac=1, random_state=random_state)
  return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetch the aclImdb archive through Keras and load its training split.

  Args:
    force_download: accepted for interface compatibility; this function
      does not consult it.

  Returns:
    The DataFrame returned by `load_dataset` for the "aclImdb/train"
    directory located next to the path returned by `get_file`.
  """
  archive_path = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)

  # The extracted "aclImdb" folder is looked up beside the returned path.
  download_root = os.path.dirname(archive_path)
  train_dir = os.path.join(download_root, "aclImdb", "train")
  return load_dataset(train_dir)


In [0]:
# Fetch the dataset and load the training reviews into a DataFrame.
train = download_and_load_datasets()
# Preview the first rows only instead of rendering the whole frame.
train.head()

In [0]:
# Pull the review texts and their 0/1 polarity labels out as NumPy arrays.
train_text, train_label = train['sentence'].values, train['polarity'].values

In [0]:
# Show one review followed by its polarity label, each on its own line.
index = 3
print(train_text[index], train_label[index], sep="\n")

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Sizes shared by the rest of the notebook.
max_words = 10000  # vocabulary cap handed to the Tokenizer (top 10,000 words)
maxlen = 200       # every review is padded/truncated to this many tokens

# Build the vocabulary from the review texts.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each review as a list of word ids, then force a fixed length.
sequences = tokenizer.texts_to_sequences(train_text)
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(train_label)

print('Shape of data tensor', data.shape)
print('Shape of labels tensor', labels.shape)

In [0]:
training_samples = 16000
validation_samples = 4000
# Not referenced by the slicing below: the test split is whatever remains.
test_samples = 5000

# Three consecutive row ranges: training, then validation, then the rest as test.
train_slice = slice(None, training_samples)
val_slice = slice(training_samples, training_samples + validation_samples)
test_slice = slice(training_samples + validation_samples, None)

x_train, y_train = data[train_slice], labels[train_slice]
x_val, y_val = data[val_slice], labels[val_slice]
x_test, y_test = data[test_slice], labels[test_slice]

# Raw text and labels of the test rows, kept for inspecting single predictions.
test_text, test_label = train_text[test_slice], train_label[test_slice]

for _caption, _array in (
    ("x_train shape:", x_train),
    ("y_train shape:", y_train),
    ("x_val shape:", x_val),
    ("y_val shape:", y_val),
    ("x_test shape:", x_test),
    ("y_test shape:", y_test),
):
  print(_caption, _array.shape)

In [0]:
#load a pre-trainied Google News w2v

!pip install nltk
!pip install gensim

import nltk
nltk.download('word2vec_sample')

from nltk.data import find
import gensim

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

#Each word is represented in the space of 300 dimensions:
embedding_dim = w2v.vector_size

print("No.of words in the vocab:", len(w2v.vocab))
print("Model dimensions:", embedding_dim)

In [0]:
# Display the word2vec vector of a sample word; a later cell compares it with
# the corresponding row of the embedding matrix.
testWord = 'boy'
w2v[testWord]

In [0]:
# Build the embedding matrix that will be loaded into the Embedding layer.
# Its shape is (max_words, embedding_dim): row i holds the embedding_dim-dimensional
# vector of the word with index i in the tokenizer's word index.
# Index 0 does not stand for any word or token -- it is a placeholder.

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
  # Rows stay all-zero for words beyond the max_words cap or absent from w2v.
  # `word in w2v` works on gensim 3.x and 4.x; `w2v.vocab` was removed in gensim 4.
  if i < max_words and word in w2v:
    embedding_vector = w2v[word]
    embedding_matrix[i] = embedding_vector

In [0]:
# Check that the embedding-matrix row for the test word equals its w2v vector.
wIndex = word_index[testWord]
# Fail loudly on a mismatch instead of relying on comparing the two outputs by eye.
assert (embedding_matrix[wIndex] == w2v[testWord]).all(), "embedding row differs from the w2v vector"
embedding_matrix[wIndex]

In [0]:
# Recap of the sizes that define the Embedding layer's input and weights.
for _caption, _value in (
    ("embedding_matrix.shape:", embedding_matrix.shape),
    ("max_words:", max_words),
    ("embedding_dim:", embedding_dim),
    ("maxlen:", maxlen),
):
  print(_caption, _value)

In [0]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Embedding lookup over maxlen token ids, flattened into one vector, followed by
# a 32-unit ReLU layer and a single sigmoid output unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [0]:
# Copy embedding_matrix into the Embedding layer (the model's first layer) and
# mark it non-trainable so its weights are left out of training.

embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.summary()

In [0]:
# RMSprop optimizer with binary cross-entropy loss; accuracy is reported as 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train for 10 epochs in batches of 32, passing the validation split to fit().
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

In [0]:
# Score the model on the test split, then keep its raw predictions in `ans`.
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

ans = model.predict(x_test)

In [0]:
# Show one test review next to its true label and the model's predicted label.
p_index = 15

# Threshold the sigmoid output at 0.5 and cast to int, so the prediction is
# always printed as a plain 0/1 like the true label.
predicted_label = int(ans[p_index][0] > 0.5)

print("Movie Review:", test_text[p_index])
print("Truth Sentiment:", test_label[p_index])
print("Predicted Sentiment:", predicted_label)