In [3]:
from tqdm import tqdm

In [4]:
# List the pre-trained GloVe embedding files available in the input directory
!ls ../input/glove-global-vectors-for-word-representation

glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.50d.txt


In [5]:
import os

imdb_dir = '../input/keras-imdb/aclImdb_v1/aclImdb' # Data directory
train_dir = os.path.join(imdb_dir, 'train') # Get the path of the train set

# Parallel lists: texts[i] is the raw review and labels[i] its class
# (0 = negative, 1 = positive)
labels = []
texts = []

# First go through the negatives, then through the positives
for label_type in ['neg', 'pos']:
    # Each class lives in its own sub directory of the train set
    dir_name = os.path.join(train_dir, label_type)
    print('loading ',label_type)
    # Loop over all files in path
    for fname in tqdm(os.listdir(dir_name)):

        # Only consider text files
        if fname[-4:] == '.txt':
            # The context manager closes the file even if reading fails, and
            # the explicit encoding keeps decoding independent of the
            # platform's default locale.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            # Attach the corresponding label
            labels.append(0 if label_type == 'neg' else 1)

loading  neg


100%|██████████| 12500/12500 [00:10<00:00, 1153.23it/s]
  1%|          | 112/12500 [00:00<00:11, 1115.41it/s]

loading  pos


100%|██████████| 12500/12500 [00:09<00:00, 1279.97it/s]


We should have 25,000 texts and labels.

In [6]:
# Sanity check: both lists must contain the same number of entries
(len(labels), len(texts))

(25000, 25000)

Half of the reviews are positive

In [7]:
import numpy as np
# Labels are 0/1, so their mean is the fraction of positive reviews
np.asarray(labels).mean()

0.5

Let's look at a positive review:

In [8]:
# Index of one review from the positive half of the data
positive_idx = 24002
print('Label', labels[positive_idx])
print(texts[positive_idx])

Label 1
It is a story of Siberian village people from the beginning of 20th century till the 60ties. It is about passion and feelings, about Russian soul, and very romantic. This movie IS NOT action packed, it flowes slowely. In second part one can find great songs - Russian romances. It is much more better than Doctor Zhivago. The director of this movie moved to America and made Runaway Train for example.


And a negative review:

In [9]:
# Index of one review from the negative half of the data
negative_idx = 1
print('Label', labels[negative_idx])
print(texts[negative_idx])

Label 0
What a terrible, TERRIBLE, film! One of the worst movies I have seen in my life. I usually love movies like this, the whole "A guy meets an eccentric woman who he likes, but he happens to already be involved with someone, who not right for him....". I expected something predictable and I didn't mind. The movies are always entertaining mixing the right amount of romance with comedy, but not this one! Every single joke falls flat and the "romance" makes me want to vomit. The title character is one of the most "please kill me" characters that I have ever witnessed on my television, the "eccentric woman" isn't very eccentric, more like quirky and annoying. The "other someone" is the most reasonable, mature person in this film but also happens to be just as annoying. This films flat out sucks, there's no way around it, don't waste your time.


**Tokenizing text**
Computers can not work with words directly. To them, a word is just a meaningless row of characters. To work with words, we need to turn words into so called 'Tokens'. A token is a number that represents that word. Each word gets assigned a token. Tokens are usually assigned by word frequency. The most frequent words like 'a' or 'the' get tokens like 1 or 2 while less often used words like 'profusely' get assigned very high numbers.

We can tokenize text directly with Keras. When we tokenize text, we usually choose a maximum number of words we want to consider, our vocabulary so to speak. This prevents us from assigning tokens to words that are hardly ever used, mostly because of typos or because they are not actual words or because they are just very uncommon. This prevents us from overfitting to texts that contain strange words or weird spelling errors. Words beyond that cutoff point still receive an index in the tokenizer's `word_index`, but they are dropped when texts are converted to sequences (unless an `oov_token` is configured); the token 0 is never assigned to a word and is used for padding.

In [10]:
from keras.preprocessing.text import Tokenizer
import numpy as np

# Vocabulary cap: only the 10K most used words in this dataset are considered
max_words = 10000

# Set up the tokenizer with that cap
tokenizer = Tokenizer(num_words=max_words)
# Generate tokens by counting word frequency over all reviews
tokenizer.fit_on_texts(texts)
# Turn every review into a sequence of numbers
sequences = tokenizer.texts_to_sequences(texts)

Using TensorFlow backend.


In [11]:
# Mapping from word to its token, taken from the fitted tokenizer
word_index = tokenizer.word_index

# Print the token of a few sample words (keys are looked up in lower case)
for caption, key in (('Token for "the"', 'the'),
                     ('Token for "Movie"', 'movie'),
                     ('Token for "generator"', 'generator')):
    print(caption, word_index[key])

Token for "the" 1
Token for "Movie" 17
Token for "generator" 20287


In [12]:
# Peek at the first ten tokens of the tokenized review at index 24002
sequences[24002][0:10]

[9, 6, 3, 62, 4, 2054, 81, 36, 1, 451]

In [13]:
from keras.preprocessing.sequence import pad_sequences
# Every sequence is brought to exactly this many tokens
maxlen = 100
data = pad_sequences(sequences=sequences, maxlen=maxlen)
# Expect one row per review and maxlen columns
print(data.shape)

(25000, 100)


In [14]:
# Convert to an array so the labels can be reordered with an index array
labels = np.asarray(labels)

# Shuffle data and labels with the same permutation. A dedicated, seeded
# generator makes the train/validation split reproducible across runs
# without touching NumPy's global random state.
split_rng = np.random.RandomState(42)
indices = split_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

training_samples = 20000  # We will be training on 20K samples
validation_samples = 5000  # We will be validating on 5K samples

# Fail loudly instead of silently ending up with a truncated validation slice
assert training_samples + validation_samples <= data.shape[0], \
    'not enough samples for the requested train/validation split'

# Split data: the first block is used for training, the next one for validation
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

In [15]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Size of each learned word vector
embedding_dim = 50

# Embedding -> Flatten -> small dense classifier with a sigmoid output
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           500000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 5000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                160032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 660,065
Trainable params: 660,065
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [17]:
# Train for 10 epochs in batches of 32, passing the held-out split as validation data
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Note that training your own embeddings is prone to overfitting. As you can see, our model achieves 100% accuracy on the training set but only 83% accuracy on the validation set, a clear sign of overfitting. In practice, it is therefore quite rare to train new embeddings unless you have a massive dataset. Much more commonly, pre-trained embeddings are used. A common pre-trained embedding is GloVe, Global Vectors for Word Representation. It has been trained on billions of words from Wikipedia and the Gigaword 5 dataset, more than we could ever hope to train from our movie reviews. After downloading the GloVe embeddings from the GloVe website, we can load them into our model:

In [19]:
glove_dir = '../input/glove-global-vectors-for-word-representation' # This is the folder with the dataset

print('Loading word vectors')
embeddings_index = {} # We create a dictionary of word -> embedding

# In the dataset, each line represents a new word embedding
# The line starts with the word and the embedding values follow
# The context manager closes the file even if parsing fails; the explicit
# encoding avoids decode errors on platforms whose default is not UTF-8.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0] # The first value is the word, the rest are the values of the embedding
        embedding = np.asarray(values[1:], dtype='float32') # Load embedding
        embeddings_index[word] = embedding # Add embedding to our embedding dictionary

print('Found %s word vectors.' % len(embeddings_index))

1625it [00:00, 16247.03it/s]

Loading word vectors


400000it [00:24, 16457.78it/s]

Found 400000 word vectors.





In [20]:
# Create a matrix of all embeddings (one row per word).
# np.stack expects a sequence of arrays; passing the dict view directly is
# deprecated and rejected by newer NumPy versions, so materialize it as a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean = all_embs.mean() # Calculate mean over all values
emb_std = all_embs.std() # Calculate standard deviation over all values
emb_mean,emb_std

  if (await self.run_code(code, result,  async_=asy)):


(0.004451992, 0.4081574)

In [21]:
embedding_dim = 100 # We now use larger embeddings

word_index = tokenizer.word_index
# Tokens start at 1 (0 is what padding fills in), so the rows actually in use
# are 0..len(word_index); cap that count at the vocabulary size.
nb_words = min(max_words, len(word_index) + 1)

# Create a random matrix with the same mean and std as the embeddings.
# It always has max_words rows so that it fits an Embedding(max_words, ...)
# layer even when the vocabulary is smaller than max_words.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embedding_dim))

# The vectors need to be in the same position as their index.
# Meaning a word with token 1 needs to be in the second row (rows start with zero) and so on

# Loop over all words in the word index
for word, i in word_index.items():
    # If we are above the amount of words we want to use we do nothing
    if i >= nb_words:
        continue
    # Get the embedding vector for the word
    embedding_vector = embeddings_index.get(word)
    # If there is an embedding vector, put it in the embedding matrix;
    # words without one keep their random initialization
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
# Same architecture as before, but the embedding layer is initialized from
# embedding_matrix and frozen so training leaves it unchanged
model = Sequential([
    Embedding(max_words,
              embedding_dim,
              input_length=maxlen,
              weights=[embedding_matrix],
              trainable=False),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 320,065
Non-trainable params: 1,000,000
_________________________________________________________________


In [23]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [24]:
# Train for 10 epochs in batches of 32, passing the held-out split as validation data
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Demo on a positive text
my_text = 'I love dogs. Dogs are the best. They are lovely, cuddly animals that only want the best for humans.'

# Tokenize with the tokenizer fitted on the reviews
raw_seq = tokenizer.texts_to_sequences([my_text])
print('raw seq:', raw_seq)
# Pad to the input length the model was built with
seq = pad_sequences(raw_seq, maxlen=maxlen)
print('padded seq:', seq)
# The sigmoid output is read as the positivity score
prediction = model.predict(seq)
print('positivity:', prediction)

raw seq: [[10, 116, 2518, 2518, 23, 1, 115, 33, 23, 1331, 1383, 12, 61, 178, 1, 115, 15, 1707]]
padded seq: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   10  116
  2518 2518   23    1  115   33   23 1331 1383   12   61  178    1  115
    15 1707]]
positivity: [[0.98324835]]


In [26]:
# Demo on a negative text
my_text = 'The bleak economic outlook will force many small businesses into bankruptcy.'

# Tokenize with the tokenizer fitted on the reviews
raw_seq = tokenizer.texts_to_sequences([my_text])
print('raw seq:', raw_seq)
# Pad to the input length the model was built with
seq = pad_sequences(raw_seq, maxlen=maxlen)
print('padded seq:', seq)
# The sigmoid output is read as the positivity score
prediction = model.predict(seq)
print('positivity:', prediction)

raw seq: [[1, 3762, 7037, 77, 1145, 108, 389, 80]]
padded seq: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    1 3762 7037   77 1145  108
   389   80]]
positivity: [[0.07994457]]
