In [1]:
from keras.datasets import imdb
from keras import preprocessing

In [2]:
max_features = 10000
maxlen = 20

In [4]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [5]:
# turns lists of integers into a 2D integer tensor of shape
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [6]:
x_train.shape, x_test.shape

((25000, 20), (25000, 20))

#### building model

In [9]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

In [10]:
model = Sequential()
# why 8 dimensional
model.add(Embedding(10000,8, input_length=maxlen))
# has to be flatten before connecting to dense layer
model.add(Flatten())

# We add the classifier on top
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Using pre-trained word embeddings

In [11]:
pwd

'/Users/i846240/projects/_deep_learning/deep-learning-with-python-notebooks'

In [13]:
data_path='/Users/i846240/projects/_deep_learning/fastai/courses/dl1/data/aclimdb/'

In [14]:
ls {data_path}

README      imdb.vocab  imdbEr.txt  [34mtest[m[m/       [34mtrain[m[m/


In [15]:
import os
train_dir = os.path.join(data_path,'train')

In [16]:
ls {train_dir}

labeledBow.feat  [34mpos[m[m/             unsupBow.feat    urls_pos.txt
[34mneg[m[m/             [34munsup[m[m/           urls_neg.txt     urls_unsup.txt


In [21]:
def load_data(dir_name):
    """
        load data from directory to list
        expects pos and neg subdirectories in dir_name
        returns texts, labels as lists
    """
    labels = []
    texts = []
    
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir,label_type)
        for fname in os.listdir(dir_name):
            if fname[-4:]=='.txt':
                f = open(os.path.join(dir_name,fname))
                texts.append(f.read())
                f.close()
                if label_type == 'neg':
                    labels.append(0)
                else:
                    labels.append(1)
    return texts, labels

In [22]:
texts, labels = load_data(train_dir)

In [24]:
len(texts)

25000

In [25]:
texts[0]

"Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form."

### Tokenize the data

In [26]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [27]:
maxlen = 100 # We will cut reviews after 100 words
training_samples = 200
validation_samples = 10000
max_words = 10000 # we will only consider the top 10,000 words in the dataset

#### https://faroit.github.io/keras-docs/1.2.2/preprocessing/text/

In [29]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [60]:
len(sequences)

25000

#### get word index from tokenizer

In [34]:
word_index = tokenizer.word_index
print('found %s unique tokens.' % len(word_index))

found 106398 unique tokens.


In [58]:
word_index['man']

129

In [36]:
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
data.shape, labels.shape

((25000, 100), (25000,))

In [37]:
x_train = data[:training_samples]
y_train = labels[:training_samples]

In [39]:
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

#### Download the GloVe word embeddings
https://nlp.stanford.edu/projects/glove/ 

In [42]:
glove_dir='/Users/i846240/projects/_deep_learning/data/glove.6B/'

In [43]:
ls {glove_dir}

glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt


In [44]:
!tail {os.path.join(glove_dir,'glove.6B.100d.txt')}

sigarms 0.18917 -0.3181 -0.43749 0.61209 -0.031357 0.1568 0.30505 -0.39915 0.30101 -0.17353 -0.032278 -0.29338 -0.16598 -0.15038 -0.29364 -0.062909 -0.32706 0.13117 0.010177 0.078467 -0.32796 0.008198 -0.34072 0.1286 -0.27844 0.41588 -0.045799 0.45221 -0.065252 -0.055483 0.50665 -0.067356 -0.29428 -0.12657 -0.024669 -0.37284 0.23507 -0.48212 0.041255 -0.081911 0.59964 0.2047 -0.23401 -0.34091 -0.13306 0.36 -0.00697 0.6013 0.50416 0.099127 -0.031218 0.40532 -0.16901 -0.4664 0.14094 0.74539 0.15076 0.18017 -0.51756 -0.59651 -0.20991 -0.5506 -0.023214 -0.4166 -0.33079 -0.048234 -0.16065 0.36674 0.0072976 0.051298 -0.38274 0.57222 -0.34032 0.16816 -0.40298 -0.12421 0.16196 -0.15993 0.43126 -0.4145 -0.48541 -0.28735 0.34193 -0.029869 0.70822 0.01381 -0.034878 -0.26084 0.20595 0.64408 -0.056202 -0.1277 0.28507 -0.16824 -0.018927 -0.24652 -0.10403 0.32856 -0.43073 -0.10308
katuna -0.21887 -0.30785 -0.28557 0.36186 -0.20892 -0.011439 -0.39597 0.27104 0.51042 -0.29147 -0.18927 0.167 -0.043189 

#### build embeddings index with word as key and embedding index as value

In [45]:
embeddings_index = {}
f = open(os.path.join(glove_dir,'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
    

Found 400000 word vectors.


#### build an embedding matrix that will be load into an Embedding layer

In [49]:
embeddings_index['man'].size

100

In [51]:
embedding_dim = 100

# initialize (10000, 100) matrix with zeros 
embedding_matrix = np.zeros((max_words, embedding_dim))

# loop through word in index 
# (all unique words from document in numeric value)
for word, i in word_index.items():
    # find word in index and index as value
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        


In [62]:
max(word_index.values())

106398