## Neural network with no hidden layers

In [1]:
import numpy as np
import pandas as pd
# Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, Dropout, Embedding, Flatten
from tensorflow.keras.utils import set_random_seed
# Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Train test split

In [2]:
# Load the pre-trained GloVe vectors into a {token: vector} lookup table.
# Each non-empty line is split on whitespace: the first field is the token,
# the remaining fields are its coefficients (parsed as float32).
filename = '../data/glove.6B.50d.txt'
embeddings_index = {}
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(filename, encoding="utf8") as file:
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [3]:
# Read the review dataset and show a preview followed by the class balance.
data = pd.read_csv('../data/imdb_movie.zip')
# Label encoding as recorded by the original author:
# 0 is positive
# 1 is negative
# NOTE(review): many IMDB sentiment CSVs use the opposite convention
# (1 = positive) -- confirm against the dataset before interpreting predictions.
print(data.head(), data['label'].value_counts(), sep='\n')

                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1
label
0    20019
1    19981
Name: count, dtype: int64


In [4]:
# Pull the raw review texts and their labels out of the frame as NumPy arrays.
reviews, labels = (np.array(data[column]) for column in ('text', 'label'))

In [5]:
# Fit the tokenizer on every review, then turn each review into a list of
# integer word indices.
t = Tokenizer()
t.fit_on_texts(reviews)
encoded_docs = t.texts_to_sequences(reviews)

# Pad at the end ('post') so all sequences share one length; no maxlen is
# given, so that length is set by the longest review.
padded_docs = pad_sequences(encoded_docs, padding='post')

# One extra slot on top of the known words (presumably for the padding
# index 0 -- confirm against the Tokenizer documentation).
vocab_size = 1 + len(t.word_index)
input_length = padded_docs.shape[-1]
print(f"{vocab_size} {input_length}")

112204 2493


In [6]:
# Build the weight matrix for the Embedding layer: one 50-dimensional row per
# tokenizer index (vocab_size rows in total), initialised to zeros.
# Row i receives the GloVe vector of the word the tokenizer mapped to index i.
# Words with no GloVe entry, and any index that word_index never assigns
# (presumably index 0, used for padding), keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 50))
for token, row in t.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]

In [7]:
# Model inputs and targets.
# x holds token ids that the Embedding layer uses as row indices, so keep them
# as integers rather than casting to float32 (floats are only exact up to 2**24
# and would have to be cast back to integers for the lookup anyway).
x = padded_docs.astype(np.int32)
# y stays float32: it is compared against the sigmoid output by the
# binary cross-entropy loss.
y = labels.astype(np.float32)

In [16]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable from run to run.
set_random_seed(42)

# Define the model: frozen pre-trained embeddings -> flatten -> one sigmoid unit
# (no hidden layers).
model = Sequential()
e = Embedding(input_dim=vocab_size,
              # Take the vector width from the matrix itself so the two cannot drift apart.
              output_dim=embedding_matrix.shape[1],
              weights=[embedding_matrix],
              input_length=input_length,
              # Keep the pre-trained vectors fixed; only the Dense layer is trained.
              trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# Keep the History object so per-epoch loss/accuracy can be inspected later
# (this also stops its repr from being echoed as the cell output).
history = model.fit(x, y, epochs=20, verbose=1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 2493, 50)          5610200   
                                                                 
 flatten_1 (Flatten)         (None, 124650)            0         
                                                                 
 dense_1 (Dense)             (None, 1)                 124651    
                                                                 
Total params: 5,734,851
Trainable params: 124,651
Non-trainable params: 5,610,200
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x29ae4e3e140>