In [0]:
import tensorflow as tf
!wget -q http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar zxf aclImdb_v1.tar.gz
#!tree -d aclImdb

In [0]:
# Shared configuration for the cells below.
NUM_WORDS = 10_000    # vocabulary cap given to the tokenizer and embedding layers
SEQ_LEN = 500         # length every encoded review is padded/truncated to
EMBEDDING_SIZE = 128  # output width of the embedding layers trained from scratch
BATCH_SIZE = 128      # batch size passed to model.fit
EPOCHS = 5            # intended number of training epochs
THRESHOLD = 0.5       # presumably the decision cut-off on the sigmoid output - TODO confirm

In [3]:
import pandas as pd
import glob
import os
import string

def get_dfs(start_path, random_state=None):
  """Load the reviews under ``start_path`` into a shuffled DataFrame.

  ``start_path`` must contain a ``pos`` and a ``neg`` sub-directory. Every
  regular file directly inside them is read as one review.

  Args:
    start_path: directory holding the ``pos`` and ``neg`` folders.
    random_state: optional seed forwarded to ``DataFrame.sample`` so the
      shuffle can be reproduced. ``None`` (the default) keeps the
      unseeded shuffle.

  Returns:
    A DataFrame with a ``text`` column (review text with line breaks
    replaced by spaces) and a ``sent`` column (1 for ``pos``, 0 for
    ``neg``), rows shuffled and re-indexed from 0.
  """
  text = []
  sent = []
  for p in ['pos', 'neg']:
    path = os.path.join(start_path, p)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # pre-shuffle order (and therefore a seeded shuffle) reproducible.
    files = sorted(f for f in os.listdir(path)
                   if os.path.isfile(os.path.join(path, f)))
    for f in files:
      # Explicit encoding so decoding does not depend on the platform default.
      with open(os.path.join(path, f), "r", encoding="utf-8") as myfile:
        # replace carriage return linefeed with spaces
        text.append(myfile.read()
                    .replace("\n", " ")
                    .replace("\r", " "))
      # convert positive reviews to 1 and negative reviews to zero
      sent.append(1 if p == 'pos' else 0)

  # Build the frame in one step from the collected columns.
  df = pd.DataFrame({'text': text, 'sent': sent})
  # This line shuffles the data so you don't end up with contiguous
  # blocks of positive and negative reviews
  df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
  return df

# Build one shuffled frame per dataset split (train first, then test),
# then preview the first training rows.
train_df, test_df = [get_dfs(split_dir)
                     for split_dir in ("aclImdb/train/", "aclImdb/test/")]

train_df.head()


Unnamed: 0,text,sent
0,"""Sundown:The Vampire in Retreat"" is a rubbish....",0
1,Still love it 17 or so years after the first t...,1
2,"I love a good war film and I fall into the ""be...",0
3,I remember originally seeing this film at Radi...,1
4,"Watching CBS's ""Surrender, Dorothy"", I kept wo...",0


In [0]:
# Fit the vocabulary on the training reviews only; '<UNK>' is the
# placeholder token for out-of-vocabulary words.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=NUM_WORDS,
    oov_token='<UNK>',
)
tokenizer.fit_on_texts(train_df['text'])

# Encode each split as sequences of integer word indexes and bring every
# sequence to exactly SEQ_LEN entries: shorter reviews are padded at the
# end (padding="post"), longer ones are truncated.
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(train_df['text']),
    maxlen=SEQ_LEN,
    padding="post",
)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_df['text']),
    maxlen=SEQ_LEN,
    padding="post",
)

In [5]:

# Baseline classifier: an embedding layer learned from scratch feeds a
# bidirectional LSTM, and a single sigmoid unit produces the output.
model = tf.keras.Sequential(layers=[
  tf.keras.layers.Embedding(input_dim=NUM_WORDS, output_dim=EMBEDDING_SIZE),
  tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(units=20)),
  tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model.summary()

model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
bidirectional (Bidirectional (None, 40)                23840     
_________________________________________________________________
dense (Dense)                (None, 1)                 41        
Total params: 1,303,881
Trainable params: 1,303,881
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Train the baseline model, holding out 20% of the training data for
# validation. The epoch count comes from the EPOCHS constant in the
# configuration cell instead of a hard-coded literal.
history = model.fit(
    train_seqs,
    train_df['sent'].values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

# evaluate() returns the loss followed by the compiled metrics;
# index 1 is the accuracy on the test split.
model.evaluate(test_seqs, test_df['sent'].values)[1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.8451600074768066

In [7]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-04-19 21:55:33--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-04-19 21:55:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-04-19 21:55:34--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [8]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [9]:
import numpy as np
# Load GloVe pretrained embeddings
embedding_dim = 100
word_index = tokenizer.word_index

# Parse the GloVe text file into {word: vector}. Each line holds a token
# followed by its coefficients, separated by whitespace.
embeddings_index = {}
# The file name is derived from embedding_dim so the two cannot drift apart.
# The encoding is explicit so parsing does not depend on the platform
# default, and `with` closes the handle even if a line fails to parse.
with open('glove.6B.%dd.txt' % embedding_dim, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Word vectors: %s' % len(embeddings_index))
print('Embedding size: %s'% embedding_dim)

# Row i of the matrix holds the GloVe vector of the word whose tokenizer
# index is i. word_index covers every word seen during fitting, so indexes
# at or above NUM_WORDS are skipped. Rows for words missing from GloVe
# stay all-zero.
embedding_matrix = np.zeros((NUM_WORDS, embedding_dim))
for word, i in word_index.items():
    if i < NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Word vectors: 400000
Embedding size: 100


In [11]:
# Same architecture as the baseline, but the embedding layer starts from
# the GloVe matrix and is frozen. Passing the matrix through a Constant
# initializer (instead of calling set_weights on model.layers[0]) does not
# depend on the Sequential model already being built, and the embedding
# width comes from embedding_dim rather than a repeated literal.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(
      NUM_WORDS,
      embedding_dim,
      embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
      trainable=False),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
  tf.keras.layers.Dense(1, activation='sigmoid')])

model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Epoch count comes from the EPOCHS constant in the configuration cell.
history = model.fit(
    train_seqs,
    train_df['sent'].values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
)

# evaluate() returns the loss followed by the compiled metrics;
# index 1 is the accuracy on the test split.
model.evaluate(test_seqs, test_df['sent'].values)[1]

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 100)         1000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 40)                19360     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 41        
Total params: 1,019,401
Trainable params: 19,401
Non-trainable params: 1,000,000
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.8131200075149536

In [12]:
# Larger variant: wider bidirectional LSTM followed by two dense ReLU
# layers before the sigmoid output.
model1 = tf.keras.Sequential([
  tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')])

model1.summary()

model1.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])

# Stop training once validation accuracy stops improving.
es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max')
callbacks=[es]
# The callback list was previously built but never handed to fit(), so
# early stopping had no effect; it is now passed explicitly. The epoch
# count comes from the EPOCHS constant in the configuration cell.
history = model1.fit(
    train_seqs,
    train_df['sent'].values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=callbacks,
)

# evaluate() returns the loss followed by the compiled metrics;
# index 1 is the accuracy on the test split.
model1.evaluate(test_seqs, test_df['sent'].values)[1]

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 1,477,889
Trainable params: 1,477,889
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.852400004863739