In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import tensorflow as tf

In [0]:
tf.__version__

'1.15.0'

In [0]:
import numpy as np

In [0]:
f = open('drive/My Drive/AFC/embeddings/glove.6B.100d.txt', 'r', encoding='utf8')
lines = f.readlines()
word2index = {}
embeddings = np.zeros(shape=(len(lines) + 1, len(lines[0].split()[1:])), dtype=np.float32)
for idx, line in enumerate(lines):
    line = line.split()
    word2index[line[0]] = len(word2index) + 1
    embeddings[idx + 1] = np.array(line[1:], dtype=np.float32)

In [0]:
embeddings.shape

(400001, 100)

In [0]:
word_ids = tf.keras.Input([None], dtype=tf.int32)

emb = tf.keras.layers.Embedding(embeddings.shape[0], embeddings.shape[1], weights=[embeddings], mask_zero=True, trainable=False)(word_ids)
bid = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False))(emb)
dropout = tf.keras.layers.Dropout(0.5)(bid)
bn = tf.keras.layers.BatchNormalization()(dropout)
dense = tf.keras.layers.Dense(1, activation='sigmoid')(bn)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
model = tf.keras.Model(inputs=word_ids, outputs=dense)

In [0]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.001), 
              loss='binary_crossentropy')

In [0]:
import pandas as pd

In [0]:
d = pd.read_csv('drive/My Drive/AFC/dataset/IMDB Dataset.csv')

In [0]:
d.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [0]:
import tqdm

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
import re

In [0]:
def tokenize(sentence):
  #remove HTML tags
  s = re.sub('<[^>]*>', ' ',sentence.lower())
  #remove symbols
  s = re.sub('[^a-z0-9\']', ' ', s)
  s = re.sub('([\'])([ ]*)', r' \1', s)
  return s.strip().split()

In [0]:
train = []
for sentence in tqdm.tqdm(d['review'].values):
  sentence = tokenize(sentence)
  train.append([word2index[w] if w in word2index else word2index['unk'] for w in sentence])

100%|██████████| 50000/50000 [00:07<00:00, 6439.90it/s]


In [0]:
X_train, X_test, y_train, y_test = train_test_split(train, d['sentiment'].values, test_size=0.3, random_state=42)

In [0]:
y_train

array(['negative', 'positive', 'negative', ..., 'negative', 'positive',
       'positive'], dtype=object)

In [0]:
def accuracy(y_true, y_pred):
  assert len(y_true) == len(y_pred)
  c = 0
  for true, pred in zip(y_true, y_pred):
    if true == pred:
      c += 1
  return c / len(y_true)

In [0]:
batch_size = 16
steps = len(X_train) // batch_size
epochs = 2
for epoch in range(epochs):
  avg_loss = 0
  for step in tqdm.tqdm(range(steps), desc='Epoch ' + str(epoch + 1) + '/' + str(epochs)):

    l = model.train_on_batch(x=tf.keras.preprocessing.sequence.pad_sequences(X_train[step * batch_size: (step + 1) * batch_size], padding='post'),
                             y=[0 if val == 'positive' else 1 for val in y_train[step * batch_size: (step + 1) * batch_size]])

    avg_loss += l

    if (step > 0) and step % 500 == 0 or (step == steps - 1):
          print('Loss:', avg_loss / step)

  y_true = []
  y_pred = []
  for x, y in tqdm.tqdm(zip(X_test, y_test), total=len(X_test), desc='Evaluating model on test set'):
    pred = model.predict([[x]]).reshape(-1)[0]
    y_pred.append(int(round(pred)))
    y_true.append(0) if y == 'positive' else y_true.append(1)
  print('accuracy:', accuracy(y_true, y_pred))
  

Epoch 1/2:  23%|██▎       | 501/2187 [22:32<1:22:58,  2.95s/it]

Loss: 0.24113700525462628


Epoch 1/2:  46%|████▌     | 1001/2187 [45:31<1:08:41,  3.48s/it]

Loss: 0.2422814720068127


Epoch 1/2:  69%|██████▊   | 1501/2187 [1:08:07<23:45,  2.08s/it]

Loss: 0.23739955872669816


Epoch 1/2:  91%|█████████▏| 2001/2187 [1:30:56<07:00,  2.26s/it]

Loss: 0.23255186537606642


Epoch 1/2: 100%|██████████| 2187/2187 [1:39:18<00:00,  2.72s/it]
Evaluating model on test set:   0%|          | 0/15000 [00:00<?, ?it/s]

Loss: 0.23093285982787595


Evaluating model on test set: 100%|██████████| 15000/15000 [20:57<00:00, 11.93it/s]
Epoch 2/2:   0%|          | 0/2187 [00:00<?, ?it/s]

accuracy: 0.8826


Epoch 2/2:  23%|██▎       | 501/2187 [22:36<1:23:43,  2.98s/it]

Loss: 0.213968127399683


Epoch 2/2:  46%|████▌     | 1001/2187 [45:48<1:11:12,  3.60s/it]

Loss: 0.21588932528253646


Epoch 2/2:  69%|██████▊   | 1501/2187 [1:08:36<24:01,  2.10s/it]

Loss: 0.213490092072015


Epoch 2/2:  91%|█████████▏| 2001/2187 [1:31:24<07:01,  2.27s/it]

Loss: 0.20774362299824134


Epoch 2/2: 100%|██████████| 2187/2187 [1:39:49<00:00,  2.74s/it]
Evaluating model on test set:   0%|          | 0/15000 [00:00<?, ?it/s]

Loss: 0.20567268394027502


Evaluating model on test set: 100%|██████████| 15000/15000 [20:46<00:00, 12.03it/s]

accuracy: 0.8838666666666667





In [0]:
model.save('drive/My Drive/AFC/models/sentiment_classifier_sigmoid.h5')

In [0]:
model.load_weights('drive/My Drive/AFC/models/sentiment_classifier_sigmoid.h5')

In [0]:
y_true = []
y_pred = []
for x, y in tqdm.tqdm(zip(X_test, y_test), total=len(X_test)):
  pred = model.predict([[x]]).reshape(-1)[0]
  y_pred.append(int(round(pred)))
  y_true.append(0) if y == 'positive' else y_true.append(1)
print('accuracy:', accuracy(y_true, y_pred))

100%|██████████| 15000/15000 [20:13<00:00, 12.36it/s]

accuracy: 0.8881333333333333





In [0]:
#5/5 epochs