In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the IMDB reviews CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
reviews_data = pd.read_csv(filepath_or_buffer="drive/MyDrive/IMDB Dataset.csv")
reviews_data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Dimensions of the loaded frame as (rows, columns); the recorded output is (50000, 2).
reviews_data.shape

(50000, 2)

## Data Preprocessing

In [27]:
def get_reviews(data):
  """Tokenize every review in `data` and count word frequencies.

  Each review has the punctuation characters listed in `punc_str` deleted
  (the apostrophe is not in that list, so "don't" stays one token), is
  lower-cased, and is split on runs of whitespace.

  NOTE: HTML line-break tags in the raw text are not treated specially;
  their angle brackets and slash are deleted like any other punctuation,
  which leaves the letters "br" behind in the text.

  Args:
    data: pandas DataFrame whose first column (position 0) holds the raw
      review text as strings.

  Returns:
    (reviews, word_freq_dict): `reviews` is a list with one list of word
    strings per row of `data`, in row order (an empty or blank review gives
    an empty list); `word_freq_dict` maps each word to its total number of
    occurrences across all reviews.
  """
  # Built once: the table does not depend on the review being processed.
  punc_str = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~"
  punc_table = {ord(char): None for char in punc_str}

  reviews = []  # list-of-lists of words
  word_freq_dict = {}
  for sent in data.iloc[:, 0]:
    # split() with no argument collapses consecutive whitespace and returns
    # [] for a blank review, so no "" pseudo-word is ever produced
    # (splitting "" on " " would give [""]).
    word_list = sent.translate(punc_table).lower().split()
    reviews.append(word_list)
    for word in word_list:
      word_freq_dict[word] = word_freq_dict.get(word, 0) + 1
  return reviews, word_freq_dict

In [28]:
def vocab_list_sorted(word_freq_dict):
  """Rank words by descending frequency.

  Returns a dict mapping each word to its 1-based rank, so the most
  frequent word maps to 1, the next to 2, and so on. Words with equal
  counts keep the relative order they have in `word_freq_dict`.
  """
  by_freq_desc = sorted(word_freq_dict.items(), key=lambda pair: pair[1], reverse=True)
  return {word: rank for rank, (word, _count) in enumerate(by_freq_desc, start=1)}


In [34]:
def generate_file(reviews_lists, vocab_dict, max_review_len, label_char):
  """Encode tokenized reviews as fixed-length integer rows with a trailing label.

  Index scheme used in each row:
    0        left padding
    2        word that is not a key of `vocab_dict` (out-of-vocab)
    rank+3   any other word, where rank = vocab_dict[word]

  Args:
    reviews_lists: list of reviews, each a list of word strings.
    vocab_dict: dict mapping word -> rank.
    max_review_len: reviews with more words than this are skipped; shorter
      ones are left-padded with 0 up to this length.
    label_char: value appended as the last element of every row.

  Returns:
    List of rows; each row has max_review_len indices followed by label_char.
  """
  rank_shift = 3  # added to every in-vocab rank
  oov_index = 2   # stands in for words missing from vocab_dict

  encoded_rows = []
  for tokens in reviews_lists:
    if len(tokens) > max_review_len:
      continue  # too long: produce no row for this review

    row = [0] * (max_review_len - len(tokens))  # left padding
    row.extend(
      vocab_dict[token] + rank_shift if token in vocab_dict else oov_index
      for token in tokens
    )
    row.append(label_char)
    encoded_rows.append(row)
  return encoded_rows

In [35]:
# Split the frame by sentiment label; each subset is encoded separately below.
positive = reviews_data.loc[reviews_data["sentiment"].eq("positive")]
negative = reviews_data.loc[reviews_data["sentiment"].eq("negative")]

In [36]:
reviews_lists_p, word_freq_p =  get_reviews(positive)
vocab_dict_p = vocab_list_sorted(word_freq_p)
max_review_len = 100
label_char1 = 0 #positive
p_data = generate_file(reviews_lists_p, vocab_dict_p, max_review_len, label_char1)

In [40]:
reviews_lists_n, word_freq_n =  get_reviews(negative)
vocab_dict_n = vocab_list_sorted(word_freq_n)
max_review_len = 100
label_char2 = 1 #negative
n_data = generate_file(reviews_lists_n, vocab_dict_n, max_review_len, label_char2)

In [38]:
length_p = int((0.9)*len(p_data))
X_train_p = p_data[:length_p][:-1]
y_train_p = p_data[:length_p][-1]
X_test_p = p_data[length_p:][:-1]
y_test_p = p_data[length_p:][-1]

In [41]:
length_n = int((0.9)*len(n_data))
X_train_n = n_data[:length_n][:-1]
y_train_n = n_data[:length_n][-1]
X_test_n = n_data[length_n:][:-1]
y_test_n = n_data[length_n:][-1]

In [42]:
X_train = X_train_p + X_train_n
y_train = y_train_p + y_train_n
X_test = X_test_p + X_test_n
y_test = y_test_p + y_test_n

In [43]:
import torch as T
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = T.device('cuda' if T.cuda.is_available() else 'cpu')

In [44]:
# Turn the four Python lists into int64 tensors on the selected device.
train_x, train_y, test_x, test_y = (
  T.tensor(part, dtype=T.int64).to(device)
  for part in (X_train, y_train, X_test, y_test)
)

N = train_x.shape[0]  # number of training items (first dimension of train_x)
print("Data loaded. Number train items = %d " % N)


Data loaded. Number train items = 5557 


## LSTM model

In [45]:
class LSTM_Net(T.nn.Module):
  """Embedding -> LSTM -> dropout -> two linear layers, one review per call.

  Args:
    vocab_size: number of rows in the embedding table; every word index
      passed to forward() must be smaller than this (default 129892).
    embed_dim: length of each word-embedding vector (default 32).
    hidden_dim: size of the LSTM hidden state (default 75).
  """
  def __init__(self, vocab_size=129892, embed_dim=32, hidden_dim=75):
    super(LSTM_Net, self).__init__()
    self.embed = T.nn.Embedding(vocab_size, embed_dim)
    self.lstm = T.nn.LSTM(embed_dim, hidden_dim)
    self.drop = T.nn.Dropout(0.10)
    self.fc1 = T.nn.Linear(hidden_dim, 10)
    # Two logits, one per class. The notebook labels positive reviews 0 and
    # negative reviews 1.
    self.fc2 = T.nn.Linear(10, 2)

  def forward(self, x):
    """Return raw class logits of shape (1, 2) for one encoded review.

    Args:
      x: int64 tensor of word indices for a single review. Any length is
        accepted; the sequence length is taken from the input rather than
        being fixed in the model.
    """
    z = self.embed(x)
    # Arrange as (seq_len, batch=1, embed_dim), the layout T.nn.LSTM expects
    # by default; -1 lets the sequence length follow the input.
    z = z.view(-1, 1, self.embed.embedding_dim)
    lstm_oupt, (h_n, c_n) = self.lstm(z)
    z = lstm_oupt[-1]  # LSTM output at the last time step, shape (1, hidden_dim)
    z = self.drop(z)
    z = T.tanh(self.fc1(z))
    z = self.fc2(z)  # no softmax here: CrossEntropyLoss applies log-softmax
    return z

In [46]:
def accuracy(model, data_x, data_y):
  """Return the percentage (0.0-100.0) of items the model classifies correctly.

  Items are evaluated one at a time with gradients disabled. The model is
  switched to eval mode for the duration of the call and is put back into
  whichever mode (train or eval) it was in when the call started.

  Args:
    model: T.nn.Module mapping one item of data_x to a tensor of class
      logits; the predicted class is the argmax over that tensor.
    data_x: indexable collection of input tensors (or a 2-D tensor).
    data_y: indexable collection of single-element target tensors (or a
      1-D tensor), aligned with data_x.

  Raises:
    ZeroDivisionError: if data_x is empty.
  """
  was_training = model.training
  model.eval()
  num_correct = 0
  total = len(data_x)
  try:
    with T.no_grad():
      for i in range(total):
        oupt = model(data_x[i])
        predicted = int(T.argmax(oupt))
        if predicted == int(data_y[i]):  # predicted == target
          num_correct += 1
  finally:
    # Restore the caller's mode instead of unconditionally forcing train mode.
    model.train(was_training)
  return (num_correct * 100.0) / total

In [None]:
  # 2. create network
# Seed both RNGs before the network is built so that weight initialisation
# and the per-epoch shuffling start from the same state on every fresh run
# (some GPU kernels may still be non-deterministic).
T.manual_seed(1)
np.random.seed(1)
net = LSTM_Net().to(device)

# 3. train model
loss_func = T.nn.CrossEntropyLoss()  # does log-softmax()
optimizer = T.optim.Adam(net.parameters(), lr=1.0e-3)
max_epochs = 12
log_interval = 2  # display progress every log_interval epochs

print("\nStarting training with bat_size = 1")
for epoch in range(0, max_epochs):
  net.train()  # set training mode
  tot_err = 0.0  # summed per-item loss for this epoch

  # Visit the training items in a fresh random order each epoch.
  for j in np.random.permutation(N):  # one review at a time
    X = train_x[j]
    Y = train_y[j].reshape(1)  # CrossEntropyLoss wants a 1-element target batch

    optimizer.zero_grad()
    oupt = net(X)
    loss_val = loss_func(oupt, Y)
    tot_err += loss_val.item()
    loss_val.backward()  # compute gradients
    optimizer.step()     # update weights

  if epoch % log_interval == 0:
    print("epoch = %4d  |" % epoch, end="")
    print("  avg loss = %7.4f  |" % (tot_err / N), end="")
    train_acc = accuracy(net, train_x, train_y)
    print("  accuracy = %7.2f%%" % train_acc)
print("Training complete")

In [None]:
  # 4. evaluate model
# Score the trained network on the held-out reviews and report the percentage.
test_acc = accuracy(model=net, data_x=test_x, data_y=test_y)
report = "\nAccuracy on test data = %7.2f%%" % test_acc
print(report)