Sentiment classification using LSTM model.
Uses 200d GloVe embeddings (Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation).
The model is trained and tested on the SNLI dataset.

To run the model, put the embeddings in ./data/sentiment/word_vectors.txt and put the data (from the SNLI dataset) in ./data/sentiment as train.txt and test.txt.

In [None]:
import numpy as np
import re, string, unicodedata
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('gutenberg')
nltk.download('averaged_perceptron_tagger')
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer,WordNetLemmatizer
import inflect

In [None]:
# Mapping from SNLI gold-label strings to class ids.
labels = dict(neutral=0, contradiction=1, entailment=2)

# Vocabulary seen in the SNLI files (token -> insertion index).
words_snli = dict()
# Sample of sentence pairs dropped because their label was '-'.
removed_sent = list()

# Running token statistics updated by adjust_sent.
all_words, unk_words = 0, 0

# Embedding matrix (one 200-d row per known token) and token -> row lookup;
# both are rebuilt by create_embeddings.
word_to_vec = np.empty((0, 200), dtype=np.float32)
word_pos = dict()

In [None]:
def to_lowercase(words):
    """Return a new list holding the lower-cased form of every token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip every character that is neither a word character nor whitespace.

    Tokens that become empty after stripping are dropped from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def normalize(words):
    """Lower-case the tokens, then remove punctuation (dropping emptied tokens)."""
    return remove_punctuation(to_lowercase(words))

def add_words_snli(words):
  """Register unseen tokens in the global ``words_snli`` vocabulary.

  Each new token is mapped to the vocabulary size at the moment it is added;
  tokens that are already present keep their existing index.
  """
  for token in words:
    # The default is computed before insertion, so it equals the next free index.
    words_snli.setdefault(token, len(words_snli))

def read_snli(name):
  """Read one SNLI split from ``./data/sentiment/<name>.txt``.

  The file is tab-separated with a header row. Column 0 is the gold label and
  columns 5 and 6 are the two raw sentences. Pairs labelled ``'-'`` are
  skipped (the first ten of them are kept in the global ``removed_sent``).

  Every token of the kept sentences is registered in ``words_snli``.

  Returns:
    A list of ``(label, tokens1, tokens2)`` tuples with normalized tokens.
  """
  sentences = []
  # Explicit encoding so parsing does not depend on the platform default.
  with open('./data/sentiment/'+name+'.txt', 'r', encoding='utf-8') as reader:
    next(reader, None)  # skip the header row (no-op on an empty file)
    for line in reader:
      stripped = line.strip()
      if not stripped:
        # A blank (e.g. trailing) line has no columns to index into.
        continue
      columns = stripped.split('\t')
      label = columns[0]
      sent1 = columns[5]
      sent2 = columns[6]
      if label == '-':
        if len(removed_sent) < 10:
          removed_sent.append((sent1, sent2))
        continue
      sent1 = normalize(word_tokenize(sent1))
      add_words_snli(sent1)
      sent2 = normalize(word_tokenize(sent2))
      add_words_snli(sent2)
      sentences.append((label, sent1, sent2))
  return sentences

def create_embeddings():
  """Build the global embedding matrix ``word_to_vec`` and lookup ``word_pos``.

  Rows 0-2 are randomly initialised vectors for the special tokens
  ``xxbos``, ``xxeos`` and ``xxunk``. The remaining rows are read from
  ``./data/sentiment/word_vectors.txt`` (one ``word v1 v2 ...`` entry per
  line), keeping only words that occur in ``words_snli``.
  """
  global word_to_vec
  global word_pos
  special_vecs = np.random.rand(3, 200)
  word_pos = {'xxbos': 0, 'xxeos':1, 'xxunk':2}
  rows = []
  with open('./data/sentiment/word_vectors.txt', 'r', encoding='utf-8') as reader:
    for line in reader:
      line_split = line.strip().split(' ')
      word = line_split[0]
      # Skip words we never need, and repeated entries: re-adding a word
      # would append a row without growing word_pos, shifting every later
      # word onto the wrong vector.
      if word not in words_snli or word in word_pos:
        continue
      word_pos[word] = len(word_pos)
      rows.append(np.array([float(x) for x in line_split[1:]], dtype=np.float32))
  # Stack once at the end; stacking inside the loop copies the whole matrix
  # for every word and makes loading quadratic in the vocabulary size.
  word_to_vec = np.vstack([special_vecs] + rows)

def adjust_sent(sent):
  """Wrap a token list in ``xxbos``/``xxeos`` and replace unknown words.

  A token missing from ``word_pos`` becomes ``'xxunk'`` and increments the
  global ``unk_words``; every token (known or not) increments ``all_words``.
  """
  global unk_words
  global all_words
  body = []
  for token in sent:
    known = token in word_pos
    body.append(token if known else 'xxunk')
    if not known:
      unk_words += 1
    all_words += 1
  return ['xxbos'] + body + ['xxeos']

def convert_sents(sents):
  """Apply ``adjust_sent`` to both sentences of every example.

  Returns a list of ``(label, sent1, sent2, index)`` tuples, where ``index``
  is the example's position in ``sents``.
  """
  return [
    (example[0], adjust_sent(example[1]), adjust_sent(example[2]), index)
    for index, example in enumerate(sents)
  ]


In [None]:
# Parse both SNLI splits; read_snli also registers every token it sees in words_snli.
train_sent_const = read_snli('train')
test_sent_const = read_snli('test')

# Must run after both reads: only vectors for words present in words_snli are kept.
create_embeddings()

# Add xxbos/xxeos, map unknown words to xxunk and attach each example's index.
train_sent = convert_sents(train_sent_const)
test_sent = convert_sents(test_sent_const)
# Hold out the last 5000 training examples as the validation split.
valid_sent = train_sent[-5000:]
train_sent = train_sent[:-5000]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
from tqdm import tqdm
import matplotlib.pyplot as plt
import copy


In [None]:
# Class ids for the SNLI gold labels, and the inverse lookup used when printing.
labels = dict(neutral=0, contradiction=1, entailment=2)
labels_rev = {class_id: label_name for label_name, class_id in labels.items()}
def prepare_sent(sent):
  """Return the embedding matrix of one sentence.

  Args:
    sent: sequence of tokens, each of which must be a key of ``word_pos``.

  Returns:
    A ``(len(sent), embedding_dim)`` array holding the ``word_to_vec`` row of
    every token, in order.
  """
  if len(sent) == 0:
    # Nothing to gather; keep the empty float32 result the per-word loop produced.
    return np.empty((0, word_to_vec.shape[1]), dtype=np.float32)
  # One gather instead of one vstack per word; the width follows the
  # embedding matrix rather than a hard-coded 200.
  row_ids = [word_pos[word] for word in sent]
  return word_to_vec[row_ids]

def prepare_tags(examples):
  """Build a CUDA ``long`` tensor of class ids from the label of each example."""
  class_ids = []
  for example in examples:
    class_ids.append(labels[example[0]])
  return torch.tensor(class_ids, dtype=torch.long, device=torch.device("cuda"))

def prepare_sents(sents, no):
  """Embed and zero-pad one sentence column of a batch.

  Args:
    sents: batch of examples; ``sents[i][no]`` is a list of tokens.
    no: which element of each example to embed (1 or 2 for the two sentences).

  Returns:
    ``(tensor, lengths)`` where ``tensor`` is a float32 CUDA tensor of shape
    ``(len(sents), max_len, INPUT_DIM)`` padded with zeros, and ``lengths``
    is the list of unpadded sentence lengths.
  """
  sent_lenghts = [len(sent[no]) for sent in sents]
  max_sent = max(sent_lenghts, default=0)
  # Allocate the padded batch once and fill it in place; growing the array
  # with vstack copied the whole batch for every sentence and padding row.
  embedded_sent = np.zeros((len(sents), max_sent, INPUT_DIM), dtype=np.float32)
  for row, sent in enumerate(sents):
    sent_emb = prepare_sent(sent[no])
    embedded_sent[row, :sent_emb.shape[0]] = sent_emb

  return torch.tensor(embedded_sent, dtype=torch.float32, device=torch.device("cuda")), sent_lenghts
  

In [None]:
class LSTMTagger(nn.Module):
    """Sentence-pair classifier built on one shared LSTM encoder.

    Both sentences are encoded by the same LSTM; the two final hidden states
    are concatenated, passed through a 200-unit ReLU layer and projected to
    ``tagset_size`` log-probabilities.

    Args:
        input_dim: size of each input word vector.
        hidden_dim: size of the LSTM hidden state.
        batch_size: fixed number of sentence pairs per forward pass.
        tagset_size: number of output classes.
    """

    def __init__(self, input_dim, hidden_dim, batch_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)

        self.concat2hidden = nn.Linear(2*hidden_dim, 200)
        self.hidden2tag = nn.Linear(200, tagset_size)
        # Initial (h, c) states for the two sentences; callers reset them
        # with init_hidden() before each batch.
        self.hidden1 = self.init_hidden()
        self.hidden2 = self.init_hidden()

    def init_hidden(self):
        """Return a fresh zero ``(h, c)`` pair on the same device as the weights."""
        device = next(self.parameters()).device
        return (torch.zeros(1, self.batch_size, self.hidden_dim, device=device),
                torch.zeros(1, self.batch_size, self.hidden_dim, device=device))

    def forward(self, sent1, sent1_len, sent2, sent2_len):
        """Score a batch of sentence pairs.

        Args:
            sent1, sent2: padded ``(batch_size, seq_len, input_dim)`` tensors.
            sent1_len, sent2_len: unpadded length of every sentence.

        Returns:
            ``(batch_size, tagset_size)`` tensor of per-class log-probabilities.
        """
        # The states may have been created before the module was moved to
        # another device; .to() is a no-op when they already match the input.
        start1 = tuple(state.to(sent1.device) for state in self.hidden1)
        start2 = tuple(state.to(sent2.device) for state in self.hidden2)

        X = torch.nn.utils.rnn.pack_padded_sequence(sent1, sent1_len, batch_first=True, enforce_sorted=False)
        lstm_out1, self.hidden1 = self.lstm(X, start1)
        Y = torch.nn.utils.rnn.pack_padded_sequence(sent2, sent2_len, batch_first=True, enforce_sorted=False)
        lstm_out2, self.hidden2 = self.lstm(Y, start2)

        # Final hidden states have shape (1, batch, hidden); join them on the feature axis.
        x = torch.cat((self.hidden1[0], self.hidden2[0]), 2)
        x = self.concat2hidden(x)
        x = F.relu(x)
        x = self.hidden2tag(x)
        # Drop the leading layer axis BEFORE normalising. On the 3-d tensor
        # dim=1 is the batch axis, so the softmax would be taken across
        # examples instead of across classes.
        x = x.view(self.batch_size, -1)
        return F.log_softmax(x, dim=1)

In [None]:
# One slot per class id: the first misclassified and the first correctly
# classified example found by eval_model(..., find=True).
false_class = [None] * 3
correct_class = [None] * 3

def eval_model(model, values, name, find=False):
  """Evaluate ``model`` on ``values`` and print/return the accuracy in percent.

  Only full batches of ``BATCH_SIZE`` examples are scored (the model works
  with a fixed batch size), and the accuracy is computed over the examples
  that were actually scored.

  Args:
    model: the classifier; its hidden states are reset before each batch.
    values: list of ``(label, sent1, sent2, index)`` examples.
    name: split name used in the printed message.
    find: when true, reset the globals ``false_class`` / ``correct_class``
      and store, per gold class, the first misclassified example as
      ``(predicted, gold, index)`` and the first correct one as
      ``(gold, index)``.

  Returns:
    Accuracy in percent as a float (0.0 if no full batch was available).
  """
  global false_class
  global correct_class
  if find:
    false_class = [None, None, None]
    correct_class = [None, None, None]
  correct = 0
  evaluated = 0
  with torch.no_grad():
    with tqdm(total=len(values)//BATCH_SIZE) as pbar:
      # The range stops before a trailing partial batch.
      for pos in range(0, len(values) - BATCH_SIZE + 1, BATCH_SIZE):
        pbar.update(1)
        ex = values[pos:pos+BATCH_SIZE]
        model.hidden1 = model.init_hidden()
        model.hidden2 = model.init_hidden()
        sent1_emb, sent1_len = prepare_sents(ex, 1)
        sent2_emb, sent2_len = prepare_sents(ex, 2)
        targets = prepare_tags(ex)
        tag_scores = model(sent1_emb, sent1_len, sent2_emb, sent2_len)
        # tolist() copies to host memory; calling .numpy() directly on a
        # CUDA tensor raises.
        predicted = torch.argmax(tag_scores, dim=1).tolist()
        expected = targets.tolist()
        evaluated += len(expected)
        for i, (pred, gold) in enumerate(zip(predicted, expected)):
          if pred == gold:
            correct += 1
            if find and correct_class[gold] is None:
              correct_class[gold] = (gold, ex[i][3])
          elif find and false_class[gold] is None:
            false_class[gold] = (pred, gold, ex[i][3])
        # Stop collecting once every slot holds an example.
        if find and all(entry is not None for entry in false_class + correct_class):
          find = False
  accuracy = 100.0 * correct / evaluated if evaluated else 0.0
  print("ACCURACY ON {}: {:.2f}".format(name, accuracy))
  return accuracy

def save_model(model, epoch, best_acc, SAVE_PATH, name):
  """Write a training checkpoint to ``SAVE_PATH + name + '.tar'``.

  The checkpoint stores the epoch, the model weights, the state of the
  module-level ``optimizer`` and the best validation accuracy. The target
  directory is created if it does not exist yet.
  """
  import os

  checkpoint_path = SAVE_PATH+name+'.tar'
  target_dir = os.path.dirname(checkpoint_path)
  if target_dir:
    # Without this, the first save on a fresh checkout fails and the epoch is lost.
    os.makedirs(target_dir, exist_ok=True)
  torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_acc': best_acc
            }, checkpoint_path)
  # Report only after the file has actually been written.
  print("SAVED model after {} epoch and with valid acc: {}".format(epoch, best_acc))

def load_model(SAVE_PATH, name):
  """Restore a checkpoint written by ``save_model``.

  Returns:
    ``(model, optimizer, epoch, best_acc)`` with the model on the GPU.
  """
  device = torch.device("cuda")
  # Move the model to the GPU before building the optimizer: the rest of the
  # pipeline creates its inputs on CUDA, and the optimizer must reference the
  # parameters that will actually be trained.
  model = LSTMTagger(INPUT_DIM, HIDDEN_DIM, BATCH_SIZE, TAGSET_SIZE).to(device)
  optimizer = optim.Adam(model.parameters(), lr=0.01)

  checkpoint = torch.load(SAVE_PATH+name+'.tar', map_location=device)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  best_acc = checkpoint['best_acc']

  return model, optimizer, epoch, best_acc

In [None]:
# Hyper-parameters and checkpoint location.
INPUT_DIM = 200
HIDDEN_DIM = 200
BATCH_SIZE = 64
TAGSET_SIZE = 3
EPOCHS = 5
SAVE_PATH = "./sentiment/model/"

# The data helpers create their tensors on CUDA, so the model lives there too.
device = torch.device("cuda")

model = LSTMTagger(INPUT_DIM, HIDDEN_DIM, BATCH_SIZE, TAGSET_SIZE).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
best_model_acc = 0
epoch = 0

# Set LOAD to True to resume from the saved checkpoint.
LOAD = False
if LOAD:
  model, optimizer, epoch, best_model_acc = load_model(SAVE_PATH, 'lstm_model3')

valid_acc_tab = []
test_acc_tab = []
loss_tab = []
total_loss_tab = []

while epoch < EPOCHS:
    epoch += 1
    print("EPOCH: {}".format(epoch))
    # Shuffle through an index permutation: permuting the example list itself
    # makes NumPy convert ragged (label, tokens, tokens, index) tuples into
    # an array, which recent NumPy versions reject.
    shuffled_order = np.random.permutation(len(train_sent))
    train_sent = [train_sent[i] for i in shuffled_order]
    total_loss = 0.0
    steps = len(train_sent)//(BATCH_SIZE)
    with tqdm(total=steps) as pbar:
        # Only full batches are used; the model has a fixed batch size.
        for pos in range(0, len(train_sent) - BATCH_SIZE + 1, BATCH_SIZE):
            pbar.update(1)
            ex = train_sent[pos:pos+BATCH_SIZE]
            model.zero_grad()
            model.hidden1 = model.init_hidden()
            model.hidden2 = model.init_hidden()

            sent1_emb, sent1_len = prepare_sents(ex, 1)
            sent2_emb, sent2_len = prepare_sents(ex, 2)
            targets = prepare_tags(ex)
            tag_scores = model(sent1_emb, sent1_len, sent2_emb, sent2_len)
            loss = loss_function(tag_scores, targets)
            # .item() yields a plain float: it works for CUDA tensors and does
            # not keep every batch's autograd graph alive through the sum.
            loss_value = loss.item()
            total_loss += loss_value
            loss_tab.append(loss_value)
            loss.backward()
            optimizer.step()
    print("LOSS ON Trainig:")
    plt.plot(loss_tab)
    plt.show()
    total_loss_tab.append(total_loss/max(steps, 1))
    print("AVARAGE LOSS ON Training:")
    plt.plot(total_loss_tab)
    plt.show()
    print("ACCURACY ON Valid:")
    model_acc = eval_model(model, valid_sent, "VALID", True)
    valid_acc_tab.append(model_acc)
    plt.plot(valid_acc_tab)
    plt.show()
    print("ACCURACY ON Test:")
    test_acc = eval_model(model, test_sent, "TEST")
    test_acc_tab.append(test_acc)
    plt.plot(test_acc_tab)
    plt.show()
    print("ACCURACY VALID: {:.2f}, BEST ACCURACY VALID: {:.2f}, ACCURACY TEST: {:.2f}".format(model_acc, best_model_acc, test_acc))
    # Keep the checkpoint with the best validation accuracy seen so far.
    if best_model_acc < model_acc:
      best_model_acc = model_acc
      save_model(model, epoch, best_model_acc, SAVE_PATH, 'lstm_model3')

# Show the examples collected during the last validation run. The stored
# indices refer to train_sent_const because the validation split is taken
# from the training data. Slots for which no example was found stay None
# and are skipped.
example_template = "Given: {} \n    and {}. \n    Classified as {}, should be {}.\n"

found_false = [x for x in false_class if x is not None]
if found_false:
  print("False classified examples:")
  for x in found_false:
    print(example_template.format(train_sent_const[x[2]][1], train_sent_const[x[2]][2], labels_rev[int(x[0])], labels_rev[int(x[1])]))

found_correct = [x for x in correct_class if x is not None]
if found_correct:
  print("Correctly classified examples")
  for x in found_correct:
    print(example_template.format(train_sent_const[x[1]][1], train_sent_const[x[1]][2], labels_rev[int(x[0])], labels_rev[int(x[0])]))


In [None]:
# Final accuracy of the current model on the test split.
test_acc = eval_model(model, test_sent, "TEST")
print(test_acc)

# Final accuracy on the held-out validation split.
valid_acc = eval_model(model, valid_sent, "VALID")
print(valid_acc)