In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter
nltk.download('punkt')
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
# !pip install bcolz
import bcolz
import pickle
from sklearn.model_selection import StratifiedKFold

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
traindata = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sentiment-train.csv')
testdata = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/sentiment-test.csv')
train_all = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/training.1600000.processed.noemoticon.csv',header=None,encoding='latin-1')
train_all.loc[train_all[0] == 4, 0]=1

In [3]:
def preprocess(data):
  for i in range(len(data)):
      data[i] = re.sub(r'https?:\/\/\S*','url',data[i])
      data[i] = re.sub(r'www?:\/\/\S*','url',data[i])
      data[i] = re.sub(r'@\S*','@',data[i])

In [4]:
train_sentences = list(traindata.text)
test_sentences = list(testdata.text)
preprocess(train_sentences)
preprocess(test_sentences)

In [5]:
dic = Counter()
for i, sentence in enumerate(train_sentences):
  train_sentences[i] = []
  for word in nltk.word_tokenize(sentence):
    dic.update([word.lower()])
    train_sentences[i].append(word.lower())

In [6]:
# Removing the words that only appear once
vocab = {k:v for k,v in dic.items() if v>1}
# Sorting the words according to the number of appearances, with the most common word being first
vocab = sorted(vocab, key=vocab.get, reverse=True)
# Adding padding and unknown to our vocabulary so that they will be assigned an index
vocab = ['_PAD','_UNK'] + vocab
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(vocab)}
idx2word = {i:o for i,o in enumerate(vocab)}

In [7]:
for i, sentence in enumerate(train_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    train_sentences[i] = [word2idx[word] if word in word2idx else word2idx['_UNK'] for word in sentence]

for i, sentence in enumerate(test_sentences):
    # For test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else word2idx['_UNK'] for word in nltk.word_tokenize(sentence)]

In [8]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length

def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [9]:
seq_len = 200 #The length that the sentences will be padded/shortened to

train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

# Converting our labels into numpy arrays
train_labels = np.array(list(traindata.sentiment))
test_labels = np.array(list(testdata.sentiment))

In [10]:
train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

batch_size = 400

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [11]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [12]:
# dataiter = iter(train_loader)
# sample_x, sample_y = dataiter.next()

# print(sample_x.shape, sample_y.shape)

In [13]:
class LSTMNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5, bidirectional = False):
        super(LSTMNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        num_directions = 2 if bidirectional else 1
        self.num_directions = num_directions
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True, bidirectional = bidirectional)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers*self.num_directions, batch_size, self.hidden_dim).zero_().to(device),
            weight.new(self.n_layers*self.num_directions, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [14]:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, pretrain = None, drop_prob=0.5, bidirectional =False):
        super(GRUNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        num_directions = 2 if bidirectional else 1
        self.num_directions = num_directions

        if pretrain is None:
          self.embedding = nn.Embedding(vocab_size, embedding_dim)
          self.gru = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True, bidirectional = bidirectional)
        else:
          (num,d) = pretrain.shape
          self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(pretrain).float(),freeze=False)
          self.gru = nn.GRU(d, hidden_dim, n_layers, dropout=drop_prob, batch_first=True, bidirectional = bidirectional)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        gru_out, hidden = self.gru(embeds, hidden)
        gru_out = gru_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(gru_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers*self.num_directions, batch_size, self.hidden_dim).zero_().to(device)
        return hidden

In [15]:
vocab_size = len(word2idx)
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2
lr=0.005
epochs = 2
clip = 5

In [16]:
def train(model, trainloader, epochs, batch_size, lr, clip, lstm = True,):
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  criterion = nn.BCELoss()
  model.train()
  for i in range(epochs):
      h = model.init_hidden(batch_size)
      
      for inputs, labels in trainloader:
        if lstm:
          h = tuple([e.data for e in h])
        else:
          h = h.data
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()      

In [17]:
def eval(model,testloader,dim = 359,lstm = True):
  criterion = nn.BCELoss()
  test_losses = []
  num_correct = 0
  h = model.init_hidden(dim)

  model.eval()
  for inputs, labels in testloader:
    if lstm:
      h = tuple([each.data for each in h])
    else:
      h = h.data
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze()) #rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
          
  print("Test loss: {:.3f}".format(np.mean(test_losses)))
  test_acc = num_correct/len(testloader.dataset)
  print("Test accuracy: {:.3f}%".format(test_acc*100))
  return test_acc

3.1 LSTM

In [18]:
model = LSTMNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)
print(model)

LSTMNet(
  (embedding): Embedding(17150, 400)
  (lstm): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [19]:
train(model,train_loader ,epochs,batch_size,lr,clip)

In [20]:
acc=eval(model,test_loader)

Test loss: 0.488
Test accuracy: 77.716%


3.2 GRU

In [21]:
model2 = GRUNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model2.to(device)
print(model2)

GRUNet(
  (embedding): Embedding(17150, 400)
  (gru): GRU(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [22]:
train(model2,train_loader,epochs,batch_size,lr,clip,lstm = False)

In [23]:
acc = eval(model2,test_loader,lstm = False)

Test loss: 0.493
Test accuracy: 78.273%


3.3 Bidirectional LSTM

In [24]:
model3 = LSTMNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, bidirectional= True)
model3.to(device)
print(model3)

LSTMNet(
  (embedding): Embedding(17150, 400)
  (lstm): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [25]:
train(model3,train_loader,epochs,batch_size,lr,clip)

In [26]:
acc = eval(model3,test_loader)

Test loss: 0.505
Test accuracy: 77.437%


3.4 Bidirectional GRU

In [27]:
model4 = GRUNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, bidirectional= True)
model4.to(device)
print(model4)

GRUNet(
  (embedding): Embedding(17150, 400)
  (gru): GRU(400, 512, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [28]:
train(model4,train_loader,epochs,batch_size,lr,clip,lstm=False)

In [29]:
acc = eval(model4,test_loader,lstm = False)

Test loss: 0.566
Test accuracy: 76.880%


In [None]:
# words = []
# idx = 0
# word2idx = {}
# vectors = bcolz.carray(np.zeros(1), rootdir=f'/content/drive/MyDrive/Colab Notebooks/27B.100.dat', mode='w')

# with open(f'/content/drive/MyDrive/Colab Notebooks/glove.twitter.27B.100d.txt', 'rb') as f:
#     for l in f:
#       line = l.decode().split()
#       if idx == 38522:
#         word = l.decode().split(' ')[0]
#         vect = np.array(line).astype(np.float)
#       else:
#         word = line[0]
#         vect = np.array(line[1:]).astype(np.float)
#       words.append(word)
#       word2idx[word] = idx
#       idx += 1
#       vectors.append(vect)
    
# vectors = bcolz.carray(vectors[1:].reshape((idx, 100)), rootdir=f'/content/drive/MyDrive/Colab Notebooks/27B.100.dat', mode='w')
# vectors.flush()
# pickle.dump(words, open(f'/content/drive/MyDrive/Colab Notebooks/27B.100_words.pkl', 'wb'))
# pickle.dump(word2idx, open(f'/content/drive/MyDrive/Colab Notebooks/27B.100_idx.pkl', 'wb'))

3.5 Pretrained GRU

In [30]:
vectors = bcolz.open(f'/content/drive/MyDrive/Colab Notebooks/27B.100.dat')[:]
words = pickle.load(open(f'/content/drive/MyDrive/Colab Notebooks/27B.100_words.pkl', 'rb'))
w2i = pickle.load(open(f'/content/drive/MyDrive/Colab Notebooks/27B.100_idx.pkl', 'rb'))

twitterglove = {w: vectors[w2i[w]] for w in words}

In [31]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, 100))

for i, word in enumerate(vocab):
    try: 
        weights_matrix[i] = twitterglove[word]
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(100))

In [32]:
model5 = GRUNet(vocab_size, output_size, 100, hidden_dim, n_layers, pretrain = weights_matrix)
model5.to(device)
print(model5)

GRUNet(
  (embedding): Embedding(17150, 100)
  (gru): GRU(100, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sigmoid): Sigmoid()
)


In [33]:
train(model5,train_loader,epochs,batch_size,lr,clip,lstm=False)

In [34]:
acc = eval(model5,test_loader,lstm = False)

Test loss: 0.456
Test accuracy: 77.994%


3.6 Average accuracies for different params

In [35]:
for (hidden_dim,embedding_dim) in [(128,100),(128,400),(512,100),(512,400)]:
  skf = StratifiedKFold()
  acc = 0
  for train_index, test_index in skf.split(train_sentences, train_labels):
    x_train, x_test = train_sentences[train_index], train_sentences[test_index]
    y_train, y_test = train_labels[train_index], train_labels[test_index]
    temp_train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
    temp_test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))
    temp_train_loader = DataLoader(temp_train_data, shuffle=True, batch_size=batch_size)
    temp_test_loader = DataLoader(temp_test_data, shuffle=True, batch_size=batch_size)
    best = GRUNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
    best.to(device)
    train(best,temp_train_loader,epochs,batch_size,lr,clip,lstm=False)
    acc += eval(best,temp_test_loader,dim=batch_size,lstm = False)
  print('Hidden size: {} '.format(hidden_dim),'Embedding size: {} '.format(embedding_dim), 'Average accuracy: {}'.format(acc/5))

Test loss: 0.460
Test accuracy: 78.367%
Test loss: 0.481
Test accuracy: 77.875%
Test loss: 0.459
Test accuracy: 78.192%
Test loss: 0.462
Test accuracy: 78.892%
Test loss: 0.454
Test accuracy: 78.592%
Hidden size: 128  Embedding size: 100  Average accuracy: 0.7838333333333333
Test loss: 0.466
Test accuracy: 78.350%
Test loss: 0.489
Test accuracy: 77.992%
Test loss: 0.461
Test accuracy: 78.942%
Test loss: 0.453
Test accuracy: 79.200%
Test loss: 0.452
Test accuracy: 79.483%
Hidden size: 128  Embedding size: 400  Average accuracy: 0.7879333333333334
Test loss: 0.466
Test accuracy: 77.750%
Test loss: 0.501
Test accuracy: 76.050%
Test loss: 0.477
Test accuracy: 77.183%
Test loss: 0.478
Test accuracy: 78.792%
Test loss: 0.472
Test accuracy: 77.825%
Hidden size: 512  Embedding size: 100  Average accuracy: 0.7752
Test loss: 0.481
Test accuracy: 78.358%
Test loss: 0.493
Test accuracy: 77.158%
Test loss: 0.489
Test accuracy: 77.425%
Test loss: 0.465
Test accuracy: 78.792%
Test loss: 0.471
Test ac

3.7 GRU with hidden size 128, embedding size 400 

In [36]:
hidden_dim,embedding_dim = 128,400
model6 = GRUNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model6.to(device)
train(model6,train_loader,epochs,batch_size,lr,clip,lstm=False)

In [37]:
acc = eval(model6,test_loader,lstm = False)

Test loss: 0.478
Test accuracy: 77.159%


3.8 GRU trained on all the sentiment data with hidden size 128, embedding size 400 

In [38]:
train_all_sentences = list(train_all[5])
preprocess(train_all_sentences)

In [39]:
dic_all = Counter()
for i, sentence in enumerate(train_all_sentences):
    train_all_sentences[i] = []
    for word in nltk.word_tokenize(sentence): #Tokenizing the words
        dic_all.update([word.lower()]) #Converting all the words to lower case
        train_all_sentences[i].append(word.lower())

In [40]:
vocab2 = {k:v for k,v in dic_all.items() if v>1}
vocab2 = sorted(vocab2, key=vocab2.get, reverse=True)
vocab2 = ['_PAD','_UNK'] + vocab2
word2idx = {o:i for i,o in enumerate(vocab2)}
idx2word = {i:o for i,o in enumerate(vocab2)}

In [41]:
for i, sentence in enumerate(train_all_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    train_all_sentences[i] = [word2idx[word] if word in word2idx else word2idx['_UNK'] for word in sentence]

In [42]:
train_all_sentences = pad_input(train_all_sentences,seq_len)
train_all_labels = np.array(list(train_all[0]))

train_all_data = TensorDataset(torch.from_numpy(train_all_sentences), torch.from_numpy(train_all_labels))
train_all_loader = DataLoader(train_all_data, shuffle=True, batch_size=batch_size)

In [43]:
model7 = GRUNet(len(vocab2), output_size, embedding_dim, hidden_dim, n_layers)
model7.to(device)
train(model7,train_all_loader,epochs,batch_size,lr,clip,lstm=False)

In [44]:
acc = eval(model7,test_loader,lstm = False)

Test loss: 1.014
Test accuracy: 53.482%
