In [0]:
# from google.colab import files
# uploaded = files.upload()

In [83]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)  

Mounted at /content/gdrive


In [0]:
# Google Drive folder that holds the data files; the best checkpoint is also written here.
data_path = "/content/gdrive/My Drive/nlp_project/"
# File names handed to TabularDataset.splits as the train and test files.
train_name = "spanglish_train_demojised.txt"
validation_name = "spanglish_validation_demojised.txt"
# File name used with torch.save for the model's state_dict.
model_save_name = "spanglish_model.txt"

In [0]:
import sys

# Put the project folder on the import path. It is appended only when missing,
# so re-running this cell does not keep adding duplicate entries to sys.path.
if data_path not in sys.path:
    sys.path.append(data_path)


In [86]:
import torch

import random
import numpy as np

SEED = 1234

# Seed Python's, NumPy's and PyTorch's random number generators with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

!pip install transformers



In [0]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the 'bert-base-multilingual-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

In [88]:
# Size of the tokenizer vocabulary.
# NOTE(review): this expression is neither printed nor the last line of the cell, so its value is never shown.
len(tokenizer.vocab)
# Tokenize a sample sentence (the recorded output shows lower-cased tokens).
tokens = tokenizer.tokenize('how ARE you ')

print(tokens)
# Map each token string to its integer id in the tokenizer vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)


['how', 'are', 'you']
[12548, 10320, 10855]


In [89]:
# String forms of the tokenizer's special tokens: start ([CLS]), end ([SEP]), padding and unknown.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)


# We can get the indexes of the special tokens by converting them using the vocabulary...

# In[7]:


# Look up the integer id of each special token through the vocabulary.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)


# ...or by explicitly getting them from the tokenizer.

# In[8]:


# Read the special-token ids directly from the tokenizer's *_id attributes.
# These assignments overwrite the variables of the same name set by the vocabulary lookup.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)


# Another thing we need to handle is that the model was trained on sequences with a defined maximum length - it does not know how to handle sequences longer than it has been trained on. We can get the maximum length of these input sizes by checking the `max_model_input_sizes` for the version of the transformer we want to use. In this case, it is 512 tokens.

# In[9]:


# Maximum input length registered for this pretrained checkpoint (the recorded output prints 512).
max_input_length = tokenizer.max_model_input_sizes['bert-base-multilingual-uncased']

print(max_input_length)


# Previously we have used the `spaCy` tokenizer to tokenize our examples. However we now need to define a function that we will pass to our `TEXT` field that will handle all the tokenization for us. It will also cut down the number of tokens to a maximum length. Note that our maximum length is 2 less than the actual maximum length. This is because we need to append two tokens to each sequence, one to the start and one to the end.

# In[10]:


def tokenize_and_cut(sentence):
    """Tokenize `sentence` and keep at most `max_input_length - 2` tokens.

    Two positions are left free for the start and end special tokens that
    are added to every sequence afterwards.
    """
    token_budget = max_input_length - 2
    return tokenizer.tokenize(sentence)[:token_budget]


# Now we define our fields. The transformer expects the batch dimension to be first, so we set `batch_first = True`. As we already have the vocabulary for our text, provided by the transformer, we set `use_vocab = False` to tell torchtext that we'll be handling the vocabulary side of things. We pass our `tokenize_and_cut` function as the tokenizer. The `preprocessing` argument is a function that takes in the example after it has been tokenized; this is where we will convert the tokens to their indexes. Finally, we define the special tokens - making note that we are defining them to be their index value and not their string value, i.e. `100` instead of `[UNK]`. This is because the sequences will already be converted into indexes.
# 
# We define the label field as before.

# In[87]:


from torchtext import data


[CLS] [SEP] [PAD] [UNK]
101 102 0 100
101 102 0 100
512


In [90]:
# Example id column: a single non-sequential value, used as-is (no vocabulary, no padding).
UID = data.Field(sequential=False, use_vocab=False, pad_token=None)
# Text column: tokenized by tokenize_and_cut, then converted to ids by the BERT tokenizer.
# use_vocab=False because the ids come from the tokenizer, so the special tokens are given as ids too.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Label column; its vocabulary is built from the training data in a later cell.
LABEL = data.LabelField()


# We load the data and create the validation splits as before.

# In[89]:

from torchtext import datasets
# Column order of the TSV files: example id, text, label.
fields = [('uid',UID),('text', TEXT),('label', LABEL)]
# Read the train and test files from data_path; the header row of each file is skipped.
train_data, test_data = data.TabularDataset.splits(
                                        path = data_path,
                                        train = train_name,
                                        test = validation_name,
                                        format = 'tsv',
                                        fields = fields,
                                        skip_header = True)

# Carve a validation set out of the training data.
# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding. Seed first, then pass the generator state explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())


# In[90]:


print(vars(train_data[1]))


{'uid': '1915', 'text': [10230, 10251, 11153, 45899, 10140, 11531, 22004, 77003, 10688, 10117, 40886, 25218, 10111, 10102, 47127, 106, 106, 10525, 131, 10181, 16440, 26354, 35523, 10102, 70355, 10536, 29642, 14965, 10952, 15995, 12828, 12828, 10171, 39214, 10108, 27318, 12828, 10171, 39214, 10108, 27318], 'label': '2'}


In [91]:
# Report the size of each split.
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")
# Decode one test example back into token strings as a sanity check.
print(tokenizer.convert_ids_to_tokens(vars(test_data.examples[331])['text']))


# We can check an example and ensure that the text has already been numericalized.

# In[92]:


print(vars(train_data.examples[6]))


# We can use the `convert_ids_to_tokens` to transform these indexes back into readable tokens.

# In[93]:


tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

print(tokens)


# Although we've handled the vocabulary for the text, we still need to build the vocabulary for the labels.

# In[94]:


LABEL.build_vocab(train_data)


# In[95]:


print(LABEL.vocab.stoi)


# As before, we create the iterators. Ideally we want to use the largest batch size that we can as I've found this gives the best results for transformers.

# In[96]:


# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Number of training examples: 8401
Number of validation examples: 3601
Number of testing examples: 2998
[':', 'highest', 'iq', 'in', 'the', 'world', '.', '.']
{'uid': '11003', 'text': [10707, 10624, 10547, 10109, 21105, 10119, 100, 15239, 10453, 100, 12698, 10102, 26201], 'label': '1'}
['cu', '##el', '##ga', 'en', 'facebook', 'un', '[UNK]', 'self', '##ie', '[UNK]', 'antes', 'de', 'morir']
defaultdict(<function _default_unk_index at 0x7f993d408620>, {'2': 0, '1': 1, '0': 2})


In [92]:
# Build one iterator per split; sort_key orders examples by the length of their text.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_key=lambda x: len(x.text), 
    batch_size = BATCH_SIZE, 
    device = device)

# Sanity check: count the examples the test iterator yields (the recorded output prints 2998).
size = 0
for batch in test_iterator:
  #print(batch.uid[0], tokenizer.convert_ids_to_tokens(batch.text[0]))
  size += batch.uid.shape[0]
print(size)

2998


In [0]:

from transformers import BertTokenizer, BertModel
# Load the pretrained multilingual BERT weights matching the tokenizer's checkpoint name.
bert = BertModel.from_pretrained('bert-base-multilingual-uncased')


In [0]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Classifier that feeds BERT features through a GRU and a linear layer.

    The BERT forward pass runs under `torch.no_grad()`, so no gradients
    reach the transformer through this module.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert

        # Width of BERT's output vectors, which is the GRU's input size.
        bert_width = bert.config.to_dict()['hidden_size']

        # Dropout between GRU layers is only enabled when the GRU is stacked.
        rnn_dropout = dropout if n_layers >= 2 else 0
        self.rnn = nn.GRU(bert_width,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=rnn_dropout)

        # A bidirectional GRU yields two final states that are concatenated.
        num_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class scores for a batch of token-id sequences."""
        # text: [batch size, sent len]
        with torch.no_grad():
            embedded = self.bert(text)[0]
        # embedded: [batch size, sent len, emb dim]

        hidden = self.rnn(embedded)[1]
        # hidden: [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Last layer's forward and backward final states, side by side.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        # output: [batch size, out dim]
        return self.out(self.dropout(final_state))


In [95]:
# GRU hidden size per direction.
HIDDEN_DIM = 256
# Number of class scores produced by the final linear layer.
OUTPUT_DIM = 3
# Number of stacked GRU layers.
N_LAYERS = 3
BIDIRECTIONAL = True
# Dropout probability passed to the model; 0 disables dropout.
DROPOUT = 0

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)


# We can check how many parameters the model has. Our standard models have under 5M, but this one has 171M! Luckily, 167M of these parameters are from the transformer and we will not be training those.

# In[100]:


def count_parameters(model):
    """Return the total number of elements in the parameters of `model` that have `requires_grad` set."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

print(f'The model has {count_parameters(model):,} trainable parameters')


# In order to freeze parameters (not train them) we need to set their `requires_grad` attribute to `False`. To do this, we simply loop through all of the `named_parameters` in our model and if they're a part of the `bert` transformer model, we set `requires_grad = False`. 

# In[101]:


# Freeze the transformer: every parameter whose name starts with 'bert' stops requiring gradients.
for name, param in model.named_parameters():
    if not name.startswith('bert'):
        continue
    param.requires_grad_(False)


# We can now see that our model has under 4M trainable parameters, making it almost comparable to the `FastText` model. However, the text still has to propagate through the transformer which causes training to take considerably longer.

# In[102]:


# NOTE(review): this repeats the count_parameters definition from earlier in the notebook
# with an identical body; the earlier definition could simply be reused here.
def count_parameters(model):
    """Return the total number of elements in the parameters of `model` that have `requires_grad` set."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 171,299,331 trainable parameters
The model has 3,942,915 trainable parameters


In [96]:
model

BERTGRUSentiment(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [97]:

# List the names of the parameters that still require gradients (the recorded output shows only rnn.* and out.*).
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
rnn.weight_ih_l2
rnn.weight_hh_l2
rnn.bias_ih_l2
rnn.bias_hh_l2
rnn.weight_ih_l2_reverse
rnn.weight_hh_l2_reverse
rnn.bias_ih_l2_reverse
rnn.bias_hh_l2_reverse
out.weight
out.bias


In [0]:
import torch.optim as optim

# Adam with its default hyperparameters, given every parameter of the model
# (including the BERT parameters whose requires_grad was set to False).
optimizer = optim.Adam(model.parameters())


# In[105]:


# Cross-entropy loss over the class scores returned by the model.
criterion = nn.CrossEntropyLoss()


# Place the model and criterion onto the GPU (if available)

# In[106]:


model = model.to(device)
criterion = criterion.to(device)


# Next, we'll define functions for: calculating accuracy, performing a training epoch, performing an evaluation epoch and calculating how long a training/evaluation epoch takes.

# In[107]:




In [0]:

def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: class scores indexed as [example, class].
        y: integer class labels, one per example.

    Returns:
        A one-element float tensor holding the fraction of examples whose
        highest-scoring class equals the label.
    """
    predicted = preds.argmax(dim = 1)  # index of the highest score per example
    correct = predicted.eq(y)
    # Divide by a plain Python int rather than a CPU FloatTensor, so the
    # computation never mixes a GPU tensor with a CPU tensor.
    return correct.sum().float().reshape(1) / y.shape[0]


# In[108]:


def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator`.

    For each batch the gradients are reset, the loss is back-propagated and
    the optimizer takes one step.

    Returns:
        (mean loss per batch, mean accuracy per batch)
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        scores = model(batch.text).squeeze(1)
        batch_loss = criterion(scores, batch.label)
        batch_acc = categorical_accuracy(scores, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [0]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    The model is put in eval mode and the whole pass runs under
    `torch.no_grad()`.

    Returns:
        (mean loss per batch, mean accuracy per batch)
    """
    model.eval()

    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text).squeeze(1)
            running_loss += criterion(scores, batch.label).item()
            running_acc += categorical_accuracy(scores, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


# In[110]:


import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds.

    Returns:
        (minutes, seconds), both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds


In [101]:
N_EPOCHS = 20

best_valid_loss = float('inf')

# Checkpoint location on Drive; built once because it does not change between epochs.
path = data_path + model_save_name

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights with the lowest validation loss seen so far:
    # one copy in the working directory and one on Drive.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_name)
        torch.save(model.state_dict(), path)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')



Epoch: 01 | Epoch Time: 0m 35s
	Train Loss: 1.016 | Train Acc: 47.60%
	 Val. Loss: 0.976 |  Val. Acc: 51.07%
Epoch: 02 | Epoch Time: 0m 36s
	Train Loss: 0.972 | Train Acc: 50.91%
	 Val. Loss: 0.963 |  Val. Acc: 51.34%
Epoch: 03 | Epoch Time: 0m 36s
	Train Loss: 0.938 | Train Acc: 53.09%
	 Val. Loss: 0.932 |  Val. Acc: 53.40%
Epoch: 04 | Epoch Time: 0m 37s
	Train Loss: 0.920 | Train Acc: 54.38%
	 Val. Loss: 0.929 |  Val. Acc: 54.00%
Epoch: 05 | Epoch Time: 0m 37s
	Train Loss: 0.898 | Train Acc: 56.51%
	 Val. Loss: 0.931 |  Val. Acc: 53.94%
Epoch: 06 | Epoch Time: 0m 37s
	Train Loss: 0.870 | Train Acc: 57.99%
	 Val. Loss: 0.945 |  Val. Acc: 53.51%
Epoch: 07 | Epoch Time: 0m 37s
	Train Loss: 0.838 | Train Acc: 61.50%
	 Val. Loss: 1.002 |  Val. Acc: 53.17%
Epoch: 08 | Epoch Time: 0m 37s
	Train Loss: 0.804 | Train Acc: 62.64%
	 Val. Loss: 1.004 |  Val. Acc: 53.88%
Epoch: 09 | Epoch Time: 0m 36s
	Train Loss: 0.747 | Train Acc: 66.15%
	 Val. Loss: 1.070 |  Val. Acc: 53.59%
Epoch: 10 | Epoch T

KeyboardInterrupt: ignored

In [102]:
# Restore the weights saved at the best validation loss.
path = data_path + model_save_name
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [0]:

def test_evaluate(model, iterator, criterion):
    """Evaluate on `iterator` and write one prediction per example to answer.txt.

    The file gets a "Uid,Sentiment" header followed by "<uid>,<label>" rows.

    Returns:
        (mean loss per batch,
         mean accuracy per batch,
         per-batch mean of the 6-value vector from test_categorical_accuracy)
    """
    epoch_loss = 0
    epoch_acc = 0
    epoch_all_acc = torch.FloatTensor([0,0,0,0,0,0])
    print(epoch_all_acc)
    model.eval()

    # NOTE(review): these names are keyed by predicted class index. The label
    # vocabulary printed earlier maps raw labels '2', '1', '0' to indexes 0, 1, 2;
    # confirm that raw label '2' really means "positive" in the dataset.
    label_dict = {0:"positive", 1:"neutral", 2:"negative"}

    # The context manager closes the file even if a batch raises part-way through.
    with open("answer.txt", "w") as answer, torch.no_grad():
        answer.write("Uid,Sentiment\n")

        for batch in iterator:

            predictions = model(batch.text).squeeze(1)

            # Convert ids and predicted classes to Python lists once per batch
            # instead of calling .item() for every row.
            uids = batch.uid.tolist()
            label_numbers = predictions.argmax(dim = 1).tolist()
            for uid, label_number in zip(uids, label_numbers):
                answer.write(str(uid)+','+label_dict[label_number]+'\n')

            loss = criterion(predictions, batch.label)

            acc,all_acc = test_categorical_accuracy(predictions, batch.label)
            print(all_acc)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_all_acc += all_acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator),epoch_all_acc/len(iterator)

def test_categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    count0,count1,count2 = torch.zeros(1),torch.zeros(1),torch.zeros(1)
    total0,total1,total2 = torch.FloatTensor(1),torch.FloatTensor(1),torch.FloatTensor(1)
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    
    for j,i in enumerate(y.cpu().numpy()):
      if i==0:
        count0+=correct[j]
        total0+=1
      elif i==1:
        count1+=correct[j]
        total1+=1
      elif i==2:
        count2+=correct[j]
      else:
        print(i,i==0,i==1,i==2)
        total2+=1
    # print(count0,count1,count2,total0,total1,total2)
    # print([count0/total0,count1/total1,count2/total2])
    # print(torch.FloatTensor([count0/total0,count1/total1,count2/total2]))
    # print(correct.sum() / torch.FloatTensor([y.shape[0]]))
    # print(torch.FloatTensor([count0/total0,count1/total1,count2/total2]))
    print(count0,count1,count2)
    return correct.sum() / torch.FloatTensor([y.shape[0]]),torch.FloatTensor([count0/total0,count1/total1,count2/total2,count0,count1,count2])



In [104]:
# model.load_state_dict(torch.load('tut6-model.pt'))

# Evaluate on the test iterator; this call also writes the predictions to answer.txt.
test_loss, test_acc,test_all_acc = test_evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%',test_all_acc)


# ## Inference
# 
# We'll then use the model to test the sentiment of some sequences. We tokenize the input sequence, trim it down to the maximum length, add the special tokens to either side, convert it to a tensor, add a fake batch dimension and then pass it through our model.

# In[ ]:







tensor([0., 0., 0., 0., 0., 0.])
tensor([40.]) tensor([19.]) tensor([0.])
tensor([ 0.8163,  0.2603, -0.0000, 40.0000, 19.0000,  0.0000])
tensor([52.]) tensor([10.]) tensor([5.])
tensor([ 8.6667e-01,  1.9608e-01, -1.4826e+15,  5.2000e+01,  1.0000e+01,
         5.0000e+00])
tensor([36.]) tensor([9.]) tensor([2.])
tensor([ 7.2000e-01,  1.5517e-01, -5.7793e+14,  3.6000e+01,  9.0000e+00,
         2.0000e+00])
tensor([53.]) tensor([6.]) tensor([6.])
tensor([ 8.8333e-01,  1.3043e-01, -1.9778e+15,  5.3000e+01,  6.0000e+00,
         6.0000e+00])
tensor([40.]) tensor([5.]) tensor([7.])
tensor([ 7.0175e-01,  9.8039e-02, -2.0751e+15,  4.0000e+01,  5.0000e+00,
         7.0000e+00])
tensor([54.]) tensor([10.]) tensor([5.])
tensor([ 9.1525e-01,  2.0000e-01, -1.4820e+15,  5.4000e+01,  1.0000e+01,
         5.0000e+00])
tensor([48.]) tensor([8.]) tensor([5.])
tensor([ 9.0566e-01,  1.4545e-01, -1.6480e+15,  4.8000e+01,  8.0000e+00,
         5.0000e+00])
tensor([42.]) tensor([8.]) tensor([4.])
tensor([ 7.

In [0]:
def predict_sentiment(model, tokenizer, sentence):
    """Print the model's class probabilities for a single sentence.

    The sentence is tokenized, trimmed to leave room for the two special
    tokens, wrapped in the start/end token ids and run through the model as a
    batch of one.

    Args:
        model: the trained classifier.
        tokenizer: tokenizer providing `tokenize` and `convert_tokens_to_ids`.
        sentence: raw input text.
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    # unsqueeze(0) adds the batch dimension the model expects.
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)
    # Inference only, so no gradient tracking is needed.
    with torch.no_grad():
        # The model is trained with CrossEntropyLoss over mutually exclusive classes,
        # so softmax (not sigmoid) turns its scores into probabilities that sum to 1.
        prediction = torch.softmax(model(tensor), dim = 1)
    print(prediction)
predict_sentiment(model, tokenizer, "This film is terrible")


# In[ ]:


predict_sentiment(model, tokenizer, "This film is great")

In [0]:
# Keep reading sentences from the prompt and print a prediction for each;
# entering a single '$' ends the loop.
for sent in iter(lambda: input('->'), '$'):
  predict_sentiment(model, tokenizer, sent)
