In [3]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so checkpoints written later persist.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
!pip install transformers torch wandb -q 

[K     |████████████████████████████████| 1.8 MB 7.0 MB/s 
[K     |████████████████████████████████| 181 kB 77.4 MB/s 
[K     |████████████████████████████████| 144 kB 69.9 MB/s 
[K     |████████████████████████████████| 63 kB 2.0 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [20]:
import random
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import wandb
import sys
import time
import datetime

from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, random_split
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [21]:
# Pick the compute device once; later cells move tensors with `.to(device)`.
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    device = torch.device("cuda")
    device_name = torch.cuda.get_device_name()
    n_gpu = torch.cuda.device_count()
    print(f"Found device: {device_name}, n_gpu: {n_gpu}")
    print('Training on GPU.')
else:
    # Define `device` on the CPU path too; otherwise every later
    # `.to(device)` call fails with a NameError.
    device = torch.device("cpu")
    n_gpu = 0
    print('No GPU available, training on CPU.')

Found device: Tesla T4, n_gpu: 1
Training on GPU.


In [23]:
# Start a Weights & Biases run; metrics logged below are attached to this project.
WANDB_PROJECT = "Sentiment-Analysis on Movie Reviews"
wandb.init(project=WANDB_PROJECT)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [25]:
def tokenize_and_format(sentences, max_length=64):
    """Tokenize sentences with the uncased BERT tokenizer.

    Every sentence is wrapped in the special '[CLS]' / '[SEP]' tokens and
    padded or truncated to exactly ``max_length`` tokens.

    Args:
        sentences: iterable of strings to encode.
        max_length: fixed token length of every encoded sentence
            (defaults to 64, the value previously hard-coded here).

    Returns:
        A tuple ``(input_ids, attention_masks)`` of two lists with one entry
        per sentence. Each entry is the tokenizer output for a single
        sentence, requested as a PyTorch tensor (``return_tensors='pt'``);
        the caller concatenates them along dim 0.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        # Calling the tokenizer directly replaces the deprecated
        # `encode_plus` method and accepts the same keyword arguments.
        encoded = tokenizer(
            sentence,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=max_length,  # Pad & truncate all sentences.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return input_ids, attention_masks


def split_data(df, split_ratio, input_ids, attention_masks, texts, labels, val_ratio=0.1):
    """Split already-shuffled data into train / validation / test partitions.

    The first ``int(len(df) * split_ratio)`` examples form the training set,
    the next ``int(len(df) * val_ratio)`` the validation set, and whatever
    remains the test set. No shuffling happens here.

    Args:
        df: object whose ``len()`` gives the number of examples.
        split_ratio: fraction of examples used for training.
        input_ids: indexable collection of token-id entries, one per example.
        attention_masks: indexable collection of attention masks, one per example.
        texts: indexable collection of the raw texts, one per example.
        labels: indexable collection of labels, one per example.
        val_ratio: fraction of examples used for validation (default 0.1,
            the value previously hard-coded here).

    Returns:
        ``(train_text, train_set, val_set, val_text, test_set, test_text)``
        where each ``*_set`` is a list of ``(input_ids, attention_mask, label)``
        tuples and each ``*_text`` is the matching list of raw texts.

    Raises:
        ValueError: if a ratio is outside ``[0, 1]`` or the train and
            validation partitions together exceed the dataset.
    """
    if not (0 <= split_ratio <= 1 and 0 <= val_ratio <= 1):
        raise ValueError("split_ratio and val_ratio must both be within [0, 1]")

    total = len(df)
    num_train = int(total * split_ratio)
    val_end = num_train + int(total * val_ratio)
    if val_end > total:
        raise ValueError(
            "train and validation partitions need {} examples but only {} exist".format(val_end, total)
        )

    def _examples(start, stop):
        # One (input_ids, attention_mask, label) tuple per example index.
        return [(input_ids[i], attention_masks[i], labels[i]) for i in range(start, stop)]

    def _texts(start, stop):
        return [texts[i] for i in range(start, stop)]

    train_set, train_text = _examples(0, num_train), _texts(0, num_train)
    val_set, val_text = _examples(num_train, val_end), _texts(num_train, val_end)
    test_set, test_text = _examples(val_end, total), _texts(val_end, total)

    return train_text, train_set, val_set, val_text, test_set, test_text


def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class ids; flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = predicted == expected
    return np.sum(hits) / len(expected)


In [28]:
# Fixed seed so the shuffle, and therefore the train/val/test split built from
# it, is identical on every "Restart & Run All".
SEED = 42
NUM_EXAMPLES = 10000  # size of the subsample kept after shuffling

train_df = pd.read_csv('./train.tsv', delimiter="\t")
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
train_df = train_df[:NUM_EXAMPLES]
texts = train_df.Phrase.values
labels = train_df.Sentiment.values

# tokenize_and_format returns one tensor per sentence; concatenate them along
# dim 0 into a single tensor per field.
input_ids, attention_masks = tokenize_and_format(texts)
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)


In [29]:
# Sanity check: show the first example as raw text, as token IDs, and its label.
for caption, value in (
    ('Original: ', texts[0]),
    ('Token IDs:', input_ids[0]),
    ('Label:', labels[0]),
):
    print(caption, value)

Original:  definitely
Token IDs: tensor([ 101, 5791,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
Label: tensor(3)


In [30]:
# Build the train / validation / test partitions with the shared helper rather
# than repeating its body here (the dataframe was already shuffled above).
train_ratio = 0.8
val_ratio = 0.1  # split_data holds out 10% of the examples for validation

train_text, train_set, val_set, val_text, test_set, test_text = split_data(
    train_df, train_ratio, input_ids, attention_masks, texts, labels
)

# Keep the size variables this cell has always exposed to later cells.
total = len(train_df)
num_train = len(train_set)
num_val = len(val_set)
num_test = len(test_set)

# The three partitions must cover every example exactly once.
assert num_train + num_val + num_test == total


In [31]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 5, # The number of output labels.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the GPU when one is available and stay on the CPU
# otherwise; an unconditional `model.cuda()` raises on CPU-only machines.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [32]:
batch_size = 64
epochs = 30

# torch.optim.AdamW replaces the deprecated transformers.AdamW implementation.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-6,
    eps=1e-8,
    # transformers.AdamW defaulted to no weight decay, whereas the torch
    # version defaults to 0.01; pass 0.0 explicitly to keep the old setting.
    weight_decay=0.0,
)



In [33]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(val_set):
    """Measure the accuracy of the notebook-level ``model`` on ``val_set``.

    Reads the globals ``model``, ``batch_size`` and ``device``. The model is
    switched to evaluation mode and is left in that mode on return.

    Args:
        val_set: list of ``(input_ids, attention_mask, label)`` tensor tuples.

    Returns:
        ``(accuracy, correct)`` where ``accuracy`` is the fraction of examples
        whose arg-max prediction equals the label and ``correct`` is a float
        NumPy array holding 1.0 / 0.0 per example, in input order. For an
        empty ``val_set`` the accuracy is NaN and ``correct`` is empty.
    """
    # Put the model in evaluation mode
    model.eval()

    # Accuracy is undefined without examples; avoid dividing by zero.
    if len(val_set) == 0:
        return float("nan"), np.array([])

    per_batch_correct = []

    # Step through the set one batch at a time; the last batch may be shorter.
    for start in range(0, len(val_set), batch_size):
        batch = val_set[start:start + batch_size]

        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        # Labels are only compared on the CPU, so they are not moved to `device`.
        b_labels = torch.stack([data[2] for data in batch])

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs.logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        labels_flat = b_labels.cpu().numpy().flatten()
        per_batch_correct.append(pred_flat == labels_flat)

    # Concatenate once instead of growing an array with np.append per batch.
    correct = np.concatenate(per_batch_correct).astype(float)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = correct.sum() / len(val_set)
    return avg_val_accuracy, correct



In [35]:
# Define the loss function
criterion = nn.CrossEntropyLoss()

# Running count of optimizer updates across all epochs.
step = 0

# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode (validation below switches it to eval).
    model.train()

    # Step through the training set one batch at a time; the last batch may
    # be shorter than batch_size.
    for start in range(0, len(train_set), batch_size):
        batch = train_set[start:start + batch_size]
        step += 1

        # Stack the per-example tensors into batch tensors on the target device.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch]).to(device)

        # Clear the previously calculated gradient
        model.zero_grad()

        # Labels are not passed to the model: the loss comes from `criterion`,
        # so an internally computed loss would only be thrown away.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs.logits
        loss = criterion(logits, b_labels)

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Log all per-step metrics in one call: by default every wandb.log
        # call advances the W&B step, so separate calls would put each metric
        # on a different step.
        wandb.log({"train loss": batch_loss, "step": step, "epoch": epoch_i + 1})

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure performance on
    # the validation set.
    print(f"Total loss: {total_train_loss}")
    val_acc, co = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")
    wandb.log({"validation accuracy": val_acc, "epoch": epoch_i + 1})

    # Checkpoint every fifth epoch, overwriting the previous file.
    # NOTE(review): this pickles the whole module; saving
    # `model.state_dict()` is more portable, but the code that loads the
    # checkpoint is not visible here, so the format is left unchanged.
    if (epoch_i + 1) % 5 == 0:
        print("Saving the checkpoint..")
        torch.save(model, '/content/drive/MyDrive/cs520-ml-toolkits/checkpoints/model.pt')

print("")
print("Training complete!")



Training...
Total loss: 185.54606139659882
Validation accuracy: 0.513

Training...
Total loss: 168.6389697790146
Validation accuracy: 0.513

Training...
Total loss: 159.30123031139374
Validation accuracy: 0.513

Training...
Total loss: 151.09190034866333
Validation accuracy: 0.548

Training...
Total loss: 141.2774149775505
Validation accuracy: 0.566
Saving the checkpoint..

Training...
Total loss: 132.83572667837143
Validation accuracy: 0.587

Training...
Total loss: 126.7534784078598
Validation accuracy: 0.605

Training...
Total loss: 122.06436908245087
Validation accuracy: 0.611

Training...
Total loss: 118.36743462085724
Validation accuracy: 0.618

Training...
Total loss: 114.95693343877792
Validation accuracy: 0.631
Saving the checkpoint..

Training...
Total loss: 112.61086148023605
Validation accuracy: 0.632

Training...
Total loss: 109.56425058841705
Validation accuracy: 0.636

Training...
Total loss: 106.63640505075455
Validation accuracy: 0.641

Training...
Total loss: 104.215