In [11]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

!pip install transformers
!pip install wget
!pip install jsonlines

import transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef

import wget
import json
import jsonlines
import os



In [2]:
##########    Downloading the Cola dataset   ###########

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip' # link for dataset
if not os.path.exists('./cola_public_1.1.zip'):
    wget.download(url, './cola_public_1.1.zip') # download if not already exists

#########   Setting GPU device   #########

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

cuda


In [3]:
###########   Unzip the dataset   ############
if not os.path.exists('./cola_public/'):
    !unzip cola_public_1.1.zip

##########   Parse the data   ###########

def read_and_parse_data(path, turn):
    d = pd.read_csv(path, 
                    delimiter='\t', header=None,
                    names=['sentence_source', 'label', 'label_notes', 'sentence'])

    # data contains 8551 training sentences.
    # if label = 0 => sentence is gramatically incorrect
    d.sample(5) # sample 5 data randomly

    #####  extract sentences and labels ####

    train_data = d.sentence.values  # (8551,)
    label = d.label.values          # (8551,)

    ######  Process data into BERT acceptable format  #####

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                            do_lower_case=True)

    ######  showcasing the work of BERT tokenizer on one example  #######
    print(train_data[2])
    print(tokenizer.tokenize(train_data[2]))

    ######  Tokenize all the sentences, add special symbols at once  #######
    inputs = []  # will store vocab indices for each word in sentences
    for sentence in train_data:
        s = tokenizer.encode(sentence, add_special_tokens=True)
        inputs.append(s)

    print(inputs[2])
    print(len(inputs))  # all 8551 sentences tokenized
    #######   Padding, Mask   ########
    max_length = 64

    # Pad each data with 0's to make it const lenght(64) as needed for BERT
    inputs = pad_sequences(inputs, maxlen=max_length, dtype="long",
                        value=0, truncating="post", padding="post")

    print(inputs[2])

    # Mask differentiates actual word from padding in each sentence
    mask = []
    for s in inputs:
        m = [int(word_id > 0) for word_id in s]
        mask.append(m)

    print(mask[2])  # contains 1 for original words, 0 for padded words

    #######   split data into train and test set  ########
    test_size = 0.1 if turn == "train" else 0.001

    train_set, test_set, train_labels, test_labels = train_test_split(
        inputs, label, random_state=2018, test_size=test_size
    )
    
    train_mask, test_mask, _, _ = train_test_split(mask, label, 
                                                random_state=2018, 
                                                test_size=test_size)

    print("train set size",len(train_set))  # 0.9 times original num of sentences
    print("test set size", len(test_set))   # 0.1 times original num of sentences
    print()

    ########   convert NumPy arrays to Tensor data  ########

    train_set = torch.tensor(train_set)
    test_set = torch.tensor(test_set)
    train_labels = torch.tensor(train_labels)
    test_labels = torch.tensor(test_labels)
    train_mask = torch.tensor(train_mask)
    test_mask = torch.tensor(test_mask)

    print("train data shape", train_set.shape)
    print("test data shape", train_labels.shape)
    print("train mask shape", train_mask.shape)

    #######   Dataloader to load train and test data in batches  ########

    batch_size = 32  # Recommended in paper
    train = TensorDataset(train_set, train_mask, train_labels)
    sampler = RandomSampler(train)
    train_loader = DataLoader(train, batch_size=batch_size, sampler=sampler)

    test = TensorDataset(test_set, test_mask, test_labels)
    samp = RandomSampler(test)
    test_loader = DataLoader(test, batch_size=batch_size, sampler=samp)

    return train_loader, test_loader

    # Now our train and test data loader is processed and ready as needed

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [4]:
#######  Call the dataloader for training data  #########
train_loader, _ = read_and_parse_data("./cola_public/raw/in_domain_train.tsv",
                                   turn="train")

#######   define the model   ###########
bert_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions = False,
    output_hidden_states = False,
)

bert_model = bert_model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


One more pseudo generalization or I'm giving up.
['one', 'more', 'pseudo', 'general', '##ization', 'or', 'i', "'", 'm', 'giving', 'up', '.']
[101, 2028, 2062, 18404, 2236, 3989, 2030, 1045, 1005, 1049, 3228, 2039, 1012, 102]
8551
[  101  2028  2062 18404  2236  3989  2030  1045  1005  1049  3228  2039
  1012   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
train set size 7695
test set size 856

train data shape torch.Size([7695, 64])
test data shape torch.Size([7695])
train mask shape torch.Size([7695, 64])


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [5]:
########  define optimizer, write accuracy fn  ###########

optimizer = AdamW(bert_model.parameters(), lr=2e-5, eps=1e-8)
num_epochs = 4
loss_arr = []

def compute_accuracy(preds, targets):
    return (torch.argmax(preds, dim=1) == targets).float().mean().item()
    

In [6]:
###############   TRAINING OF BERT MODEL  ##################

for ep in range(num_epochs):
    bert_model.train()
    epoch_loss = 0

    for i, data in enumerate(train_loader):
        batch_data = data[0].to(device).long()
        batch_mask = data[1].to(device).long()
        batch_labels = data[2].to(device).long()

        bert_model.zero_grad()
        pred = bert_model(batch_data,
                             token_type_ids=None,
                             attention_mask=batch_mask,
                             labels=batch_labels)
        
        # As we call the model with labels, it returns the loss in a tuple
        loss = pred[0]  
        epoch_loss += loss.item()
        loss.backward()  # Backprpagation

        # Clip Gradient norm to mitigate exploding of gradients
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)
        optimizer.step()

    epoch_loss /= len(train_loader)
    print("train loss after %d epochs is %f " %(ep+1, epoch_loss))
    loss_arr.append(epoch_loss)

    ##########   Validation   ##########


train loss after 1 epochs is 0.478859 
train loss after 2 epochs is 0.302372 
train loss after 3 epochs is 0.186273 
train loss after 4 epochs is 0.135181 


In [7]:
########### Call the dataloader for loading test data  #############

test_loader, _ = read_and_parse_data("./cola_public/raw/out_of_domain_dev.tsv",
                                     turn="test")

print(len(test_loader))
test_acc = 0.0
steps = 0

If Sam was going, Sally would know where.
['if', 'sam', 'was', 'going', ',', 'sally', 'would', 'know', 'where', '.']
[101, 2065, 3520, 2001, 2183, 1010, 8836, 2052, 2113, 2073, 1012, 102]
516
[ 101 2065 3520 2001 2183 1010 8836 2052 2113 2073 1012  102    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
train set size 515
test set size 1

train data shape torch.Size([515, 64])
test data shape torch.Size([515])
train mask shape torch.Size([515, 64])
17


In [27]:
#########   Performance over Test set  ##############

bert_model.eval()
pred_list = []
true_labels = []

for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    batch_data, batch_mask, batch_labels = batch

    with torch.no_grad():
        preds = bert_model(batch_data,
                                token_type_ids=None,
                                attention_mask=batch_mask)

    logits = preds[0]
    logits = logits.detach().cpu()
    targets = batch_labels.to('cpu')

    pred_list.append(logits.numpy())
    true_labels.append(targets.numpy())
    
    acc = compute_accuracy(logits, targets)
    test_acc += acc
    steps += 1

print("final test set accuracy is ", (test_acc / steps))


final test set accuracy is  0.8204656874432283


In [30]:
####### Evaluating the GLUE score(Mathew's Correlation for CoLA)  ########

matthews_set = []
for i in range(len(true_labels)):
  pred_labels_i = np.argmax(pred_list[i], axis=1).flatten()
  
  # Calculates and stores the coef for this batch.  
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
  matthews_set.append(matthews)

flat_predictions = [item for sublist in pred_list for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]
# Calculate Mathew's correlation
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)


MCC: 0.584
