In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Splits data into batches of defined size
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_errors_location = 'ArithOpsTrain.xlsx'
df = pd.read_excel(file_errors_location)

In [9]:
dataset = []
for idx, row in enumerate(df.iterrows()):
    if idx == 0:
        print(row[1][1], row[1][2], row[1][3], row[1][4], row[1][5])
        continue 
    ans = row[1][3].replace("number0", "0", -1)
    ans = ans.replace("number1", "1", -1)
    ans = ans.replace("number2", "2", -1)
    dataset.append((row[1][1], row[1][2], ans, row[1][4], row[1][5]))

Description Question Equation Input Numbers Output


In [4]:
dataset[0]

('gino has number0 popsicle sticks . i have number1 popsicle sticks .',
 'what is the sum of our popsicle sticks ?',
 '+ number0 number1',
 '63 50',
 113)

In [4]:
def split_indices(n, val_pct):

    # Determine size of Validation set
    n_val = int(val_pct * n)

    # Create random permutation of 0 to n-1
    idxs = np.random.permutation(n)
    return idxs[n_val:], idxs[:n_val]

In [5]:
train_indices, val_indices = split_indices(len(dataset), 0.2)

In [6]:
from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
tok = BartTokenizer.from_pretrained("facebook/bart-large")

In [10]:
# ----------- Batching the data -----------
def collate_fn(instn):

    qc = tok([x[0] + " </s> " + x[1] for x in instn], return_tensors="pt", truncation=False, padding=True)
    #question = [x[1] for x in instn]
    answer = tok([x[2] for x in instn], return_tensors="pt", truncation=False, padding=True)
    input_val = [x[3] for x in instn]
    output = [x[4] for x in instn]

    return (qc, answer, input_val, output)

batch_size = 64

train_sampler   = SubsetRandomSampler(train_indices)
trainloader    = DataLoader(dataset, batch_size, sampler=train_sampler, collate_fn=collate_fn)

val_sampler     = SubsetRandomSampler(val_indices)
valloader      = DataLoader(dataset, batch_size, sampler=val_sampler, collate_fn=collate_fn)

In [11]:
for i in trainloader:
    print(i)
    break

({'input_ids': tensor([[    0,   627, 10485,  ...,     1,     1,     1],
        [    0,   267, 11228,  ...,     1,     1,     1],
        [    0,   119,  4926,  ...,     1,     1,     1],
        ...,
        [    0,   102,  4716,  ...,     1,     1,     1],
        [    0, 38696,   352,  ...,     1,     1,     1],
        [    0,   196,    56,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}, {'input_ids': tensor([[   0,   12,  111,  321,  112,  132,    2,    1],
        [   0, 2744,  321,  112,    2,    1,    1,    1],
        [   0,   12,  321,  112,    2,    1,    1,    1],
        [   0, 2744,  321,  112,    2,    1,    1,    1],
        [   0, 3226, 2055,  112,  132,  321,    2,    1],
        [   0,   12, 2055,  112,  132,  321,    2,    1],
        [   0,   12,  

In [6]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [9]:
sample_txt = 'I want to learn how to do sentiment analysis using BERT and tokenizer.'

# Encode_plus method of tokenizer adds special tokens like seperator[SEP], classifier [CLS], performs padding [PAD] so that BERT knows we are doing classification
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
  truncation = True
)

encoding



{'input_ids': tensor([[  101,   146,  1328,  1106,  3858,  1293,  1106,  1202, 17024,  3622,
          1606,   139,  9637,  1942,  1105, 22559, 17260,   119,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}

In [14]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
last_hidden_state = bert_model(
  input_ids=encoding['input_ids'], 
  attention_mask=encoding['attention_mask']
).last_hidden_state

In [17]:
print(last_hidden_state.shape, pooled_output)

torch.Size([1, 32, 768]) pooler_output


In [112]:
example_english_phrase = "UN Chief Says There Is No mask in Syria <s> <pad> <unk> </s>"
batch = tok(example_english_phrase, return_tensors="pt", truncation=False, padding=True)
print(batch["input_ids"], batch)
generated_ids = torch.argmax(model(batch["input_ids"]).logits, dim =2)
print(generated_ids)
print(tok.batch_decode(generated_ids, skip_special_tokens=True))

example_english_phrase = "+ 0 1"
batch = tok(example_english_phrase, return_tensors="pt", truncation=False, padding=True)
print(batch["input_ids"])

loss_fn = F.cross_entropy
opt = torch.optim.Adam(model.parameters(), lr = 0.00001)

tensor([[    0,  4154,  1231, 15674,   345,  1534,   440, 11445,    11,  1854,
          1437,     0,  1437,     1,  1437,     3,  1437,     2,     2]]) {'input_ids': tensor([[    0,  4154,  1231, 15674,   345,  1534,   440, 11445,    11,  1854,
          1437,     0,  1437,     1,  1437,     3,  1437,     2,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


TypeError: cannot unpack non-iterable int object

In [25]:
device = torch.device("cuda:4")
model.to(device)

loss_fn = F.cross_entropy
opt = torch.optim.Adam(model.parameters(), lr = 0.001)

RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 4; 3.82 GiB total capacity; 1.41 GiB already allocated; 10.62 MiB free; 1.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [24]:
for ep in range(10):

    model = model.train()
  
    for qc, ans, ip, out in tqdm(trainloader):
        loss = 0
        qc_input_ids = qc["input_ids"].to(device)
        qc_attention_mask = qc["attention_mask"].to(device)
        ans_input_ids = ans.to(device)
        #ans_attention_mask = ans["attention_mask"].to(device)

        outputs = model(input_ids=qc_input_ids, attention_mask=qc_attention_mask).logits

        pad = torch.zeros(64, qc_input_ids.shape[1]- ans_input_ids.shape[1]).to(device)
        target = torch.cat((ans_input_ids, pad), dim = 1)
        #print(outputs.shape, ans_input_ids.shape, target.shape)
        #ans_input_ids = ans_input_ids * ans_attention_mask
        #loss = loss_fn(outputs, target)

        #correct_predictions += torch.sum(preds.argmax(dim=1).squeeze() == target)
        #losses.append(loss.item())

        for i in range(1, outputs.shape[1]):
           loss += loss_fn(outputs[:, i], target[:, i-1].long())

        #print(tok.batch_decode(torch.argmax(outputs, dim =2), skip_special_tokens=True))

        loss.backward()
        opt.step()
        opt.zero_grad()
        print(loss)

  0%|          | 0/13 [00:00<?, ?it/s]  0%|          | 0/13 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 92.00 MiB (GPU 2; 39.59 GiB total capacity; 33.56 GiB already allocated; 58.62 MiB free; 34.09 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF