In [12]:
!pip install transformers



You should consider upgrading via the 'c:\users\mohni\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [14]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import torch
from torch.utils.data import DataLoader
from transformers import AdamW

In [15]:
class PolicyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        }

    def __len__(self):
        return len(self.encodings.input_ids)

In [16]:
def read_json(filename):
    with open(filename) as file:
        data = json.load(file)
    return data

In [17]:
tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

In [18]:
def add_token_positions(encodings, answers):
    start_positions, end_positions = list(), list()
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]["start_answer"]))
        end_positions.append(encodings.char_to_token(i, answers[i]["end_answer"]))

        # print(tokenizer.char_to_word(start_positions[-1]))
        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

        # if end position is None, the 'char_to_token' function points to the space before the correct token - > add + 1

        if end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['end_answer'] - 1)

    encodings.update({
      'start_positions': start_positions,
      'end_positions': end_positions
      })

In [19]:
data = read_json("training_set.json")

In [20]:
contexts, questions, answers = list(), list(), list()

for topicID in data:
    context = data[topicID]["context"]
    for qa in data[topicID]["qas"]:
        contexts.append(context)
        questions.append(data[topicID]["qas"][qa]["question"])
        answers.append(data[topicID]["qas"][qa]["answer"])


In [23]:
flag = True
for answer, context in zip(answers, contexts):
    if context[answer["start_answer"]:answer["end_answer"]] != answer["answer"]:
        flag = False
        break
print(flag)

True


In [22]:
train_encodings = tokenizer(contexts, questions, truncation=True, padding=True)
add_token_positions(train_encodings, answers)
print(train_encodings["start_positions"],"\n", train_encodings["end_positions"])

[38, 49, 38, 1, 47, 1, 14, 39, 57, 512, 1, 1, 1, 1, 1, 1, 512, 512, 1, 58, 58, 1, 27, 1, 14, 30, 125, 512, 512, 512, 45, 1, 35, 67, 512, 179, 242, 1, 40, 73, 512, 13, 512, 512, 41, 72, 512, 101, 132, 206, 255, 512, 1, 512, 34, 512, 181, 215, 234, 512, 512, 283, 78, 115, 306, 372, 189, 1, 37, 151, 196, 231, 1, 23, 111, 512, 300, 512, 1, 58, 512, 234, 512, 512, 38, 1] 
 [54, 54, 91, 46, 89, 9, 27, 52, 71, 91, 59, 43, 26, 26, 26, 69, 37, 69, 23, 83, 83, 38, 47, 13, 29, 86, 200, 227, 90, 90, 71, 34, 67, 96, 178, 214, 295, 36, 73, 92, 170, 26, 26, 40, 62, 100, 71, 132, 152, 231, 378, 434, 92, 212, 87, 152, 214, 234, 327, 260, 260, 327, 107, 182, 371, 391, 258, 36, 150, 195, 213, 279, 22, 61, 197, 267, 331, 420, 58, 163, 204, 306, 355, 70, 76, 76]


In [24]:
train_dataset = PolicyDataset(train_encodings)

In [26]:
# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
print(torch.cuda.is_available())
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
optim = AdamW(model.parameters(), lr = 5e-5)

for epoch in range(10):
    loss_epoch = 0
    for batch in train_loader:
        print(1)
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask = attention_mask, start_positions = start_positions, end_positions = end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loss_epoch += loss.item()
    print("epoch-->",epoch,"loss -->",loss_epoch/len(train_loader))


True
1


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 155520000 bytes. Buy new RAM!

In [21]:
model.save_pretrained("/content/drive/My Drive/bert_model/bert_policy_documents")