<a href="https://colab.research.google.com/github/krisdmitrieva/DL_HW/blob/main/DL_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np

from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [None]:
class DistractorDataset(Dataset):
    def __init__(self, distractor_dataset_path = 'train.json'):
        super().__init__()

        self.distractor_list = []
        self.end_of_text_token = "<|endoftext|>"

        with open('train.json') as json_file:
            train_data = json.load(json_file)

            x = 0
            for line in train_data:
                dist1_str = f"<question>{line['question']}<key>{line['correct_answer']}<distractor>{line['distractor1']}{self.end_of_text_token}"
                self.distractor_list.append(dist1_str)
                dist2_str = f"<question>{line['question']}<key>{line['correct_answer']}<distractor>{line['distractor2']}{self.end_of_text_token}"
                self.distractor_list.append(dist2_str)
                dist3_str = f"<question>{line['question']}<key>{line['correct_answer']}<distractor>{line['distractor3']}{self.end_of_text_token}"
                self.distractor_list.append(dist3_str)

    def __len__(self):
        return len(self.distractor_list)

    def __getitem__(self, item):
        return self.distractor_list[item]

In [16]:
dataset = DistractorDataset()
distractor_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [17]:
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [18]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_distractor_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for idx, distractor in enumerate(distractor_loader):

        #################### "Fit as many distractor sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        distractor_tens = torch.tensor(tokenizer.encode(distractor[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if distractor_tens.size()[1] > MAX_SEQ_LEN:
            continue

        #The first distractor sequence in the sequence
        if not torch.is_tensor(tmp_distractor_tens):
            tmp_distractor_tens = distractor_tens
            continue
        else:
            #The next distractor does not fit in so we process the sequence and leave the last distractor
            #as the start for next sequence
            if tmp_distractor_tens.size()[1] + distractor_tens.size()[1] > MAX_SEQ_LEN:
                work_distractor_tens = tmp_distractor_tens
                tmp_distractor_tens = distractor_tens
            else:
                #Add the distractor to sequence, continue and try to add more
                tmp_distractor_tens = torch.cat([tmp_distractor_tens, distractor_tens[:,1:]], dim=1)
                continue
        ################## Sequence ready, process it trough the model ##################

        outputs = model(work_distractor_tens, labels=work_distractor_tens)
        loss, logits = outputs[:2]
        loss.backward()
        sum_loss = sum_loss + loss.detach().data

        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_distractor_{epoch}.pt"))


sum loss 4664.65673828125
sum loss 4202.18310546875
sum loss 3829.84765625
sum loss 3677.920166015625
sum loss 3562.058349609375
sum loss 3481.801025390625
sum loss 3400.420654296875
sum loss 3340.52197265625


In [19]:
MODEL_EPOCH = 4

models_folder = "trained_models"

model_path = os.path.join(models_folder, f"gpt2_medium_distractor_{MODEL_EPOCH}.pt")
model.load_state_dict(torch.load(model_path))

distractors_output_file_path = f'generated_{MODEL_EPOCH}.distractor'

model.eval()
if os.path.exists(distractors_output_file_path):
    os.remove(distractors_output_file_path)

distractor_num = 0

custom_questions = ['<question>During the period between 1979 and 1998, what is the farthest planet from the sun?<key>Neptune<distractor>',
             '<question>During the period between 1979 and 1998, what is the farthest planet from the sun?<key>Neptune<distractor>',
             '<question>During the period between 1979 and 1998, what is the farthest planet from the sun?<key>Neptune<distractor>',
             '<question>What gas is the main component of the atmosphere of Mars?<key>Carbon dioxide<distractor>',
             '<question>What gas is the main component of the atmosphere of Mars?<key>Carbon dioxide<distractor>',
             '<question>What gas is the main component of the atmosphere of Mars?<key>Carbon dioxide<distractor>',
             '<question>Who was the first scientist to discover Electrons?<key>J.J Thompson<distractor>',
             '<question>Who was the first scientist to discover Electrons?<key>J.J Thompson<distractor>',
             '<question>Who was the first scientist to discover Electrons?<key>J.J Thompson<distractor>'
 ]

with torch.no_grad():

        for question in custom_questions:

            distractor_finished = False

            cur_ids = torch.tensor(tokenizer.encode(question)).unsqueeze(0).to(device)

            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    distractor_finished = True
                    break


            if distractor_finished:

                distractor_num = distractor_num + 1

                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                print(output_text)

                with open(distractors_output_file_path, 'a') as f:
                    f.write(f"{output_text} \n\n")

<question>During the period between 1979 and 1998, what is the farthest planet from the sun?<key>Neptune<distractor>Uranus<|endoftext|>
<question>During the period between 1979 and 1998, what is the farthest planet from the sun?<key>Neptune<distractor>Venus<|endoftext|>
<question>During the period between 1979 and 1998, what is the farthest planet from the sun?<key>Neptune<distractor>Earth<|endoftext|>
<question>What gas is the main component of the atmosphere of Mars?<key>Carbon dioxide<distractor>Oxygen<|endoftext|>
<question>What gas is the main component of the atmosphere of Mars?<key>Carbon dioxide<distractor>Hydrogen<|endoftext|>
<question>What gas is the main component of the atmosphere of Mars?<key>Carbon dioxide<distractor>Nitrogen<|endoftext|>
<question>Who was the first scientist to discover Electrons?<key>J.J Thompson<distractor>Albert Einstein<|endoftext|>
<question>Who was the first scientist to discover Electrons?<key>J.J Thompson<distractor>J.B.S. Lewis<|endoftext|>
<qu