In [1]:
# import torch utils
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import os

# Release cached, currently unused GPU memory held by PyTorch's caching allocator
# so that it shows up as free in `nvidia-smi`
torch.cuda.empty_cache()

# import transformers utils
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification, GPT2TokenizerFast

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# View GPU memory
!nvidia-smi

In [2]:
# function to extract information from .vtt files
import webvtt

def convert_vtt(filenames):
    """Convert WebVTT caption files to CSV transcripts and delete the originals.

    Every file in ``filenames`` is read with ``webvtt.read`` and written to
    ``transcripts/text/<name>.csv`` with the columns ``text``, ``start`` and
    ``stop``. ``<name>`` is the file's base name without its extension.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the ``.vtt`` files to convert.

    Notes
    -----
    Each source file is removed with ``os.remove`` after its CSV is written.
    """
    # Create the directory that is actually written to (the previous check
    # looked at ``./text`` while the CSVs go to ``transcripts/text``)
    out_dir = 'transcripts/text'
    os.makedirs(out_dir, exist_ok=True)
    # extract the text and times from the vtt file
    for file in filenames:
        captions = webvtt.read(file)
        text_time = pd.DataFrame({
            'text': [caption.text for caption in captions],
            'start': [caption.start for caption in captions],
            'stop': [caption.end for caption in captions],
        })
        # Derive the name from the path itself instead of fixed slice offsets,
        # so any directory prefix and extension length works
        stem = os.path.splitext(os.path.basename(file))[0]
        text_time.to_csv('{}/{}.csv'.format(out_dir, stem), index=False)
        # remove files from local drive
        os.remove(file)
        
        
def prepare_data(return_text=False, save_text=True, vtt=False, path='transcripts/corpus.txt', num_samples:int = 10):
    """Build a text corpus from the CSV transcripts in ``transcripts/text``.

    Parameters
    ----------
    return_text : bool
        When True, return the list of transcript strings.
    save_text : bool
        When True, write the transcripts to ``path``, separated by newlines.
    vtt : bool
        When True, first convert ``./transcripts/vtt/*_large.vtt`` with
        ``convert_vtt``.
    path : str
        Destination of the corpus file.
    num_samples : int
        Maximum number of CSV files to read.

    Returns
    -------
    list of str or None
        One string per transcript when ``return_text`` is True, else None.
    """
    if vtt:
        from glob import glob
        convert_vtt(glob("./transcripts/vtt/*_large.vtt", recursive=True))

    text_dir = 'transcripts/text'
    # os.listdir returns names in arbitrary order; sort so the same
    # `num_samples` files are selected on every run
    csv_files = sorted(name for name in os.listdir(text_dir) if name.endswith('.csv'))

    texts = []
    for file in csv_files[:num_samples]:
        df = pd.read_csv('{}/{}'.format(text_dir, file))
        # caption rows are concatenated as-is, without a separator
        texts.append("".join(df.text))

    if save_text:
        with open(path, 'w', encoding='utf-8') as fp:
            # `write` needs a single string; passing the list raised TypeError
            fp.write("\n".join(texts))

    if return_text:
        return texts

In [12]:
# gather the paths of the CSV transcripts in transcripts/text
# (the pattern matches *.csv files, not .vtt files)
from glob import glob
paths = glob("./transcripts/text/*.csv", recursive=True)
# convert_vtt(paths)

In [None]:
# Names (not full paths) of all .csv files in transcripts/text,
# in whatever order os.listdir returns them
csv_files = [os.fsdecode(file) for file in os.listdir('transcripts/text') if os.fsdecode(file).endswith('.csv')]
# take a look at a file name
csv_files[0]

In [None]:
# extract text: one string per transcript, from at most the first 100 CSV files
texts = []
for file in csv_files[:100]:
    df = pd.read_csv('transcripts/text/' + file)
    # caption rows are concatenated without any separator
    # NOTE(review): confirm captions carry their own spacing, otherwise words
    # at caption boundaries are fused together
    text = "".join(df.text)
    texts.append(text)

In [None]:
# Tokenization

# Instantiate tokenizer and pass `gpt2` to the `from_pretrained` method 
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Select the token to use as `pad_token` (the end-of-sequence token is reused)
tokenizer.pad_token = tokenizer.eos_token

# Default to right padding
tokenizer.padding_side = "right"

# Set context length (maximum number of tokens per chunk)
context_length = 128

# Process text: each transcript is cut into chunks of at most `context_length`
# tokens, and all chunks are padded to the longest one in the batch
inputs = tokenizer(texts, 
                   padding='longest',
                   truncation=True,
                   return_tensors="pt",
                   max_length=context_length, # context size GPT-2: 1,024, GPT-3: 2,048
                   return_overflowing_tokens=True, # tokenize input and split into chunks
                   return_length=True, # return length of each created chunk
                   return_special_tokens_mask=True
                  )

# Quick sanity check of the encoded batch: number of chunks, per-chunk lengths,
# which transcript each chunk came from, and the first chunk's ids and mask
print(f"Input IDs length: {len(inputs['input_ids'])}")
print(f"Input chunk lengths: {(inputs['length'])}")
print(f"Chunk mapping: {inputs['overflow_to_sample_mapping']}")
print("input_ids\n", inputs['input_ids'][0])
print("attention_mask\n", inputs['attention_mask'][0])

In [None]:
class DataLoads(Dataset):
    """Map-style dataset that pairs each token-id row with its attention mask."""

    def __init__(self, X, Mask):
        """Keep references to the inputs; ``X`` and ``Mask`` are indexed in parallel."""
        self.x, self.mask = X, Mask

    def __len__(self):
        """Return the number of samples, i.e. ``len`` of the stored ``X``."""
        sample_count = len(self.x)
        return sample_count

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict with ``input_ids`` and ``attention_mask``."""
        return dict(input_ids=self.x[idx], attention_mask=self.mask[idx])

In [None]:
# Split the chunks 80% / 10% / 10% into train / dev / test.
# The split follows the existing chunk order; nothing is shuffled here.
n1 = int(0.8*len(inputs['input_ids']))
n2 = int(0.9*len(inputs['input_ids']))
         
# training portion: first 80%
Xtr = inputs['input_ids'][:n1]
tr_mask = inputs['attention_mask'][:n1]

# development (validation) portion: next 10%
Xdev = inputs['input_ids'][n1:n2]
dev_mask = inputs['attention_mask'][n1:n2]

# test portion: final 10%
Xte = inputs['input_ids'][n2:]
te_mask = inputs['attention_mask'][n2:]

print(f"{Xtr.shape=}")
print(f"{Xdev.shape=}")
print(f"{Xte.shape=}")

In [None]:
# Prepare data for input to transformer
# Note: despite the `_loader` suffix these are `DataLoads` datasets, not DataLoaders
tr_loader = DataLoads(Xtr, tr_mask)
dev_loader = DataLoads(Xdev, dev_mask)

# Batched iterators over the datasets (3 samples per batch); only the
# training data is shuffled
trainset = DataLoader(tr_loader, shuffle=True, batch_size=3)
devset = DataLoader(dev_loader, shuffle=False, batch_size=3)

In [None]:
# Model
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the `gpt2` configuration and override the vocabulary size,
# context length and special-token ids with the tokenizer's values
# NOTE(review): confirm `n_ctx` is still honoured by the installed transformers version
config = AutoConfig.from_pretrained(
    "gpt2", vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Load the pretrained `gpt2` weights using the config defined above
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
# Total number of parameters, reported in millions
model_size = sum(t.numel() for t in model.parameters())
print(f"The model has {model_size/1000**2:.1f}M parameters to tune")

In [None]:
# create data collator
from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal (next-token) language modeling rather than masked LM
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Optimization Loop
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints and logs go to `artifacts`; evaluation and
# logging run every 100 optimizer steps, a checkpoint is saved every 200.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args = TrainingArguments(
    output_dir="artifacts",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=200,
    # fp16 mixed precision needs a CUDA device; this notebook also supports a
    # CPU fallback (see `device`), so only enable it when a GPU is present
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

# `tr_loader` / `dev_loader` are the DataLoads datasets built earlier
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tr_loader,
    eval_dataset=dev_loader,
)

In [None]:
# Run the Trainer optimization loop; checkpoints are written under the
# `output_dir` given to TrainingArguments
trainer.train()

In [None]:
# Inference
# How well does my trained model actually perform?
import torch
from transformers import pipeline

# Load the checkpoint saved at step 200
# NOTE(review): this directory only exists if training reached 200 optimizer steps
pipe = pipeline(
"text-generation", model="artifacts/checkpoint-200")

# Generate one continuation for the prompt and print it
txt = "Artificial Intelligence will"
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
# Generate again for the same prompt, reusing `pipe` from the previous cell
txt = "Artificial Intelligence will"
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
# ========== Advanced Training Loop ================
# Since I'm primarily interested in autocompletion for prompts around meaning,
# I will give more weight to training samples with "meaning" and its synonyms.
# Let's check for these words' existence in the tokenizer vocabulary

keywords = [
    "meaning",  # the target word itself was missing from the synonym list
    "connotation",
    "content",
    "context",
    "definition",
    "effect",
    "essence",
    "explanation",
    "hint",
    "implication",
    "interpretation",
    "nuance",
    "sense",
    "significance",
    "spirit",
    "purpose",
    "direction",
    "subject",
    "substance",
    "understanding",
    "value",
    "intention",
    "aim",
]

keytoken_ids = []
for keyword in keywords:
    # GPT-2's BPE treats a leading space as part of the token, so a word in the
    # middle of a sentence (" sense") has a different id than the same word at
    # the very start of a text ("sense"). Look up both variants.
    for variant in (keyword, " " + keyword):
        ids = tokenizer([variant]).input_ids[0]
        if len(ids) == 1:
            # keep ids unique so no token is counted twice when weighting
            if ids[0] not in keytoken_ids:
                keytoken_ids.append(ids[0])
        else:
            # repr makes the leading-space variant visible in the output
            print(f"Keyword has not single token: {variant!r}")

In [None]:
### You need to study the code below. You initially copied and pasted it into the notebook cell.
### Its value is such that closer inspection is warranted and will pay dividends later, in the form
### of knowledge that can be applied to future problems.

from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Next-token cross-entropy in which samples containing key tokens weigh more.

    The per-token loss is averaged per sample, and each sample's loss is
    multiplied by ``alpha * (1 + number of key-token occurrences in the sample)``
    before the final mean over the batch.

    Parameters
    ----------
    inputs : torch.Tensor
        Token ids; indexed as ``(batch, sequence)`` by the code below.
    logits : torch.Tensor
        Model outputs; indexed as ``(batch, sequence, vocab)`` by the code below.
    keytoken_ids : sequence of int
        Token ids whose occurrences raise a sample's weight. May be empty, in
        which case every sample gets the weight ``alpha``.
    alpha : float
        Global scale applied to all weights.

    Returns
    -------
    torch.Tensor
        Scalar weighted loss.
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss (`reduction="none"` replaces the deprecated `reduce=False`)
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count key-token occurrences per sample; torch.stack rejects an empty list,
    # so an empty `keytoken_ids` is handled explicitly
    if len(keytoken_ids) > 0:
        keytoken_counts = torch.stack(
            [(inputs == kt).float() for kt in keytoken_ids]
        ).sum(dim=[0, 2])
    else:
        keytoken_counts = torch.zeros(
            inputs.size(0), dtype=torch.float, device=inputs.device
        )
    weights = alpha * (1.0 + keytoken_counts)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [None]:
from torch.utils.data.dataloader import DataLoader

# Batched iterators for the custom training loop (32 samples per batch);
# only the training data is shuffled
train_dataloader = DataLoader(tr_loader, batch_size=32, shuffle=True)
eval_dataloader = DataLoader(dev_loader, batch_size=32)

In [None]:
weight_decay = 0.1

def get_grouped_params(
    model,
    no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"),
):
    """Split a model's parameters into weight-decay and no-weight-decay groups.

    Parameters
    ----------
    model
        Object exposing ``named_parameters()`` that yields ``(name, parameter)`` pairs.
    no_decay : iterable of str
        Name fragments; a parameter whose name contains any of them is excluded
        from weight decay. The default covers biases and layer-norm weights,
        including the ``ln_1`` / ``ln_2`` / ``ln_f`` names used by the GPT-2
        implementation, which ``"LayerNorm.weight"`` alone never matches.

    Returns
    -------
    list of dict
        Two optimizer parameter groups: the first uses the module-level
        ``weight_decay``, the second uses ``0.0``.
    """
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [None]:
def evaluate():
    """Compute the mean loss and perplexity over ``eval_dataloader``.

    Uses the module-level ``model``, ``eval_dataloader`` and ``accelerator``.
    The model is switched to eval mode and is NOT switched back; callers that
    continue training must call ``model.train()`` afterwards.

    Returns
    -------
    tuple of (float, float)
        ``(mean loss, exp(mean loss))``. The perplexity is ``inf`` when the
        exponential overflows.
    """
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        # The batch loss is a scalar; give it a length-1 axis so the gathered
        # values can be concatenated (torch.cat rejects 0-dim tensors, which is
        # what a single-process gather would otherwise hand back)
        losses.append(accelerator.gather(outputs.loss.reshape(1)))
    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to inf instead of raising, so no exception handling is needed
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

In [None]:
# Re-create the model from `config` alone (no `from_pretrained`), replacing the
# `model` object used in the Trainer run above
model = GPT2LMHeadModel(config)

In [None]:
from torch.optim import AdamW

# AdamW over the two parameter groups from get_grouped_params
# (with / without weight decay), learning rate 5e-4
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

# `mixed_precision` is the supported way to request fp16 (it supersedes the
# deprecated boolean `fp16` argument). fp16 needs a CUDA device, so fall back
# to full precision when running on CPU.
accelerator = Accelerator(
    mixed_precision="fp16" if torch.cuda.is_available() else "no"
)

# Let accelerate wrap the objects for the current device / precision setup
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
# Number of batches per epoch
# NOTE(review): the training loop only steps the scheduler once every
# `gradient_accumulation_steps` batches, so the scheduler sees fewer steps than
# `num_training_steps` - confirm this is intended
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with warmup
# NOTE(review): 1,000 warmup steps may exceed the number of scheduler steps
# actually taken on a small dataset - confirm
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5000
# Where periodic checkpoints are saved (same directory as the `output_dir`
# given to TrainingArguments). Previously undefined, which raised NameError
# at the first evaluation step.
output_dir = "artifacts"
# Optional Hub repository handle. This notebook never creates `repo`, so pushing
# is skipped unless such an object has been defined in the kernel beforehand.
hub_repo = globals().get("repo")

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    # `step` counts batches (starting at 1), not optimizer updates
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "lr": lr_scheduler.get_last_lr()[0],
                    "steps": completed_steps,
                    # `loss` has not been divided by the accumulation factor
                    # yet, so it is reported as-is
                    "loss/train": loss.item(),
                }
            )
        # Scale so the gradients accumulated over several batches average out
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        # Update the weights only once every `gradient_accumulation_steps` batches
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        # Evaluate and checkpoint every `eval_steps` optimizer updates
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            # evaluate() leaves the model in eval mode
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                if hub_repo is not None:
                    hub_repo.push_to_hub(
                        commit_message=f"Training in progress step {step}", blocking=False
                    )

In [None]:
import torch
from transformers import pipeline

# NOTE(review): this loads the same "artifacts/checkpoint-200" directory as the
# earlier inference cell, whereas the custom training loop saves to its own
# `output_dir` - confirm which model is meant to be evaluated here
pipe = pipeline(
"text-generation", model="artifacts/checkpoint-200")

# Generate one continuation for the prompt and print it
txt = "Artificial intelligence"
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])