## Import libraries

In [None]:
# import torch utils
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import os

# import transformers utils
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification, GPT2TokenizerFast

# Select the compute device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Release cached, currently unused GPU memory held by PyTorch's caching allocator
# (this is a PyTorch call, not an `nvidia-smi` command)
torch.cuda.empty_cache()

## Prepare data

In [None]:
from operations.utils import read_text

def read_text(path:str="./transcripts/corpus", file_name: str="corpus.txt"):
    """Return the whole UTF-8 text file located at `path`/`file_name` as one string."""
    corpus_path = f'{path}/{file_name}'
    with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
        return corpus_file.read()

In [None]:
# Tokenize
from operations.data import tokenize_text
from config.config import logger

def tokenize_text(text, model_name: str="gpt2", padding_side: str="right", context_length: int=256, tokenizer_only=False):
    """Tokenize `text` with a GPT-2 fast tokenizer and split it into chunks.

    Args:
        text: Raw text handed to the tokenizer.
        model_name: Checkpoint name passed to `GPT2TokenizerFast.from_pretrained`.
        padding_side: Side on which padding tokens are added.
        context_length: Maximum number of tokens per chunk (`max_length`).
        tokenizer_only: When truthy, skip tokenization and return only the
            configured tokenizer.

    Returns:
        The configured tokenizer when `tokenizer_only` is truthy; otherwise the
        tokenizer output as PyTorch tensors, including the overflow chunks,
        per-chunk lengths and the special-tokens mask.
    """
    # Instantiate the tokenizer from the requested checkpoint
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    # The end-of-sequence token doubles as the padding token
    tokenizer.pad_token = tokenizer.eos_token
    # Defaults to right padding
    tokenizer.padding_side = padding_side

    if tokenizer_only:
        return tokenizer

    # Process text
    inputs = tokenizer(
        text,
        padding='longest',
        truncation=True,
        return_tensors="pt",
        max_length=context_length,  # context size GPT-2: 1,024, GPT-3: 2,048
        return_overflowing_tokens=True,  # tokenize input and split into chunks
        return_length=True,  # return length of each created chunk
        return_special_tokens_mask=True,
    )

    logger.info(f"Input IDs length: {len(inputs['input_ids'])}")
    logger.info(f"Input chunk lengths: {(inputs['length'])}")
    logger.info(f"Chunk mapping: {inputs['overflow_to_sample_mapping']}")

    return inputs
    
# Read the corpus from its default location and tokenize it into chunks
inputs = tokenize_text(read_text())

In [None]:
# Train/Dev split
from typing import List, Dict
from operations.data import data_split

def data_split(tokens: Dict , train_split: float=0.8, validation_split: float=0.9):
    """Split tokenized chunks into training and validation portions.

    Rows `[0, n1)` form the training set and rows `[n1, n2)` the validation
    set, where `n1 = int(train_split * N)` and `n2 = int(validation_split * N)`.
    Rows from `n2` onwards are not returned by this function.

    Args:
        tokens: Mapping that must contain `input_ids` and `attention_mask`.
        train_split: Fraction of rows at which the training set ends.
        validation_split: Fraction of rows at which the validation set ends.

    Returns:
        Tuple `(Xtr, tr_mask, Xval, val_mask)`.

    Raises:
        AssertionError: If either required key is missing.
    """
    # Check each key separately: `"a" and "b" in d` only tests "b",
    # because the non-empty string "a" is always truthy.
    missing = [key for key in ("input_ids", "attention_mask") if key not in tokens.keys()]
    assert not missing, f"tokens is missing required keys: {missing}"

    n_rows = len(tokens['input_ids'])
    n1 = int(train_split * n_rows)
    n2 = int(validation_split * n_rows)

    Xtr = tokens['input_ids'][:n1]
    tr_mask = tokens['attention_mask'][:n1]

    Xval = tokens['input_ids'][n1:n2]
    val_mask = tokens['attention_mask'][n1:n2]

    logger.info(f"{Xtr.shape=}")
    logger.info(f"{tr_mask.shape=}")
    logger.info(f"{Xval.shape=}")
    logger.info(f"{val_mask.shape=}")

    return Xtr, tr_mask, Xval, val_mask


# Split the tokenized chunks with the default 80% train / next 10% validation boundaries
Xtr, tr_mask, Xval, val_mask = data_split(inputs)

In [None]:
# Data Staging
# from operations.data import DataLoads

class DataLoads(Dataset):
    """Minimal PyTorch `Dataset` pairing token ids with their attention masks."""

    def __init__(self, X, Mask):
        self.x, self.mask = X, Mask

    def __len__(self):
        # The number of examples is the number of rows of token ids.
        return len(self.x)

    def __getitem__(self, idx):
        # One example, keyed the way the model inputs are named.
        return dict(input_ids=self.x[idx], attention_mask=self.mask[idx])

    
def stage_data(xtrain: torch.Tensor,
               xtrain_mask: torch.Tensor,
               xval: torch.Tensor,
               xval_mask: torch.Tensor):
    """Wrap the training and validation tensors in `DataLoads` datasets.

    Args:
        xtrain: Token ids of the training examples.
        xtrain_mask: Attention masks matching `xtrain`.
        xval: Token ids of the validation examples.
        xval_mask: Attention masks matching `xval`.

    Returns:
        Tuple `(train_dataset, val_dataset)` of `DataLoads` instances. These
        are datasets, not `DataLoader`s; batching is left to the caller.
    """
    train_dataset = DataLoads(xtrain, xtrain_mask)
    val_dataset = DataLoads(xval, xval_mask)

    return train_dataset, val_dataset

In [None]:
# Prepare data for input to transformer.
# `stage_data` returns datasets (despite the `*_loader` names) ...
tr_loader, dev_loader = stage_data(Xtr, tr_mask, Xval, val_mask)

# ... which are batched here: training batches are shuffled, validation batches keep their order
trainset = DataLoader(tr_loader, shuffle=True, batch_size=3)
devset = DataLoader(dev_loader, shuffle=False, batch_size=3)

## Optimization loop
### To-do's:
1. Setup pipeline for tracking model metrics and hyperparameters in our designated log (TrainingArguments)
2. ✔ Write function train() which encapsulates the training loop
3. Set-up W&B integrations
4. Set-up Optuna integrations
5. Obtain optimal training parameters 
6. Set-up training of "[gpt2-large](https://huggingface.co/transformers/v2.2.0/pretrained_models.html)" on [Lambda cloud GPUs](https://lambdalabs.com/)
7. Write customized training loop (migrate from Training class)

In [None]:
from operations.utils import read_text
from operations.data import tokenize_text
from operations.data import get_tokenizer
from operations.data import data_split
from operations.data import stage_data
from operations.train import train
from config.config import logger
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import transformers
import torch

# Hyperparameters for the Hugging Face Trainer run below.
# Per-device batch 16 x 4 accumulation steps = 64 examples per optimizer step.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm before running on CPU.
args = TrainingArguments(
    output_dir="artifacts",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=5000,
    logging_steps=200,
    logging_dir='logs/',
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_strategy="steps",
    save_steps=1000,
    fp16=True,
    push_to_hub=False,
)

# Ad-hoc attribute (not a TrainingArguments field) read back when building the model config
args.context_length=256

# Setup
# NOTE(review): `device` is not referenced again in this cell; presumably the Trainer handles placement.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs = tokenize_text(read_text())
xtrain, train_mask, xdev, dev_mask = data_split(inputs)

# Only the first 10 examples of each split are staged (a small smoke-test subset)
train_loader, dev_loader = stage_data(xtrain[:10], train_mask[:10], xdev[:10], dev_mask[:10])

# Tokenization
tokenizer = get_tokenizer()
# Start from the checkpoint's config and override vocabulary size, context length and special-token ids
config = AutoConfig.from_pretrained(
    tokenizer.name_or_path, 
    vocab_size=len(tokenizer),
    n_ctx=args.context_length, 
    bos_token_id=tokenizer.bos_token_id, 
    eos_token_id=tokenizer.eos_token_id
)

In [None]:
from transformers import ProgressCallback

# Batch collator for causal language modeling (mlm=False disables masked-LM token masking)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

class PrinterCallback(ProgressCallback):
    """Callback that logs the metrics, arguments and trainer state on every log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log `logs`, `args` and `state`; print the final training loss when present.

        Args:
            args: Training arguments passed in by the Trainer.
            state: Trainer state; only the local main process logs.
            control: Trainer control object (unused).
            logs: Metrics of the current log event; may be None.
        """
        if logs is None:
            return
        logs.pop("total_flos", None)
        if state.is_local_process_zero:
            logger.info(logs)
            logger.info(args)
            logger.info(state)
            # Not every log event carries 'train_loss' (the run above logs
            # 'loss' and 'eval_loss' too), so indexing it directly raised KeyError.
            if "train_loss" in logs:
                print(logs["train_loss"])

callback = PrinterCallback()

# Model: load the checkpoint named by the tokenizer, applying the customized config
model = GPT2LMHeadModel.from_pretrained(tokenizer.name_or_path, config=config)
# Total parameter count, reported in millions
model_size = sum(t.numel() for t in model.parameters())
logger.info(f"number of model parameters: {model_size/1000**2:.1f}M")

# Training Loop
# `train_loader` / `dev_loader` are the datasets returned by `stage_data`
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=train_loader,
    eval_dataset=dev_loader,
    callbacks=[callback]
)

# Train
trainer.train()

## Model
### To-Do's:
1. Implement a learning rate scheduler
2. Implement customized weight clipping
3. Implement customized optimizer
4. Implement key-token weighted loss function
5. Optimize hyperparameters
6. Train model for 10 epochs to reduce loss below 3.0

In [1]:
from operations.utils import read_text
from operations.data import tokenize_text
from operations.data import get_tokenizer
from operations.data import data_split
from operations.data import stage_data
from operations.train import train
from config.config import logger
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import transformers

# Hyperparameters for the full 5-epoch run: evaluate and checkpoint every 200 steps,
# log the training loss once per epoch.
# Per-device batch 16 x 4 accumulation steps = 64 examples per optimizer step.
args = TrainingArguments(
    output_dir="artifacts",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_strategy="epoch",
    logging_dir='logs/',
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    weight_decay=0.1,
    warmup_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_strategy="steps",
    save_steps=200,
    fp16=True,
    push_to_hub=False,
)

# Ad-hoc attribute (not a TrainingArguments field) carrying the context length
args.context_length=256

# def train(args: transformers.TrainingArguments=args, trial: bool=False, optimize=False):
#     """Training loop for the finetuning of model parameters."""

#     # Setup
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     inputs = tokenize_text(read_text())
#     xtrain, train_mask, xdev, dev_mask = data_split(inputs)

#     if optimize == True:
#         train_loader, dev_loader = stage_data(xtrain[:100], train_mask[:100], xdev[:100], dev_mask[:100])
#     else:
#         train_loader, dev_loader = stage_data(xtrain, train_mask, xdev, dev_mask)

#     # Tokenization
#     tokenizer = get_tokenizer()
#     config = AutoConfig.from_pretrained(
#         tokenizer.name_or_path, 
#         vocab_size=len(tokenizer),
#         n_ctx=args.context_length, 
#         bos_token_id=tokenizer.bos_token_id, 
#         eos_token_id=tokenizer.eos_token_id
#     )
#     data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    
#     # Model
#     model = GPT2LMHeadModel.from_pretrained(tokenizer.name_or_path, config=config)
#     model_size = sum(t.numel() for t in model.parameters())
#     logger.info(f"number of model parameters: {model_size/1000**2:.1f}M")
    
#     # Training Loop
#     trainer = Trainer(
#         model=model,
#         tokenizer=tokenizer,
#         args=args,
#         data_collator=data_collator,
#         train_dataset=train_loader,
#         eval_dataset=dev_loader
#     )
    
#     # Train
#     trainer.train()

# Run the training entry point imported from `operations.train`
# (the commented-out block above looks like its notebook draft — TODO confirm they match)
train(args, optimize=False)

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 26066
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 2035


  0%|          | 0/2035 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,No log,3.240121
400,No log,3.21725
600,3.284500,3.223501
800,3.284500,3.208983
1000,3.061900,3.230841
1200,3.061900,3.220767
1400,2.899000,3.25173
1600,2.899000,3.247832
1800,2.762900,3.269883
2000,2.762900,3.268486


***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.2401208877563477, 'eval_runtime': 23.6545, 'eval_samples_per_second': 137.733, 'eval_steps_per_second': 8.624, 'epoch': 0.49}


Saving model checkpoint to artifacts\checkpoint-200
Configuration saved in artifacts\checkpoint-200\config.json
Model weights saved in artifacts\checkpoint-200\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-200\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-200\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.217250108718872, 'eval_runtime': 23.6908, 'eval_samples_per_second': 137.522, 'eval_steps_per_second': 8.611, 'epoch': 0.98}


Saving model checkpoint to artifacts\checkpoint-400
Configuration saved in artifacts\checkpoint-400\config.json
Model weights saved in artifacts\checkpoint-400\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-400\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-400\special_tokens_map.json


{'loss': 3.2845, 'learning_rate': 0.0004540624090114981, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.223500967025757, 'eval_runtime': 23.6332, 'eval_samples_per_second': 137.857, 'eval_steps_per_second': 8.632, 'epoch': 1.47}


Saving model checkpoint to artifacts\checkpoint-600
Configuration saved in artifacts\checkpoint-600\config.json
Model weights saved in artifacts\checkpoint-600\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-600\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-600\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.2089829444885254, 'eval_runtime': 23.0157, 'eval_samples_per_second': 141.556, 'eval_steps_per_second': 8.864, 'epoch': 1.96}


Saving model checkpoint to artifacts\checkpoint-800
Configuration saved in artifacts\checkpoint-800\config.json
Model weights saved in artifacts\checkpoint-800\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-800\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-800\special_tokens_map.json


{'loss': 3.0619, 'learning_rate': 0.0003294640787536245, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.2308406829833984, 'eval_runtime': 23.0147, 'eval_samples_per_second': 141.562, 'eval_steps_per_second': 8.864, 'epoch': 2.46}


Saving model checkpoint to artifacts\checkpoint-1000
Configuration saved in artifacts\checkpoint-1000\config.json
Model weights saved in artifacts\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-1000\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-1000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.22076678276062, 'eval_runtime': 23.0281, 'eval_samples_per_second': 141.479, 'eval_steps_per_second': 8.859, 'epoch': 2.95}


Saving model checkpoint to artifacts\checkpoint-1200
Configuration saved in artifacts\checkpoint-1200\config.json
Model weights saved in artifacts\checkpoint-1200\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-1200\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-1200\special_tokens_map.json


{'loss': 2.899, 'learning_rate': 0.00017422270212284725, 'epoch': 3.0}


***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.251729726791382, 'eval_runtime': 23.0129, 'eval_samples_per_second': 141.573, 'eval_steps_per_second': 8.865, 'epoch': 3.44}


Saving model checkpoint to artifacts\checkpoint-1400
Configuration saved in artifacts\checkpoint-1400\config.json
Model weights saved in artifacts\checkpoint-1400\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-1400\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-1400\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.2478322982788086, 'eval_runtime': 24.1979, 'eval_samples_per_second': 134.64, 'eval_steps_per_second': 8.43, 'epoch': 3.93}


Saving model checkpoint to artifacts\checkpoint-1600
Configuration saved in artifacts\checkpoint-1600\config.json
Model weights saved in artifacts\checkpoint-1600\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-1600\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-1600\special_tokens_map.json


{'loss': 2.7629, 'learning_rate': 4.820267039763859e-05, 'epoch': 4.0}


***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.269883155822754, 'eval_runtime': 23.6433, 'eval_samples_per_second': 137.798, 'eval_steps_per_second': 8.628, 'epoch': 4.42}


Saving model checkpoint to artifacts\checkpoint-1800
Configuration saved in artifacts\checkpoint-1800\config.json
Model weights saved in artifacts\checkpoint-1800\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-1800\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-1800\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3258
  Batch size = 16


  0%|          | 0/204 [00:00<?, ?it/s]

{'eval_loss': 3.2684857845306396, 'eval_runtime': 23.6799, 'eval_samples_per_second': 137.585, 'eval_steps_per_second': 8.615, 'epoch': 4.91}


Saving model checkpoint to artifacts\checkpoint-2000
Configuration saved in artifacts\checkpoint-2000\config.json
Model weights saved in artifacts\checkpoint-2000\pytorch_model.bin
tokenizer config file saved in artifacts\checkpoint-2000\tokenizer_config.json
Special tokens file saved in artifacts\checkpoint-2000\special_tokens_map.json




Training completed. Do not forget to share your model on huggingface.co/models =)




{'loss': 2.6797, 'learning_rate': 0.0, 'epoch': 5.0}


{'train_runtime': 2886.6057, 'train_samples_per_second': 45.15, 'train_steps_per_second': 0.705, 'train_loss': 2.93760269502457, 'epoch': 5.0}


## Inference
### To-do's:
1. Truncate printed text to last punctuation mark (to prevent user from seeing incomplete sentences)
2. Extend the length of the returned prompt


In [3]:
from transformers import pipeline

def generate_text(text, model="artifacts/checkpoint-2000"):
    """Generate a continuation of `text` with a fine-tuned checkpoint.

    Args:
        text: Prompt to continue.
        model: Path or name of the checkpoint loaded by the text-generation
            pipeline; defaults to the last checkpoint of the training run.

    Returns:
        The generated text of the first returned sequence (prompt included).
    """
    pipe = pipeline("text-generation", model=model)
    # Only the first sequence is returned, so request one instead of
    # generating a second sequence that would be thrown away.
    return pipe(text, num_return_sequences=1)[0]["generated_text"]
    
# Try the fine-tuned model on a sample prompt; the notebook displays the returned string
prompt = "The meaning of life"
generate_text(prompt)

loading configuration file artifacts/checkpoint-2000\config.json
Model config GPT2Config {
  "_name_or_path": "artifacts/checkpoint-2000",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 256,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.22.1",
  "use_cache": true,
  "vocab

'The meaning of life to be this mystery particle that permeates everything around it? Why does the universe die at all here when this is a matter of mystery everywhere around it? Where does death come from? The mystery of the universe is itself an incredibly'

## =========== Advanced Training Loop ==============

In [None]:
# Since I'm primarily interested in autocompletion for prompts around meaning,
# I will give more weight to training samples with "meaning" and its synonyms.
# Let's check for these words' existence in the tokenizer vocabulary

# Collect the token id of every keyword that the tokenizer encodes as exactly
# one token; keywords that split into several tokens are reported and skipped.
keytoken_ids = []
for keyword in [
    "connotation",
    "content",
    "context",
    "definition",
    "effect",
    "essence",
    "explanation",
    "hint",
    "implication",
    "interpretation",
    "nuance",
    "sense",
    "significance",
    "spirit",
    "purpose",
    "direction",
    "subject",
    "substance",
    "understanding",
    "value",
    "intention",
    "aim",
]:
    # Encode the bare keyword on its own
    # NOTE(review): GPT-2 presumably encodes a word differently when preceded by a space — confirm which form is wanted
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

In [None]:
### You need to study the code below. You initially copied and pasted it into the notebook cell.
### Its value is such that closer inspection is warranted, and will pay dividends later in the form
### of knowledge which can be applied to future problems.

from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    """Next-token cross-entropy where samples containing key tokens weigh more.

    Each sample's mean token loss is multiplied by
    `alpha * (1 + number of key-token occurrences in that sample)` and the
    weighted losses are averaged over the batch.

    Args:
        inputs: Token ids of shape (batch, sequence).
        logits: Model logits of shape (batch, sequence, vocabulary).
        keytoken_ids: Token ids whose occurrences increase a sample's weight;
            may be empty, in which case every sample has weight `alpha`.
        alpha: Global scale applied to all weights.

    Returns:
        Scalar tensor holding the weighted average loss.
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Per-token loss; `reduction="none"` replaces the deprecated `reduce=False`
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count key-token occurrences per sample; torch.stack rejects an empty
    # list, so no key tokens means zero occurrences everywhere.
    if len(keytoken_ids) > 0:
        occurrences = torch.stack(
            [(inputs == kt).float() for kt in keytoken_ids]
        ).sum(dim=[0, 2])
    else:
        occurrences = torch.zeros_like(loss_per_sample)
    weights = alpha * (1.0 + occurrences)
    # Weighted average over the batch
    return (loss_per_sample * weights).mean()

In [None]:
from torch.utils.data.dataloader import DataLoader

# Batch the datasets staged in the earlier cell for the custom loop:
# 32 examples per batch, only the training batches are shuffled
train_dataloader = DataLoader(tr_loader, batch_size=32, shuffle=True)
eval_dataloader = DataLoader(dev_loader, batch_size=32)

In [None]:
# Weight-decay coefficient applied to every parameter not matched by `no_decay`
weight_decay = 0.1


def get_grouped_params(model, no_decay=("bias", "LayerNorm.weight")):
    """Split model parameters into weight-decay and no-weight-decay groups.

    Args:
        model: Module whose `named_parameters()` are grouped.
        no_decay: Substrings; a parameter whose name contains any of them is
            excluded from weight decay.

    Returns:
        Two optimizer parameter groups: the first uses the module-level
        `weight_decay`, the second uses 0.0.
    """
    # NOTE(review): GPT-2 layer norms appear to be named `ln_*` rather than
    # `LayerNorm`, so the default may only exclude biases — confirm.
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        exempt = any(nd in name for nd in no_decay)
        (params_without_wd if exempt else params_with_wd).append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [None]:
def evaluate():
    """Compute the mean loss and perplexity of `model` over `eval_dataloader`.

    Uses the notebook globals `model`, `eval_dataloader` and `accelerator`.
    The model is left in evaluation mode; the caller switches back to
    training mode.

    Returns:
        Tuple `(loss, perplexity)` of Python floats, with
        `perplexity = exp(loss)` (`inf` if the exponential overflows).
    """
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
        # The batch loss is a scalar; flatten to 1-d so that torch.cat below
        # also works when gather hands back a 0-d tensor (single process).
        losses.append(accelerator.gather(outputs.loss).reshape(-1))
    loss = torch.mean(torch.cat(losses))
    # torch.exp returns inf on overflow rather than raising, so no
    # try/except is needed here.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

In [None]:
# Build the model from `config` alone.
# NOTE(review): unlike the `from_pretrained` call used earlier, this presumably starts from
# freshly initialized weights rather than the pretrained checkpoint — confirm this is intended.
model = GPT2LMHeadModel(config)

In [None]:
from torch.optim import AdamW

# AdamW over the two parameter groups (with / without weight decay) at learning rate 5e-4
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

# Enable fp16 mixed precision; `mixed_precision="fp16"` replaces the
# deprecated `fp16=True` flag of `Accelerator`.
accelerator = Accelerator(mixed_precision="fp16")

# Let Accelerate wrap the model, optimizer and dataloaders for the active device setup
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
# Counted in dataloader batches.
# NOTE(review): the training loop steps the scheduler once per gradient-accumulation
# cycle, not once per batch, so this presumably overstates the number of scheduler steps — confirm.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear warmup followed by linear decay.
# NOTE(review): check that 1,000 warmup steps do not exceed the number of scheduler steps actually taken.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.notebook import tqdm

# Custom training loop with gradient accumulation, periodic evaluation and checkpointing.
gradient_accumulation_steps = 8
eval_steps = 5000
# Checkpoint directory (same directory the Trainer runs above write to).
output_dir = "artifacts"
# No Hub repository object (`repo`) is created in this notebook, so pushing
# stays off unless one is set up and this flag is enabled.
push_to_hub = False

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    # `step` counts batches within the epoch, starting at 1
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=len(train_dataloader)
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            # The loss has not been divided by the accumulation factor yet,
            # so it is reported as-is.
            # TODO: also report lr, samples seen and completed_steps.
            accelerator.print({"loss/train": loss.item()})
        # Scale so that the accumulated gradient is the average over the micro-batches
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            # One optimizer/scheduler update per accumulation cycle
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            # evaluate() leaves the model in eval mode
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                if push_to_hub:
                    repo.push_to_hub(
                        commit_message=f"Training in progress step {step}", blocking=False
                    )

In [None]:
import torch
from transformers import pipeline

# Load an earlier checkpoint (step 400) into a text-generation pipeline
pipe = pipeline(
"text-generation", model="artifacts/checkpoint-400")

# Generate and print a single continuation of the prompt
txt = "Artificial intelligence"
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])