In [1]:
! pip install datasets
! pip install transformers
! pip install huggingface_hub
! pip install evaluate



# Import Essential Libraries

In [2]:
# Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW, lr_scheduler
from torch.cuda.amp import GradScaler, autocast
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
import evaluate

# Init the Hyperparameter

In [3]:
MASTER_CONFIG = {
    "vocab_size": 30592, # 30526 + 1
    "num_epochs":12,
    "batch_size": 8,
    "num_epochs": 12,
    "d_model": 2048,
    "nhead": 8,
    "n_q_head": 16,
    "n_kv_head": 8,
    "dim_feedforward": 2048,
    "num_layers": 12,
    "learning_rate": 3e-4,
    "max_seq_len": 1024,
    "device": 'cuda' if torch.cuda.is_available() else 'cpu'
}
device = MASTER_CONFIG['device']
from google.colab import drive
drive.mount('/content/drive')
llama_path = '/content/drive/MyDrive/MSML612/Final Project/Llama'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and Preprocess the Dataset

In [4]:
# Load the dataset
datasets = load_dataset('Wodeyuanbukongda/SQuard_Chatbot_Llama')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Init the tokenizer
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
tokenizer.add_special_tokens({'bos_token': '[BOS]'})
tokenizer.add_special_tokens({'eos_token': '[EOS]'})
tokenizer.add_special_tokens({'additional_special_tokens': ['[CON]', '[QUE]', '[ANS]']})

# Check the tokenizer
print(tokenizer.special_tokens_map)
print(tokenizer.all_special_ids)

# Split the dataset after preprocessing
datasets = datasets['train'].train_test_split(test_size=0.1)

# Check the structure
print(datasets)

{'bos_token': '[BOS]', 'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[CON]', '[QUE]', '[ANS]']}
[30522, 30523, 100, 102, 0, 101, 103, 30524, 30525, 30526]
DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'ans_index'],
        num_rows: 88191
    })
    test: Dataset({
        features: ['input', 'output', 'ans_index'],
        num_rows: 9800
    })
})


# Dataset class

In [6]:
# Here we do not use batch
# Init the dataset class
class Squad(Dataset):
    def __init__(self, squad):
        super().__init__()
        self.squad = squad

    def __getitem__(self, index):
        # Custom the get method
        item = self.squad[index]
        input = torch.tensor(item['input'])
        output = torch.tensor(item['output'])
        ans_index = torch.tensor(item['ans_index'])
        return {'input': input, 'output': output, 'ans_index': ans_index}

    def __len__(self):
        # Return the total number of training instances
        return len(self.squad)


def generate_mask(sequence):
    seq_len = len(sequence[0]) # Select one element and acqure the sequence
    padding_mask = (sequence == float(tokenizer.pad_token_id))
    attention_mask = (torch.tril(torch.ones(seq_len, seq_len)) == 0)
    return padding_mask, attention_mask


def collate_fn(batch):
    # Collate function used to ensure the input have the same sequence lenght (T)
    # Assume the input is a batch
    input = [item['input'] for item in batch] # [[],[],[]]
    output = [item['output'] for item in batch]
    ans_index = [item['ans_index'] for item in batch]

    # pad_sequence: Pad a list of variable length Tensors with padding_value.
    # The resulting Tensor will have shape (B,T)
    padded_input_seq = pad_sequence( input, batch_first=True, padding_value=tokenizer.pad_token_id )  # [B,T]
    # Generate the mask
    padding_mask_input, attention_mask_input = generate_mask(padded_input_seq)


    padded_output_seq = pad_sequence(output, batch_first=True, padding_value=tokenizer.pad_token_id)
    return {'input': padded_input_seq, 'output': padded_output_seq, 'padding_mask': padding_mask_input,
            'attention_mask': attention_mask_input, 'ans_index': ans_index}


# Create the dataloader
dataset_train = Squad(datasets['train'])
dataset_val = Squad(datasets['test'])
dataloader_train = DataLoader(dataset_train, batch_size=MASTER_CONFIG['batch_size'],
                                  collate_fn = collate_fn, shuffle=True, num_workers=2)
dataloader_val = DataLoader(dataset_val, batch_size=1,
                                  collate_fn = collate_fn, shuffle=False, num_workers=2)
# Check the shape after create the dataloader
for i, batch in enumerate(dataloader_train):
    print(batch.keys())
    print(batch['input'].shape)
    print(batch['output'].shape)
    print(batch['padding_mask'].shape)
    print(batch['attention_mask'].shape)
    break

  self.pid = os.fork()


dict_keys(['input', 'output', 'padding_mask', 'attention_mask', 'ans_index'])
torch.Size([8, 448])
torch.Size([8, 448])
torch.Size([8, 448])
torch.Size([448, 448])


# Constructe the Model

In [34]:
# Init the word embedding layer
class WordEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embedding(x)

# RMS norm layer
class RMSNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.eps = eps

    def forward(self, x):
        # x: [B,T,C]
        # torch.rsqrt = 1 / sqrt(input)
        rms = torch.rsqrt(torch.mean(torch.square(x), dim=-1, keepdim=True) + self.eps)
        return self.alpha * rms * x

# Rotary Embedding layer
class PosEmbedding(nn.Module):
    '''
    In the context of Llama
    We only do the positional encoding for the q, k matrix
    However, due to the multiquery attention, q,k matrix does not have the same shape
    Furthermore, since the Llama percieve each head as a new query
    we do the positional encoding in terms of the head_dim
    q.shape = [B, T, H, d//H]
    k.shape = [B, T, H_kv, d//H]
    '''
    # Here d_model is head_dim
    def __init__(self, seq_len, d_model):
        super().__init__()
        # \theta = 10000^( -2(i-1) / d_model )
        # self.theta.shape = [d/2]
        self.theta = torch.pow(10000.0, -2 * (torch.arange(1, d_model / 2 + 1) - 1) / d_model)

        # Acquire the position
        #self.m.shape = [T]
        self.m = torch.arange(0, seq_len)

        # Acquire the matrix
        # torch.outer(self,m, self.theta).shape = [T, d/2] (compute the outer product)
        # The result will looks like this:
        # [0\theta_1, 0\theta_2
        #  1\theta_1, 1\theta_2
        #  2\theta_1, 2\theta_2
        #  3\theta_1, 3\theta_2]

        # torch.polar:
        # Convert the Cartesian coordinates to polar coordinates
        # out = abs * cos(angle) + abs * sin(angle) * j
        # abs.shape = [T, d/2]
        # angle.shape = [T, d/2]
        # self.matrix.shape = [T, d/2]
        # self.matrix result will looks like:
        # [cos( 0\theta_1 )+sin( 0\theta_1 )*j, cos( 0\theta_2 )+sin( 0\theta_2 )*j
        #  cos( 1\theta_1 )+sin( 1\theta_1 )*j, cos( 1\theta_2 )+sin( 1\theta_2 )*j
        #  ...
        #  cos( T\theta_1 )+sin( T\theta_1 )*j, cos( T\theta_2 )+sin( T\theta_2 )*j]

        # Actual output looks like:
        # tensor([[ 1.0000+0.0000j,  1.0000+0.0000j],
        #         [ 0.5403+0.8415j,  0.9999+0.0100j],
        #         [-0.4161+0.9093j,  0.9998+0.0200j],
        #         [-0.9900+0.1411j,  0.9996+0.0300j]])
        self.matrix = torch.polar(abs=torch.ones_like(torch.outer(self.m, self.theta)),
                                  angle=torch.outer(self.m, self.theta))

    def forward(self, x):
        # x.shape = [B, T, H, d//H] if q
        # x.shape = [B, T, H_kv, d//H] if k
        B, T, h, C = x.shape

        # torch.view_as_complex: Returns a view of input as a complex tensor
        # suppose the input is:
        # tensor([[ 1.6116, -0.5772],
        #         [-1.4606, -0.9120],
        #         [ 0.0786, -1.7497],
        #         [-0.6561, -1.6623]])
        # The output will looks like this:
        # tensor([(1.6116-0.5772j), (-1.4606-0.9120j), (0.0786-1.7497j), (-0.6561-1.6623j)])

        # [B, T, H, d//H] -> [B, T, H, d//H/2, 2] -> [B, T, H, d//H/2]

        # To understand how this code work, I generate a small sample:
        # Suppose [B, T, d] is like this: assume B = 1
        # tensor([[[ 0,  1,  2,  3,  4,  5,  6,  7],
        #         [ 8,  9, 10, 11, 12, 13, 14, 15],
        #         [16, 17, 18, 19, 20, 21, 22, 23],
        #         [24, 25, 26, 27, 28, 29, 30, 31]]])

        # After reshape the [B, T, d] to [B, T, H, d//H] the output like:
        # tensor([[[[ 0.,  1.,  2.,  3.],
        #           [ 4.,  5.,  6.,  7.]],

        #           [[ 8.,  9., 10., 11.],
        #           [12., 13., 14., 15.]],

        #           [[16., 17., 18., 19.],
        #           [20., 21., 22., 23.]],

        #           [[24., 25., 26., 27.],
        #           [28., 29., 30., 31.]]]]

        # Then, we reshape the [B, T, H, d//H] to [B, T, H, d//H/2, 2] the output like:
        # tensor([[[[[ 0.,  1.],
        #           [ 2.,  3.]],

        #           [[ 4.,  5.],
        #           [ 6.,  7.]]],


        #         [[[ 8.,  9.],
        #           [10., 11.]],

        #           [[12., 13.],
        #           [14., 15.]]],


        #         [[[16., 17.],
        #           [18., 19.]],

        #           [[20., 21.],
        #           [22., 23.]]],


        #         [[[24., 25.],
        #           [26., 27.]],

        #           [[28., 29.],
        #           [30., 31.]]]]]

        # Finally, the x_complex is looks like:
        # tensor([[[[ 0.+1.j,  2.+3.j],
        #           [ 4.+5.j,  6.+7.j]],

        #         [[ 8.+9.j, 10.+11.j],
        #           [12.+13.j, 14.+15.j]],

        #         [[16.+17.j, 18.+19.j],
        #           [20.+21.j, 22.+23.j]],

        #         [[24.+25.j, 26.+27.j],
        #           [28.+29.j, 30.+31.j]]]]

        x_complex = torch.view_as_complex(x.reshape(B, T, h, -1, 2))  # [B,T,h,C/2]

        matrix_complex = self.matrix.unsqueeze(0).unsqueeze(2).to(x_complex.device)  # [1,seq_len,1,C/2]
        x_out = x_complex * matrix_complex[:, :T, :, :] #[B, T, H, d//H/2]
        x_out = torch.view_as_real(x_out).reshape(x.shape)
        return x_out

# FeedForward layer
class FeedForward(nn.Module):
  '''
  Here the Input x with shape [B, T, d]
  Note: in the Llama, we do not have a trainable parameter \beta
  but in the paper, we do have a trainable parameter \beta
  where swish = x * \sigma( \beta * x )
  '''
  def __init__(self, d_model, hidden_dim):
      super().__init__()
      self.W = nn.Linear(d_model, hidden_dim)
      self.W2 = nn.Linear(hidden_dim, d_model)
      self.V = nn.Linear(d_model, hidden_dim)

  def forward(self, x):
      swish = F.silu(self.W(x))
      x_v = self.V(x)
      return self.W2(swish * x_v)

# A independent function
def repeat_kv(x, n_rep):
  '''
  Use to let the k, v matrix have the same shape as the q matrix to fit the attention
  q.shape = [B, T, H, d//H]
  k.shape = [B, T, H_kv, d//H]
  v.shape = [B, T, H_kv, d//H]
  n_rep = numbers of times that k,v heads repeat
  '''
  B, T, n_kv_head, head_dim = x.shape
  if n_rep == 1:
      return x
  else:
      return (
          x.unsqueeze(3)
          .expand(B, T, n_kv_head, n_rep,
                  head_dim)  # Returns a new view of the self tensor with singleton dimensions expanded to a larger size.
          .reshape(B, T, n_kv_head * n_rep, head_dim)
      )


# The attention block:
class Attention(nn.Module):
    def __init__(self, d_model, n_q_head, n_kv_head, batch_size, seq_len):
        super().__init__()
        self.head_dim = d_model // n_q_head
        self.n_rep = n_q_head // n_kv_head
        self.wq = nn.Linear(d_model, self.head_dim * n_q_head, bias=False)
        self.wk = nn.Linear(d_model, self.head_dim * n_kv_head, bias=False)
        self.wv = nn.Linear(d_model, self.head_dim * n_kv_head, bias=False)
        self.wo = nn.Linear(n_q_head * self.head_dim, d_model, bias=False)

        self.cache_k = torch.zeros(batch_size, seq_len, n_kv_head, self.head_dim, device=device)
        self.cache_v = torch.zeros(batch_size, seq_len, n_kv_head, self.head_dim, device=device)

        self.pos_emb = PosEmbedding(seq_len, self.head_dim)

    def forward(self, x, attention_mask=None, padding_mask=None, start_pos=None):
        B, T, _ = x.shape
        xq = self.wq(x)  # [B,T,head_dim * n_q_head]
        xk = self.wk(x)  # [B,T,head_dim * n_kv_head]
        xv = self.wv(x)  # [B,T,head_dim * n_kv_head]

        xq = xq.reshape(B, T, -1, self.head_dim) # [B, T, n_q_head, head_dim]
        xk = xk.reshape(B, T, -1, self.head_dim) # [B, T, n_kv_head, head_dim]
        xv = xv.reshape(B, T, -1, self.head_dim) # [B, T, n_kv_head, head_dim]

        # apply rotary position embedding to query and key
        xq = self.pos_emb(xq) # [B, T, n_kv_head, head_dim]
        xk = self.pos_emb(xk) # [B, T, n_kv_head, head_dim]

        # apply kv-cache during inference
        # T is always 1 after prefix
        if not self.training:
            self.cache_k[:B, start_pos:start_pos + T, :, :] = xk
            self.cache_v[:B, start_pos:start_pos + T, :, :] = xv
            xk = self.cache_k[:B, :start_pos + T]
            xv = self.cache_v[:B, :start_pos + T]

        xk = repeat_kv(xk, self.n_rep)
        xv = repeat_kv(xv, self.n_rep)

        xq = xq.transpose(1, 2)  # [B, n_q_head, T, head_dim]
        xk = xk.transpose(1, 2)  # [B, n_q_head, T, head_dim]
        xv = xv.transpose(1, 2)  # [B, n_q_head, T, head_dim]

        combined_mask = None
        # attention mask: [T,T]
        if attention_mask is not None:
            attention_mask = attention_mask.unsqueeze(0).unsqueeze(0).logical_not()  # [1,1,T,T]
            attention_mask = attention_mask.expand(B, xq.shape[1], T, T)
            combined_mask = attention_mask
        if padding_mask is not None:
            padding_mask = padding_mask.unsqueeze(1).unsqueeze(2).logical_not()  # [B,1,1,T]
            padding_mask = padding_mask.expand(B, xq.shape[1], T, T)
            combined_mask = attention_mask & padding_mask
        output = F.scaled_dot_product_attention(xq, xk, xv, attn_mask=combined_mask) # [B, H, T, d // H]

        # attention_score = xq @ xk.transpose(2, 3) / math.sqrt(self.head_dim)  # [B,n_q_head,T,T]
        # # attention mask: [T,T]
        # if attention_mask is not None:
        #     attention_mask = attention_mask.unsqueeze(0).unsqueeze(0)  # [1,1,T,T]
        #     attention_mask = attention_mask.expand(B, xq.shape[1], T, T)
        #     attention_score = attention_score.masked_fill(attention_mask == 1, float('-inf'))
        # # padding_mask: [B,T]
        # if padding_mask is not None:
        #     padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)  # [B,1,1,T]
        #     padding_mask = padding_mask.expand(B, xq.shape[1], T, T)
        #     attention_score = attention_score.masked_fill(padding_mask == 1, float('-inf'))
        # attention_score = F.softmax(attention_score, dim=-1)
        # output = attention_score @ xv  # [B,n_q_head,T,head_dim]

        output = output.transpose(1, 2).contiguous().reshape(B, T, -1)  # [B,T,d]
        return self.wo(output)

# Init the decoder block
class DecoderBlock(nn.Module):
    def __init__(self, d_model, n_q_head, n_kv_head, batch_size, seq_len, hidden_dim):
        super().__init__()
        self.rms_norm1 = RMSNorm(d_model)
        self.attention = Attention(d_model, n_q_head, n_kv_head, batch_size, seq_len)
        self.rms_norm2 = RMSNorm(d_model)
        self.ffw = FeedForward(d_model, hidden_dim)

    def forward(self, x, attention_mask=None, padding_mask=None, start_pos=None):
        x = x + self.attention(self.rms_norm1(x), attention_mask, padding_mask, start_pos)
        x = x + self.ffw(self.rms_norm2(x))
        return x

# Combine them together
class Llama(nn.Module):
    def __init__(self, d_model, n_q_head, n_kv_head, batch_size, seq_len, hidden_dim, vocab_size, n_layers):
        super().__init__()
        self.embd = WordEmbedding(vocab_size, d_model)
        decoder_layer = DecoderBlock(d_model, n_q_head, n_kv_head, batch_size, seq_len, hidden_dim)
        self.decoder = nn.ModuleList([decoder_layer for _ in range(n_layers)])
        self.rms_norm = RMSNorm(d_model)
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x, attention_mask=None, padding_mask=None, start_pos=None):
        x = self.embd(x)
        for layer in self.decoder:
            x = layer(x, attention_mask, padding_mask, start_pos)
        x = self.rms_norm(x)
        x = self.linear(x)
        return x

    def generate(self, context, question, tokenizer):
        self.eval()
        sequence = '[CON]' + context + '[QUE]' + question + '[ANS]'
        seq = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
        ans = seq
        start_pos = len(seq)
        new_token = None
        while new_token != tokenizer.eos_token_id:
            if len(ans) > 32: # max_seq_len:
                break
            input = torch.tensor(seq).unsqueeze(0).to(device)  # [B,T] and B is always 1
            # [B,C] since we only need the last token during auto-regression
            if input.shape[-1] != 1:
                logits = self(input, start_pos=0)[:, -1, :]
            else:
                logits = self(input, start_pos=start_pos)[:, -1, :]
                start_pos += 1
            probs = torch.softmax(logits, dim=-1)
            output = torch.max(probs, dim=-1)[1]  # [B]
            new_token = output.item()
            ans = ans + [new_token]
            seq = [new_token]
        return tokenizer.decode(ans)

    # Used to run the bleu score
    # def generate_evaluation(self, input, tokenizer):
    #     self.eval()
    #     seq = input
    #     ans = input.tolist()
    #     start_pos = len(ans)
    #     temp = start_pos
    #     new_token = None
    #     while new_token != tokenizer.eos_token_id:
    #         if len(ans)-temp > 32: # max_seq_len:
    #           del input
    #           break
    #         input = torch.tensor(seq).unsqueeze(0).to(device)  # [B,T] and B is always 1
    #         # [B,C] since we only need the last token during auto-regression
    #         if input.shape[-1] != 1:
    #             logits = self(input, start_pos=0)[:, -1, :]
    #         else:
    #             logits = self(input, start_pos=start_pos)[:, -1, :]
    #             start_pos += 1
    #         probs = torch.softmax(logits, dim=-1)
    #         output = torch.max(probs, dim=-1)[1]  # [B]
    #         new_token = output.item()
    #         ans = ans + [new_token]
    #         seq = [new_token]
    #     return tokenizer.decode(ans)

    def generate_evaluation(self, input, tokenizer):
      self.eval()
      device = next(self.parameters()).device  # Ensures compatibility with the model's device
      input = input.to(device)  # Move input tensor to the correct device once

      ans = input.tolist()
      start_pos = 0
      max_len = 32

      while True:
          # Generate logits for the last position only using cached states if available
          logits = self(input, start_pos=start_pos)[:, -1, :]
          probs = torch.softmax(logits, dim=-1)
          new_token = torch.argmax(probs, dim=-1).item()

          if new_token == tokenizer.eos_token_id or len(ans) >= max_len:
              break

          ans.append(new_token)
          new_token_tensor = torch.tensor([[new_token]], dtype=torch.long, device=device)
          input = torch.cat([input, new_token_tensor], dim=1)  # Efficiently expand input
          start_pos += 1  # Move start position forward for kv-cache
      return tokenizer.decode(ans)





def build_model(d_model, nhead, dim_feedforward, num_layers, vocab_size, max_seq_len,
                n_q_head, n_kv_head, batch_size):

    model = Llama(d_model, n_q_head, n_kv_head, batch_size, max_seq_len, dim_feedforward, vocab_size, num_layers)

    def initialize_weights(m):
        if isinstance(m, nn.Linear):
            # Apply Xavier initialization for Linear layers
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Embedding):
            # Apply Xavier initialization for Embedding layers
            nn.init.xavier_uniform_(m.weight)
    model.apply(initialize_weights)

    return model


# Train Process

### Init the model

In [35]:
# Save checkpoint function
def save_checkpoint(model, optimizer, scheduler, epoch, path):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'epoch': epoch
    }
    torch.save(checkpoint, path)

# Load checkpoint function
def load_checkpoint(model, optimizer, scheduler, path):
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    epoch = checkpoint['epoch']
    return model, optimizer, scheduler, epoch

# Init the model
model = build_model(MASTER_CONFIG['d_model'], MASTER_CONFIG['nhead'], MASTER_CONFIG['dim_feedforward'],
                    MASTER_CONFIG['num_layers'], MASTER_CONFIG['vocab_size'], MASTER_CONFIG['max_seq_len'], MASTER_CONFIG['n_q_head'],
                    MASTER_CONFIG['n_kv_head'], MASTER_CONFIG['batch_size'])

print('Number of parameters in this model: ', np.sum([p.numel() for p in model.parameters()]))
print('device: ', device)
# Send the model to GPU
model = model.to(device)
# Mix precision training
torch.set_float32_matmul_precision('high')
# Other optimize method
model = torch.compile(model)

optimizer = AdamW(model.parameters(), lr=MASTER_CONFIG['learning_rate'], fused=True)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=MASTER_CONFIG['num_epochs'])
scaler = GradScaler()
# loss_fn = Loss()
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)


Number of parameters in this model:  150513536
device:  cuda


### Load the model

In [9]:
try:
    # Attempt to load the checkpoint
    model, optimizer, scheduler, start_epoch = load_checkpoint(model, optimizer, scheduler, path=llama_path)
except Exception as e:
    # If an error occurs, print the error and continue
    print(f"Failed to load checkpoint from {llama_path}: {e}")
    # Optionally, set start_epoch to 0
    start_epoch = 0

### Training the model

In [10]:

# lossi = []
# count = 0

# for epoch in tqdm(range(start_epoch, MASTER_CONFIG['num_epochs']), desc='epoch'):
#     for batch in tqdm(dataloader_train, desc='training batch'):
#         model.train()
#         optimizer.zero_grad()

#         start = time.time()
#         input = batch['input'].to(device)
#         target = batch['output'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         padding_mask = batch['padding_mask'].to(device)
#         with autocast():
#             predict = model(input, attention_mask, padding_mask)
#             loss = loss_fn(predict.transpose(1, 2), target)

#         scaler.scale(loss).backward()
#         scaler.unscale_(optimizer)
#         norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         scaler.step(optimizer)
#         scaler.update()

#         # loss.backward()
#         # norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#         # optimizer.step()

#         torch.cuda.synchronize()

#         end = time.time()
#         dt = (end-start)*1000

#         if count % 300 == 0:
#             print(f'loss: {loss.item():.4f}', f'time: {dt:.2f}ms', f'gradient norm: {norm.item():.3f}')
#             lossi.append(loss.item())
#             context = "US national day is July 4th."
#             question = 'When is US national day?'
#             model.eval()
#             with torch.no_grad():
#                 result = model.generate(context, question, tokenizer)
#             print(result)
#         count += 1
#     if (epoch+1) % 3 == 0 and epoch != 0:
#       save_checkpoint(model, optimizer, scheduler, epoch, llama_path)

#     scheduler.step()

# np.save(r'training_loss.npy', np.array(lossi))
# plt.figure()
# plt.plot(range(len(lossi)), lossi, label='loss', marker='o')
# plt.xlabel('training process')
# plt.ylabel('loss')
# plt.title('loss over training process')
# plt.legend()
# plt.grid(True)
# plt.show()

# Evaluation

In [11]:
# prediction = []
# target = []

# for batch in tqdm(dataloader_val, desc='testing batch'):
#   # Find the index of the target number
#   indices = torch.nonzero(batch['input'] == 30526)
#   if indices.nelement() == 0:
#       print("Target number not found in the tensor.")
#   else:
#       # Get the first occurrence if there are multiple
#       target_index = indices[0, 1]  # indices[0, 1] because the shape is [B, T] and B=1

#       # Slice the tensor to include elements up to and including the target index
#       input = batch['input'][:, :target_index + 1]

#   pred = model.generate_evaluation(input = input.squeeze(0), tokenizer = tokenizer)
#   prediction.append(pred)

#   # Find the index of the target number
#   indices = torch.nonzero(batch['output'] == 30526)
#   if indices.nelement() == 0:
#       print("Target number not found in the tensor.")
#   else:
#       # Get the first occurrence if there are multiple
#       target_index = indices[0, 1]  # indices[0, 1] because the shape is [B, T] and B=1

#       # Slice the tensor to include elements up to and including the target index
#       output = batch['output'][:, target_index:-1]

#   target.append([tokenizer.decode(output[0].tolist())])
#   break

# bleu = evaluate.load("bleu")
# results = bleu.compute(predictions=prediction, references=target)
# print(results)

In [36]:
prediction = []
target = []
def extract_until_token(tensor, token_id):
    indices = torch.nonzero(tensor == token_id, as_tuple=False)
    # Get the first occurrence if there are multiple
    target_index = indices[0, 1]  # indices[0, 1] because the shape is [B, T] and B=1

    # Slice the tensor to include elements up to and including the target index
    output_tensor = tensor[:, :target_index + 1]
    return output_tensor

def extract_after_token(tensor, token_id):
    indices = torch.nonzero(tensor == token_id, as_tuple=False)
    # Get the first occurrence if there are multiple
    target_index = indices[0, 1]  # indices[0, 1] because the shape is [B, T] and B=1

    # Slice the tensor to include elements up to and including the target index
    output_tensor = tensor[:, target_index+1:]
    return output_tensor

for batch in tqdm(dataloader_val, desc='testing batch'):
    input_tensor = extract_until_token(batch['input'], 30526)
    output_tensor = extract_after_token(batch['output'], 30526)

    pred = model.generate_evaluation(input=input_tensor, tokenizer=tokenizer)
    prediction.append(pred)
    target.append([tokenizer.decode(output_tensor.tolist())])
    break
# bleu = evaluate.load("bleu")
# results = bleu.compute(predictions=prediction, references=target)
# print(results)

testing batch:   0%|          | 0/9800 [00:00<?, ?it/s]

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'