In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
from datasets import load_dataset
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


Load Dataset

In [2]:
finetuning_dataset_name = "lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_name, split="train")
pprint(finetuning_dataset[1])

{'answer': 'Yes, the code includes methods for submitting jobs, checking job '
           'status, and retrieving job results. It also includes a method for '
           'canceling jobs. Additionally, there is a method for sampling '
           'multiple outputs from a model, which could be useful for '
           'long-running tasks.',
 'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                 

In [3]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")


In [16]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"]  + examples["answer"]
    elif "input" in examples and "output" in examples:
      text = examples["input"] + examples["output"]
    else:
      text = examples["text"][0]

    tokenizer.pad_token =  tokenizer.eos_token
    
    max_length = 2048
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

    return tokenized_inputs#, text

In [17]:
tokenized = tokenize_function(finetuning_dataset[3])
print(tokenized)
# print(text)

{'input_ids': tensor([[10795,   253,  2634,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


In [18]:
def tokenize_and_split_dataset(dataset):
    tokenized_dataset = dataset.map(
        tokenize_function,
        # batched=True,
        # batch_size=1,
        drop_last_batch=True,
    )

    # print(tokenized_dataset)

    processed_data = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)

    return processed_data
    # return tokenized_dataset

In [19]:
dataset = tokenize_and_split_dataset(finetuning_dataset)
dataset

Map: 100%|██████████| 1260/1260 [00:00<00:00, 1264.41 examples/s]


DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
})

In [20]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1134
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 126
})


Load Model

In [21]:
base_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
device_count = torch.cuda.device_count()
if device_count > 0:
    print("Select GPU device")
    device = torch.device("cuda")
elif torch.mps.is_available():
    print("Select MPS device")
    device = torch.device("mps")
else:
    print("Select CPU device")
    device = torch.device("cpu")

base_model.to(device)

Select MPS device


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [22]:
def inference(text, model, tokenizer, max_input_tokens=1024, max_output_tokens=1024):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

Try Base Model

In [11]:
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Question input (test): Does Lamini support named entity recognition and extraction?
Correct answer from Lamini docs: Yes, Lamini supports named entity recognition and extraction.
Model's answer: 


A:

I think you're looking for a way to extract the data from the database.  You can use the following to extract the data from the database:
SELECT * FROM `database` WHERE `id` = 1 AND `name` = 1 AND `name` = 1 AND `name` = 1 AND `name` = 1 AND `name` = 1 AND `name` = 1 AND `name`


Training

In [12]:
from transformers import Trainer, TrainingArguments

In [13]:
max_steps = 3
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

In [23]:
training_args = TrainingArguments(
  learning_rate=1.0e-5,
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,
  per_device_train_batch_size=1,
  output_dir=output_dir,
  overwrite_output_dir=False,
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1,
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



In [24]:
model_flops = (
    base_model.floating_point_ops(
        {
            "input_ids": torch.zeros((1, 2048)),
        }
    ) * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [27]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [29]:
training_output = trainer.train()
print('Training complete')

ValueError: too many values to unpack (expected 3)

In [None]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

In [None]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device) 

In [None]:
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

In [None]:
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

In [None]:
## same models after 2 full epochs
finetuned_longer_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
tokenizer = AutoTokenizer.from_pretrained("lamini/lamini_docs_finetuned")

finetuned_longer_model.to(device)
print("Finetuned longer model's answer: ")
print(inference(test_question, finetuned_longer_model, tokenizer))