<a href="https://colab.research.google.com/github/htried/wikimedia-llm-finetune/blob/main/mpt_7b_instruct_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To prevent Colab from timing out, open the browser's developer console and paste and run the following snippet:

```
// Keep-alive loop: once every 60 seconds, click Colab's connect button, then
// click it a second time one second later. The timer id is stored in
// `interval`, so the loop can be stopped with clearInterval(interval).
interval = setInterval(function() {
    console.log("working")
    // The "#connect" element is looked up inside the shadow root of the
    // <colab-connect-button> element found in the top toolbar.
    var selector = "#top-toolbar > colab-connect-button"
    document.querySelector(selector).shadowRoot.querySelector("#connect").click()
    // Second click, delayed by 1000 ms.
    setTimeout(function() {
            document.querySelector(selector).shadowRoot.querySelector("#connect").click()
    }, 1000)
}, 60*1000)
```

# Install

In [1]:
!pip install --upgrade transformers einops datasets accelerate

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.3-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/

# Import

In [2]:
import torch
import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AdamW
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from typing import Any, Dict, Tuple, Union
import warnings
from threading import Event, Thread
import textwrap
from datasets import concatenate_datasets


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Default compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set up

In [3]:
# Log in to the Hugging Face Hub. The command prompts interactively for an
# access token; train() below ends by calling push_to_hub, so the token is
# presumably expected to have write permission.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [4]:
# Prompt pieces used by `Prompter`: a format string with `{instruction}` and
# `{input}` placeholders, plus the marker that separates prompt from response.
prompt_template = dict(
    description="A shorter template to experiment with.",
    prompt_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:\n"
    ),
    response_split="### Response:",
)

class Prompter(object):
    """Builds prompts from the module-level ``prompt_template`` and parses replies.

    Attributes:
        template: The template dict; ``"prompt_input"`` is a format string with
            ``{instruction}`` and ``{input}`` placeholders and
            ``"response_split"`` is the marker preceding the response.
    """

    def __init__(self, verbose: bool = False):
        """Store the template; when ``verbose`` is true, print its description."""
        self._verbose = verbose
        self.template = prompt_template
        if self._verbose:
            print(
                f"Using prompt template alpaca: {self.template['description']}"
            )

    def generate_prompt(
        self,
        instruction: str,
        input: str,
        output: Union[str, None] = None,
    ) -> str:
        """Render the prompt for one example.

        Args:
            instruction: Text substituted for ``{instruction}``.
            input: Text substituted for ``{input}``.
            output: Optional label/response. When given it is appended directly
                after the rendered prompt (training use); when ``None`` only the
                prompt is returned (inference use).

        Returns:
            The rendered prompt, followed by ``output`` if one was provided.
        """
        res = self.template["prompt_input"].format(
            instruction=instruction, input=input
        )

        if self._verbose:
            print(res)
        if output is None:
            # No label supplied: return the bare prompt instead of appending
            # the literal text "None".
            return res
        return f"{res}{output}"

    def get_response(self, output: str) -> str:
        """Return the text following the first response marker, stripped.

        Args:
            output: Full generated text, including the prompt.

        Returns:
            Everything after the first ``response_split`` marker with
            surrounding whitespace removed.

        Raises:
            IndexError: If ``output`` does not contain the marker.
        """
        # maxsplit=1 keeps the whole response even when the generated text
        # itself repeats the marker.
        return output.split(self.template["response_split"], 1)[1].strip()

In [5]:
def _is_trainable_param(param_name, trainable_blocks):
    """Tell whether a parameter belongs to one of the blocks left trainable.

    The name is split on "." and compared component-wise, so "22" only matches
    a component that is exactly "22" rather than any name containing those
    digits.
    NOTE(review): assumes the block index appears as its own dotted component
    of the parameter name — confirm for the model being trained.
    """
    wanted = {str(block_idx) for block_idx in trainable_blocks}
    return any(part in wanted for part in param_name.split("."))


def train(
    base_model='mosaicml/mpt-7b-instruct',
    cutoff_len=256,
    data_path='htriedman/wiki-sparql',
    train_data_size=100_000,
    val_data_size=10_000,
    num_epochs=3,
    learning_rate=3e-4,
    trainable_blocks=tuple(range(22, 32)),
    seed=42,
):
    """Fine-tune a causal LM on an instruction dataset and publish the result.

    Loads ``base_model`` and its tokenizer, freezes every parameter outside
    ``trainable_blocks``, tokenizes prompts built by ``Prompter``, trains with
    ``transformers.Trainer``, then saves model and tokenizer to ``./results``
    and pushes both to ``htriedman/mpt-7b-instruct-wiki-sparql``.

    Args:
        base_model: Hub id (or path) of the model and tokenizer to load.
        cutoff_len: Maximum number of tokens per example; longer prompts are
            truncated.
        data_path: Dataset id passed to ``load_dataset``; its ``train`` and
            ``test`` splits are used, and rows must provide ``instruction``,
            ``input`` and ``output``.
        train_data_size: Number of shuffled ``train`` rows to train on.
        val_data_size: Number of shuffled ``test`` rows to evaluate on.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate given to ``TrainingArguments``.
        trainable_blocks: Block indices whose parameters stay trainable; all
            other parameters are frozen. The default (22..31) is the set the
            function previously hard-coded.
        seed: Seed for shuffling both splits so the selected subsets are
            reproducible; ``None`` gives a different shuffle on each run.

    Raises:
        IndexError: If a requested size exceeds the number of rows in its split.
    """
    prompter = Prompter()

    model = transformers.AutoModelForCausalLM.from_pretrained(
        base_model,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # Each entry is a separate token (the original list was missing the comma
    # between '\t' and '(', which silently produced the single token '\t(').
    tokenizer.add_tokens(['{', '}', '[', ']', '\n', '\t', '(', ')'])
    # Grow the embedding matrix only if the added tokens do not already fit.
    if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
        model.resize_token_embeddings(len(tokenizer))

    if tokenizer.pad_token_id is None:
        warnings.warn(
            "pad_token_id is not set for the tokenizer. Using eos_token_id as pad_token_id."
        )
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device=device, dtype=torch.bfloat16)

    # Freeze everything that is not part of a trainable block.
    for name, param in model.named_parameters():
        if not _is_trainable_param(name, trainable_blocks):
            param.requires_grad = False

    def tokenize(prompt, add_eos_token=True):
        """Tokenize ``prompt`` (truncated to ``cutoff_len``) and attach labels."""
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        # Append EOS only when it is missing and there is still room for it.
        if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        # Labels are a copy of the inputs: the loss covers prompt and response.
        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point):
        """Render one dataset row into a full prompt and tokenize it."""
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
        return tokenize(full_prompt)

    data = load_dataset(data_path)
    train_data = (
        data['train'].shuffle(seed=seed)
        .select(range(train_data_size))
        .map(generate_and_tokenize_prompt)
    )

    val_data = (
        data['test'].shuffle(seed=seed)
        .select(range(val_data_size))
        .map(generate_and_tokenize_prompt)
    )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=transformers.TrainingArguments(
            output_dir='./results',
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=100,
            num_train_epochs=num_epochs,
            learning_rate=learning_rate,
            logging_steps=500,
            evaluation_strategy="steps",
            save_strategy="steps",
            # eval_steps and save_steps are kept equal so that each saved
            # checkpoint has a matching evaluation for load_best_model_at_end.
            eval_steps=1500,
            save_steps=1500,
            save_total_limit=3,
            load_best_model_at_end=True,
        ),
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
        ),
    )
    model.config.use_cache = False
    trainer.train()

    # Persist and publish the tokenizer together with the model: it carries the
    # tokens added above, so the base tokenizer would encode text differently.
    model.save_pretrained("./results")
    tokenizer.save_pretrained("./results")
    model.push_to_hub('htriedman/mpt-7b-instruct-wiki-sparql')
    tokenizer.push_to_hub('htriedman/mpt-7b-instruct-wiki-sparql')

# Train!

In [None]:
# Fine-tune on 150,000 training and 15,000 validation examples, overriding
# train()'s defaults of 100,000 and 10,000; all other arguments use defaults.
train(train_data_size=150_000, val_data_size=15_000)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)configuration_mpt.py:   0%|          | 0.00/9.20k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- configuration_mpt.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modeling_mpt.py:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

Downloading (…)solve/main/blocks.py:   0%|          | 0.00/2.55k [00:00<?, ?B/s]

Downloading (…)resolve/main/norm.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- norm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)ve/main/attention.py:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading (…)flash_attn_triton.py:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- attention.py
- flash_attn_triton.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- blocks.py
- norm.py
- attention.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)/custom_embedding.py:   0%|          | 0.00/305 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- custom_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)meta_init_context.py:   0%|          | 0.00/3.64k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- meta_init_context.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)refixlm_converter.py:   0%|          | 0.00/27.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- hf_prefixlm_converter.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)n/adapt_tokenizer.py:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- adapt_tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)in/param_init_fns.py:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- param_init_fns.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mosaicml/mpt-7b-instruct:
- modeling_mpt.py
- blocks.py
- custom_embedding.py
- meta_init_context.py
- hf_prefixlm_converter.py
- adapt_tokenizer.py
- param_init_fns.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)model.bin.index.json:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Instantiating an MPTForCausalLM model from /root/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b-instruct/925e0d80e50e77aaddaf9c3ced41ca4ea23a1025/modeling_mpt.py
You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]



Downloading readme:   0%|          | 0.00/423 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/50.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.18M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/240000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6474 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15103 [00:00<?, ? examples/s]

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
