In [2]:
import transformers
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_name = 'TheBloke/Mistral-7B-Instruct-v0.2-GPTQ'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',  # automatically figures out how to best use CPU + GPU for loading model
    trust_remote_code=False,  # prevents running custom model files on your machine
    revision='main',
)  # which version of model to use in repo


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}'
    )


print_trainable_parameters(model)

trainable params: 262410240 || all params: 262410240 || trainable%: 100.0


In [104]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [105]:
model.eval()  # model in evaluation mode (dropout modules are deactivated)

# craft prompt
comment = 'Great content, thank you!'
prompt = f"""[INST] {comment} [/INST]"""

# tokenize input
inputs = tokenizer(prompt, return_tensors='pt')

# generate output
outputs = model.generate(input_ids=inputs['input_ids'].to('cuda'), max_new_tokens=140)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] Great content, thank you! [/INST] I'm glad you found the content helpful! If you have any specific questions or topics you'd like me to cover in the future, feel free to ask. I'm here to help.

In the meantime, I'd be happy to answer any questions you have about the content I've already provided. Just let me know which article or blog post you're referring to, and I'll do my best to provide you with accurate and up-to-date information.

Thanks for reading, and I look forward to helping you with any questions you may have!</s>


In [106]:
model.train()  # model in training mode (dropout modules are activated)

# enable gradient check pointing
model.gradient_checkpointing_enable()

# enable quantized training
model = prepare_model_for_kbit_training(model)

In [5]:
# LoRA config
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=['q_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM',
)

# LoRA trainable version of model
model = get_peft_model(model, config)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f'trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}'
    )


print_trainable_parameters(model)

trainable params: 2097152 || all params: 264507392 || trainable%: 0.7928519441906561


In [108]:
# load dataset
data = load_dataset('shawhin/shawgpt-youtube-comments')

In [109]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples['example']

    # tokenize and truncate text
    tokenizer.truncation_side = 'left'
    tokenized_inputs = tokenizer(text, return_tensors='np', truncation=True, max_length=512)

    return tokenized_inputs


# tokenize training and validation datasets
tokenized_data = data.map(tokenize_function, batched=True)

Map: 100%|██████████| 50/50 [00:00<00:00, 8537.50 examples/s]
Map: 100%|██████████| 9/9 [00:00<00:00, 2531.43 examples/s]


In [110]:
tokenized_data['train']

Dataset({
    features: ['example', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [72]:
tokenizer.pad_token = tokenizer.eos_token

In [111]:
# setting pad token
tokenizer.pad_token = tokenizer.eos_token
# data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [6]:
# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 10

# define training arguments
training_args = transformers.TrainingArguments(
    output_dir='shawgpt-ft',
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim='paged_adamw_8bit',
)

In [7]:
# configure trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    args=training_args,
    data_collator=data_collator,
)

# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# renable warnings
model.config.use_cache = True

NameError: name 'tokenized_data' is not defined

In [56]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=lr)

In [57]:
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (rotary_emb): MistralRotaryEmbedding()
              (k_proj): QuantLinear()
              (o_proj): QuantLinear()
              (q_proj): lora.QuantLinear(
                (base_layer): QuantLinear()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): Param

In [83]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_data['train'],
    shuffle=True,
    batch_size=4,
    collate_fn=data_collator,
)

val_dataloder = DataLoader(
    tokenized_data['test'],
    batch_size=4,
    collate_fn=data_collator,
)

In [84]:
for batch in train_dataloader:
    break
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([4, 209]), 'attention_mask': torch.Size([4, 209]), 'labels': torch.Size([4, 209])}


In [89]:
from transformers import get_scheduler

num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [90]:
from auto_gptq import exllama_set_max_input_length

model = exllama_set_max_input_length(model, max_input_length=19750)

progress = 1

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        print(f'{progress}/{num_training_steps}. Loss: {loss}')
        progress += 1



1/130. Loss: 4.336075305938721
2/130. Loss: 4.097919940948486
3/130. Loss: 4.206480979919434
4/130. Loss: 4.165226459503174
5/130. Loss: 4.2231526374816895
6/130. Loss: 4.438895225524902
7/130. Loss: 4.41405725479126
8/130. Loss: 3.7434208393096924
9/130. Loss: 4.538475513458252
10/130. Loss: 4.301823139190674
11/130. Loss: 4.640456676483154
12/130. Loss: 4.439744472503662
13/130. Loss: 4.172786235809326
14/130. Loss: 4.447360992431641
15/130. Loss: 4.042270183563232
16/130. Loss: 4.320154666900635
17/130. Loss: 4.1808905601501465
18/130. Loss: 4.14156436920166
19/130. Loss: 4.365614891052246
20/130. Loss: 4.297572612762451
21/130. Loss: 4.400135040283203
22/130. Loss: 4.1769819259643555
23/130. Loss: 4.057359218597412
24/130. Loss: 4.255882263183594
25/130. Loss: 4.404635429382324
26/130. Loss: 4.550881385803223
27/130. Loss: 4.52702522277832
28/130. Loss: 4.540689468383789
29/130. Loss: 4.124856948852539
30/130. Loss: 4.315665245056152
31/130. Loss: 4.03415584564209
32/130. Loss: 4.2

In [87]:
model.save_pretrained('finetune-model')



In [92]:
model.eval()  # model in evaluation mode (dropout modules are deactivated)

# craft prompt
comment = 'What is fat-tailedness?'
prompt = f"""[INST] {comment} [/INST]"""

# tokenize input
inputs = tokenizer(prompt, return_tensors='pt')

# generate output
outputs = model.generate(input_ids=inputs['input_ids'].to('cuda'), max_new_tokens=50)

print(tokenizer.batch_decode(outputs)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] What is fat-tailedness? [/INST] Fat-tailedness is a statistical property of certain distributions where the tails of the distribution are heavier or fatter than what would be expected from a normal distribution. In other words, the probability of observing extreme values is higher than what
