In [1]:
from datasets import load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, BitsAndBytesConfig
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


#### Load Mistral 7B

In [2]:
# device = "cuda"
model_name = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [26]:
original_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config) # .to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512, padding_side="left", add_eos_token=True)

tokenizer.pad_token = tokenizer.eos_token

# # Convert the model to 16-bit precision
# if torch.cuda.is_available():
#     original_model = original_model.half()

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.41s/it]


In [27]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


#### Load Unnatural Instruction Dataset

In [5]:
huggingface_dataset_name = "mrm8488/unnatural-instructions-core"

dataset = load_dataset(huggingface_dataset_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 66010
    })
})

In [6]:
# Split the dataset into train, test, and validation sets
train_test_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)  # 80% train, 20% for test and validation
test_val_dataset = train_test_dataset['test'].train_test_split(test_size=0.5, seed=42)  # Split the 20% equally into test and validation

# Create a new DatasetDict
dataset_dict = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_val_dataset['train'],
    'validation': test_val_dataset['test']
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 52808
    })
    test: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 6601
    })
    validation: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 6601
    })
})

In [7]:
index = 200

print(f"instructions: {dataset_dict['test'][index]['instruction']}")
print(f"instances: {dataset_dict['test'][index]['instances']}")

instructions: Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.
instances: [{'instruction_with_input': "Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].", 'input': "Text: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].", 'constraints': "The output should be one of two strings - either 'True' or 'False'.", 'output': 'True'}]


In [17]:
# def get_prompt_output(row):
#     # print(row)
#     inst_dict_list = row['instances']
#     print(inst_dict_list, type(inst_dict_list))
#     # Ignore input for now "input: {inst_dict['input']},\n"
#     input_text_list = [f"instruction_with_input: {inst_dict['instruction_with_input']},\nconstraints: {inst_dict['constraints']}"f"input: {inst_dict['input']},\ninstruction_with_input: {inst_dict['instruction_with_input']},\nconstraints: {inst_dict['constraints']}"
#                   for inst_dict in inst_dict_list]
#     input_text = "\n".join(input_text_list)
#     prompt = f"<s>[INST] {input_text} [/INST]"

    
#     output_list = [inst_dict['output'] for inst_dict in inst_dict_list]
#     output = "\n".join(output_list)

#     return prompt, output

# get_prompt_output(dataset_dict['test'][index])

{'instruction': "Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.", 'instances': [{'instruction_with_input': "Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].", 'input': "Text: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].", 'constraints': "The output should be one of two strings - either 'True' or 'False'.", 'output': 'T

("<s>[INST] instruction_with_input: Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].,\nconstraints: The output should be one of two strings - either 'True' or 'False'.input: Text: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].,\ninstruction_with_input: Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown

### Perform Full Fine-Tuning

#### Preprocess Dataset

In [33]:
def get_prompt(d_li):
    p_li  = []
    for d in d_li:
        instruction_with_input = f"instruction_with_input: {d['instruction_with_input']}"
        constraints = f"constraints: {d['constraints']}"
        p_li.append(f"<s>[INST] {instruction_with_input},\n{constraints} [/INST]")
    return "\n".join(p_li)
    # instruction_with_input = f"instruction_with_input: {d['instruction_with_input']}"
    # constraints = f"constraints: {d['constraints']}"
    # return f"<s>[INST] {instruction_with_input},\n{constraints} [/INST]"

def get_output(d_li):
    return "\n".join([d['output'] for d in d_li])

def tokenize_function(example):
    prompts = [get_prompt(per_inst_d_li) for per_inst_d_li in example['instances']]
    outputs = [get_output(per_inst_d_li) for per_inst_d_li in example['instances']]
    # raw_p, out = get_prompt_output(example)
    # prompt, output = raw_p, out
    example['input_ids'] = tokenizer(prompts, max_length=512, padding="max_length", truncation=True, return_tensors="pt").input_ids # .to(device)
    example['labels'] = tokenizer(outputs, max_length=512, padding="max_length", truncation=True, return_tensors="pt").input_ids
    return example

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, remove_columns=['instruction', 'instances']) # 

tokenized_datasets

Map: 100%|██████████| 52808/52808 [00:12<00:00, 4317.79 examples/s]
Map: 100%|██████████| 6601/6601 [00:01<00:00, 4029.08 examples/s]
Map: 100%|██████████| 6601/6601 [00:01<00:00, 4244.81 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 52808
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 6601
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 6601
    })
})

In [34]:
# # To save some time in the lab, you will subsample the dataset:
# tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [35]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (52808, 2)
Validation: (6601, 2)
Test: (6601, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 52808
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 6601
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 6601
    })
})


In [37]:
print(len(tokenized_datasets['train'][4]['input_ids']))

512


In [40]:
print(tokenized_datasets['train'][0]['input_ids'])
print(tokenized_datasets['train'][0]['labels'])

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

### Fin-Tune the Model with Preprocessed Dataset

In [25]:
output_dir = f'./model/dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
    fp16=True,
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [41]:
# trainer.train() ### OOM

### Peft

#### Set up LoRA

In [47]:
from peft import prepare_model_for_kbit_training

original_model.enable_input_require_grads()
original_model.gradient_checkpointing_enable()
original_model = prepare_model_for_kbit_training(original_model)

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 0
all model parameters: 3752071168
percentage of trainable model parameters: 0.00%


In [48]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8, # Rank
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type='CAUSAL_LM'
)

In [49]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 21260288
all model parameters: 3773331456
percentage of trainable model parameters: 0.56%


In [50]:
output_dir = f'./peft-causal-lm-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [51]:
peft_trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mirislin1006[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` 

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,12.8687


TrainOutput(global_step=1, training_loss=12.868653297424316, metrics={'train_runtime': 14.4526, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.069, 'total_flos': 175274075357184.0, 'train_loss': 12.868653297424316, 'epoch': 0.0})

In [52]:
peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



('./peft-causal-lm-checkpoint-local/tokenizer_config.json',
 './peft-causal-lm-checkpoint-local/special_tokens_map.json',
 './peft-causal-lm-checkpoint-local/tokenizer.model',
 './peft-causal-lm-checkpoint-local/added_tokens.json',
 './peft-causal-lm-checkpoint-local/tokenizer.json')

In [58]:
test = dataset_dict['test'][0:2]['instances']
[get_prompt(per_inst_d_li) for per_inst_d_li in test]

["<s>[INST] instruction_with_input: You are given a review and your task is to determine its sentiment. A positive review has a positive sentiment while a negative review has a negative sentiment.\nThe plot was convoluted and hard to follow.,\nconstraints: The output should be one of the two strings 'positive' or 'negative'. [/INST]",
 "<s>[INST] instruction_with_input: You are given a date. Your task is to determine whether the date follows the specified format or not.\n12/05/2015.,\nconstraints: The correct format for this particular date would be DD/MM/YYYY, where DD represents day, MM month and YYYY year. Therefore, 12/05/2015 satisfies this condition and hence the output should be 'True'. [/INST]"]

In [None]:
# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# ft_model.eval()
# with torch.no_grad():
#     print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))


In [62]:
eval_instances = dataset_dict['test'][0:10]['instances']
prompts = [get_prompt(per_inst_d_li) for per_inst_d_li in eval_instances]
outputs = [get_output(per_inst_d_li) for per_inst_d_li in eval_instances]
#print(prompts)
print(len(prompts), len(outputs), outputs[0:2])


original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, prompt in enumerate(prompts):
#     prompt = f"""
# Summarize the following conversation.

# {dialogue}

# Summary: """
    
    model_input = tokenizer(prompt, return_tensors="pt")

    original_model_outputs = original_model.generate(**model_input, max_new_tokens=100)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(**model_input, max_new_tokens=100)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(original_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['original_model_summaries', 'peft_model_summaries'])
df

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


10 10 ['negative', 'True']


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token

Unnamed: 0,original_model_summaries,peft_model_summaries
0,[INST] instruction_with_input: You are given a...,[INST] instruction_with_input: You are given a...
1,[INST] instruction_with_input: You are given a...,[INST] instruction_with_input: You are given a...
2,"[INST] instruction_with_input: In this task, y...","[INST] instruction_with_input: In this task, y..."
3,"[INST] instruction_with_input: In this task, y...","[INST] instruction_with_input: In this task, y..."
4,[INST] instruction_with_input: You are given a...,[INST] instruction_with_input: You are given a...
5,[INST] instruction_with_input: You will be giv...,[INST] instruction_with_input: You will be giv...
6,[INST] instruction_with_input: You are given a...,[INST] instruction_with_input: You are given a...
7,[INST] instruction_with_input: Given two words...,[INST] instruction_with_input: Given two words...
8,"[INST] instruction_with_input: In this task, y...","[INST] instruction_with_input: In this task, y..."
9,[INST] instruction_with_input: Given a descrip...,[INST] instruction_with_input: Given a descrip...


In [66]:
print(df['original_model_summaries'].iloc[0])

[INST] instruction_with_input: You are given a review and your task is to determine its sentiment. A positive review has a positive sentiment while a negative review has a negative sentiment.
The plot was convoluted and hard to follow.,
constraints: The output should be one of the two strings 'positive' or 'negative'. [/INST]

[SOL]

def sentiment(review):
    if review.lower().count('positive') > review.lower().count('negative'):
        return 'positive'
    else:
        return 'negative'

[/SOL]


In [67]:
print(df['peft_model_summaries'].iloc[0])

[INST] instruction_with_input: You are given a review and your task is to determine its sentiment. A positive review has a positive sentiment while a negative review has a negative sentiment.
The plot was convoluted and hard to follow.,
constraints: The output should be one of the two strings 'positive' or 'negative'. [/INST]

[SOL]

def sentiment(review):
    if review.lower().count('positive') > review.lower().count('negative'):
        return 'positive'
    else:
        return 'negative'

[/SOL]
