In [1]:
from datasets import load_dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


#### Load Mistral 7B

In [2]:
device = "cuda"
model_name = "mistralai/Mistral-7B-v0.1"

In [3]:
original_model = AutoModelForCausalLM.from_pretrained(model_name) # .to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Convert the model to 16-bit precision
# if torch.cuda.is_available():
#     original_model = original_model.half()

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.60s/it]


In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 7241732096
all model parameters: 7241732096
percentage of trainable model parameters: 100.00%


#### Load Unnatural Instruction Dataset

In [5]:
huggingface_dataset_name = "mrm8488/unnatural-instructions-core"

dataset = load_dataset(huggingface_dataset_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 66010
    })
})

In [6]:
# Split the dataset into train, test, and validation sets
train_test_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)  # 80% train, 20% for test and validation
test_val_dataset = train_test_dataset['test'].train_test_split(test_size=0.5, seed=42)  # Split the 20% equally into test and validation

# Create a new DatasetDict
dataset_dict = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_val_dataset['train'],
    'validation': test_val_dataset['test']
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 52808
    })
    test: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 6601
    })
    validation: Dataset({
        features: ['instruction', 'instances'],
        num_rows: 6601
    })
})

In [7]:
index = 200

print(f"instructions: {dataset_dict['test'][index]['instruction']}")
print(f"instances: {dataset_dict['test'][index]['instances']}")

instructions: Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.
instances: [{'instruction_with_input': "Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].", 'input': "Text: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].", 'constraints': "The output should be one of two strings - either 'True' or 'False'.", 'output': 'True'}]


In [8]:
def get_prompt_output(row):
    inst_dict_list = row['instances']
    # Ignore input for now "input: {inst_dict['input']},\n"
    input_text_list = [f"instruction_with_input: {inst_dict['instruction_with_input']},\nconstraints: {inst_dict['constraints']}"f"input: {inst_dict['input']},\ninstruction_with_input: {inst_dict['instruction_with_input']},\nconstraints: {inst_dict['constraints']}"
                  for inst_dict in inst_dict_list]
    input_text = "\n".join(input_text_list)
    prompt = f"<s>[INST] {input_text} [/INST]"

    
    output_list = [inst_dict['output'] for inst_dict in inst_dict_list]
    output = "\n".join(output_list)

    return prompt, output

get_prompt_output(dataset_dict['test'][index])

("<s>[INST] instruction_with_input: Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].,\nconstraints: The output should be one of two strings - either 'True' or 'False'.input: Text: The quick brown fox jumps over the lazy dog.,Keywords: ['fox', 'dog', 'brown'].,\ninstruction_with_input: Given a piece of text and a list of keywords, you need to figure out if the text contains all the given keywords at least once or not. If it does contain all of them, output 'True'. Otherwise, output 'False'. Note that keyword case is not important here i.e. whether the keyword is in upper case or lower case does not matter.\nText: The quick brown

### Perform Full Fine-Tuning

#### Preprocess Dataset

In [9]:
def tokenize_function(example):
    raw_p, out = get_prompt_output(dataset_dict['test'][index])
    prompt, output = [raw_p], [out]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")['input_ids'] # .to(device)
    example['labels'] = tokenizer(output, padding="max_length", truncation=True, return_tensors="pt")['input_ids']
    return example

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, remove_columns=['instruction', 'instances'])

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 53
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 7
    })
})

In [10]:
# tokenized_datasets = tokenized_datasets.filter(lamb/da example, index: index % 100 == 0, with_indices=True)

In [11]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (53, 2)
Validation: (7, 2)
Test: (7, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 53
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 7
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 7
    })
})


In [12]:
tokenized_datasets['train'][0]

{'input_ids': [1,
  1,
  733,
  16289,
  28793,
  13126,
  28730,
  3415,
  28730,
  2537,
  28747,
  12628,
  264,
  5511,
  302,
  2245,
  304,
  264,
  1274,
  302,
  28049,
  28725,
  368,
  927,
  298,
  5248,
  575,
  513,
  272,
  2245,
  5876,
  544,
  272,
  2078,
  28049,
  438,
  2429,
  2327,
  442,
  459,
  28723,
  1047,
  378,
  1235,
  7001,
  544,
  302,
  706,
  28725,
  3825,
  464,
  4365,
  4135,
  15510,
  28725,
  3825,
  464,
  6995,
  4135,
  6096,
  369,
  23247,
  1222,
  349,
  459,
  2278,
  1236,
  613,
  28723,
  28706,
  28723,
  3161,
  272,
  23247,
  349,
  297,
  7280,
  1222,
  442,
  3889,
  1222,
  1235,
  459,
  3209,
  28723,
  13,
  1874,
  28747,
  415,
  2936,
  9060,
  285,
  1142,
  461,
  10575,
  754,
  272,
  17898,
  3914,
  2063,
  2064,
  10366,
  28747,
  5936,
  23312,
  647,
  464,
  17693,
  647,
  464,
  28726,
  3329,
  1421,
  2063,
  13,
  514,
  17005,
  28747,
  415,
  3825,
  1023,
  347,
  624,
  302,
  989,
  11272,
  387

### Fin-Tune the Model with Preprocessed Dataset

In [25]:
output_dir = f'./model/dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    per_device_train_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
    fp16=True,
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [27]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 74.00 MiB. GPU 0 has a total capacty of 24.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 53.65 GiB is allocated by PyTorch, and 647.54 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

#### Peft