# (QLora) Fine-tuning Mistral-7b on NLI Tasks

Hyper-parameters - batch_size up, lora_rank down, lr up

- lora_rank=4
- lora_alpha=32
- train_data=50
- validation_data=2000
- epoch=10
- lr=2e-4
- decay=0.01
- batch_size=32
- max_len=1024


In [1]:
# Resolve the project root without manual (un)commenting:
#   - on Google Colab, mount Google Drive and use the course folder on it;
#   - anywhere else (RunPod, local), fall back to paths relative to the
#     current working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside a Colab runtime.
    root = ''
else:
    drive.mount('/content/drive')
    root = '/content/drive/My Drive/Colab Notebooks/COMP34812/'

Mounted at /content/drive


In [12]:
# Hyper-parameters for this run, collected in a single dictionary so that
# later cells can look them up by key.
config = dict(
    lora_rank=16,    # LoRA rank (passed as `r`)
    lora_alpha=64,   # LoRA scaling factor
    epochs=11,       # number of training epochs
    lr=3e-4,         # learning rate
    decay=0.01,      # weight decay
    batch_size=4,    # per-device training batch size
    max_len=512,     # maximum number of tokens kept per prompt
)

In [3]:
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes

Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from auto-gptq)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.1.1-py3-none-any.whl (13.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
Collecting

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import transformers

### Load model

In [5]:
# Checkpoint to fine-tune (GPTQ-quantised Mistral-7B).
model_name = "TheBloke/Mistral-7B-v0.1-GPTQ"

# Loading options, spelled out separately from the call for readability.
load_kwargs = dict(
    device_map="auto",        # let the library decide how to spread the model over CPU + GPU
    trust_remote_code=False,  # do not run custom model files from the repo on this machine
    revision="main",          # which version of the model repo to use
)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

### Load tokenizer

In [6]:
# Load the tokenizer published under the same repo id as the model (model_name);
# use_fast=True asks for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/962 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

### Prepare Model for Training

In [7]:
# Get the loaded model ready for adapter training. The three steps below run
# in this order on the same `model` object.
model.train() # model in training mode (dropout modules are activated)

# enable gradient check pointing
model.gradient_checkpointing_enable()

# enable quantized training
# (the helper returns the prepared model, so `model` is rebound to its result)
model = prepare_model_for_kbit_training(model)

In [8]:
# LoRA config: rank and alpha come from the shared `config` dictionary; the
# adapters are attached to the listed projection modules and to lm_head.
lora_config = LoraConfig(
    r=config['lora_rank'],
    lora_alpha=config['lora_alpha'],
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
        ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# LoRA trainable version of model.
# Wrap the model exactly once: calling get_peft_model a second time on the
# already-wrapped model nests one PEFT wrapper inside another.
# NOTE: re-running this cell wraps the model again; reload the base model first.
model = get_peft_model(model, lora_config)

# trainable parameter count
model.print_trainable_parameters()

trainable params: 42,520,576 || all params: 304,930,816 || trainable%: 13.944335491497192
trainable params: 42,520,576 || all params: 304,930,816 || trainable%: 13.944335491497192


### Preparing Training Dataset

In [17]:
from datasets import Dataset
import pandas as pd


def _read_prompt_csv(path):
    """Read a prompt CSV with the column names forced to ['id', 'prompt'].

    The first row of the parsed frame is dropped (presumably the file's own
    header line, which is parsed as data once `names` is given -- confirm
    against the CSV).
    """
    frame = pd.read_csv(path, names=['id', 'prompt'])
    return frame.iloc[1:]


# Training and validation prompts as pandas frames ...
train_df = _read_prompt_csv(root + 'data/training_data/ft_train.csv')
dev_df = _read_prompt_csv(root + 'data/training_data/ft_dev.csv')

# ... and as `datasets.Dataset` objects for tokenisation / training.
train_data, dev_data = Dataset.from_pandas(train_df), Dataset.from_pandas(dev_df)

dev_df

Unnamed: 0,id,prompt
1,3898.0,[INST] You are an expert in natural language r...
2,13664.0,[INST] You are an expert in natural language r...
3,12191.0,[INST] You are an expert in natural language r...
4,5755.0,[INST] You are an expert in natural language r...
5,5533.0,[INST] You are an expert in natural language r...
...,...,...
3102,5437.0,[INST] You are an expert in natural language r...
3103,5722.0,[INST] You are an expert in natural language r...
3104,3901.0,[INST] You are an expert in natural language r...
3105,4270.0,[INST] You are an expert in natural language r...


In [18]:
def tokenize_function(examples):
    """Tokenize the "prompt" column of a batch of examples.

    Prompts longer than config['max_len'] tokens are truncated; the tokenizer
    is switched to left-side truncation before encoding. The encoded batch is
    requested as NumPy data and returned unchanged.
    """
    tokenizer.truncation_side = "left"
    return tokenizer(
        examples["prompt"],
        max_length=config['max_len'],
        truncation=True,
        return_tensors="np",
    )


# Apply the tokenizer batch-wise to the training split, then the validation split.
tokenized_train_data, tokenized_dev_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, dev_data)
)

Map:   0%|          | 0/12424 [00:00<?, ? examples/s]

Map:   0%|          | 0/3106 [00:00<?, ? examples/s]

In [19]:
# setting pad token
# The end-of-sequence token is reused as the padding token.
# NOTE(review): if the collator masks pad positions out of the labels, EOS
# positions would be masked too because both share one id -- confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token
# data collator
# mlm=False: batches are built for causal language modelling, not masked-LM.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)


### Fine-tuning Model

In [29]:
# hyperparameters -- all read from the shared `config` dictionary so that a
# change there is actually picked up by the run.
lr = config['lr']
batch_size = config['batch_size']
num_epochs = config['epochs']
weight_decay = config['decay']

# define training arguments
training_args = transformers.TrainingArguments(
    output_dir="mistral-7b-nli_cot",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    num_train_epochs=num_epochs,
    weight_decay=weight_decay,  # previously a literal 0.01 that ignored config['decay']
    # log, evaluate and checkpoint once per epoch; the save and evaluation
    # strategies match, as load_best_model_at_end needs them to.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # gradients are accumulated over 4 steps, so the effective per-device
    # batch is 4 * batch_size
    gradient_accumulation_steps=4,
    warmup_steps=2,
)

In [30]:
# configure trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_dev_data,
    args=training_args,
    data_collator=data_collator
)

# train model
# The generation cache is switched off for training (this silences the
# warnings) and must be switched back on for inference. try/finally
# guarantees the flag is restored even when training raises.
model.config.use_cache = False
try:
    trainer.train()
finally:
    # re-enable the cache
    model.config.use_cache = True



ValueError: Attempting to unscale FP16 gradients.

In [None]:
import matplotlib.pyplot as plt

# Per-step log records collected by the trainer; training entries carry a
# 'loss' key, evaluation entries an 'eval_loss' key.
history = trainer.state.log_history
epochs = []            # epochs at which a training loss was logged
training_loss = []
eval_epochs = []       # epochs at which a validation loss was logged
validation_loss = []

for entry in history:
    if 'loss' in entry:
        epochs.append(entry['epoch'])
        training_loss.append(entry['loss'])
    if 'eval_loss' in entry:
        # Track the x-values of the validation curve separately: the two
        # series are not guaranteed to have the same number of points, and
        # plotting mismatched lengths raises an error.
        eval_epochs.append(entry['epoch'])
        validation_loss.append(entry['eval_loss'])

# Plotting the training and validation loss
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epochs, training_loss, label='Training Loss')
ax.plot(eval_epochs, validation_loss, label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training vs Validation Loss')
ax.legend()
ax.grid(True)
plt.show()

### Push model to hub

In [None]:
!pip install huggingface_hub
from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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') # WRITE TOKEN



In [None]:
# Hub namespace (your hf username or org name) and the repo the model is pushed to.
hf_name = 'jd0g'
model_id = f"{hf_name}/Mistral-7B-NLI-v0.1"

In [None]:
# Upload the adapter model to the target repo.
model.push_to_hub(model_id)

# Trainer.push_to_hub takes a commit message as its first positional
# argument, not a repo id, so passing model_id there only changed the commit
# text. The destination repo is configured through the training arguments.
trainer.args.hub_model_id = model_id
trainer.push_to_hub(commit_message="End of training")

### Load Fine-tuned Model

In [None]:
# Reload the quantised base checkpoint from the hub; the fine-tuned adapter
# is attached to `base_model` in a later cell.
from transformers import AutoModelForCausalLM

model_name = "TheBloke/Mistral-7B-v0.1-GPTQ"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    revision="main",
    trust_remote_code=False,
    device_map="auto",
)

# Matching tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)



### Use Fine-tuned Model

In [None]:
# System-style instructions (task description, required JSON answer format and
# one worked example) that are prepended to every task.
instructions_string = """ You are an expert in natural language reasoning and inference. Your task is to analyze pairs of sentences and determine if the second sentence (hypothesis) can be logically inferred from the first sentence (premise). For each example, I will provide the premise and hypothesis. Your response should be in the following JSON format:
{
  "thought_process":
    "Step 1. <Identify key information and relationships in the premise, considering logical connections, commonsense understanding, and factual consistency>.
    Step 2. <Analyze how the hypothesis relates to or contradicts the premise based on the information identified in Step 1. Evaluate if the hypothesis can be reasonably inferred from the premise>.
    Step 3. <Explain your final reasoning and conclusion on whether the hypothesis is entailed by the premise or not>",
  "label": "<0 for no entailment, 1 for entailment>"
}
Please provide a clear multi-step reasoning chain explaining how you arrived at your final answer, breaking it down into logical components. Ground your response in the given information, logical principles and common-sense reasoning.

Example:

Premise: The dog chased the cat up the tree. Hypothesis: The cat climbed the tree. Label:

{
    "thought_process": "
        Step 1: the premise indicates a scenario where a dog chases a cat, resulting in the cat moving up a tree. The movement 'up the tree' suggests a vertical ascent, typical of climbing behavior. It is common sense that a cat would climb a tree to escape a chasing dog, and there are no known facts that contradict the premise or hypothesis.
        Step 2: 'The cat climbed the tree' can be logically inferred from the premise because the action of climbing is a reasonable and necessary part of the cat moving 'up the tree' as described. Thus, the hypothesis logically follows from the premise.
        Step 3: Based on the logical reasoning, common sense, and lack of contradictory facts, the hypothesis can be inferred from the premise.
        ",
    "label": 1
}
"""


def prompt_template(task):
    """Wrap `task` in the [INST] ... [/INST] markers, preceded by the shared instructions."""
    return f'''[INST] {instructions_string} \n{task} \n[/INST]'''


# Build and show one example prompt.
task = "Premise: 'I can't believe you told him that!' she exclaimed. Hypothesis: A secret was shared."
prompt = prompt_template(task)
print(prompt)

In [None]:
from peft import PeftModel, PeftConfig

def set_model(model_name="mistral-7b-nli_cot_qkv"):
    """Attach the fine-tuned adapter `<hf_name>/<model_name>` to `base_model`.

    Args:
        model_name: repo name (without the namespace) of the adapter on the hub.

    Returns:
        The PEFT-wrapped model, switched to evaluation mode.
    """
    model_id = hf_name + "/" + model_name
    model = PeftModel.from_pretrained(base_model, model_id)
    model.eval()
    return model

def generate_text(prompt, input_id=None):
    """Generate up to 512 new tokens for `prompt`, print and return the result.

    Args:
        prompt: the full prompt string to feed to the model.
        input_id: optional label shown in the printed header.

    Returns:
        The decoded output sequence, as produced by `tokenizer.batch_decode`
        on the generated ids (the same text that is printed).
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        input_ids=inputs["input_ids"].to("cuda"),
        # pass the mask explicitly instead of leaving generate() to infer it
        attention_mask=inputs["attention_mask"].to("cuda"),
        max_new_tokens=512
    )
    completion = tokenizer.batch_decode(outputs)[0]
    # The header used to interpolate the builtin `id` function by mistake.
    header = 'input' if input_id is None else f'input {input_id}'
    print(f'********** {header} **********')
    print('Completion: ', completion)
    # Return the text so callers (e.g. Series.apply) can store it.
    return completion

model = set_model()
completion = generate_text(prompt)

In [None]:
!pip install pandas
import pandas as pd

dropped = pd.read_csv(root + 'data/training_data/dropped_data.csv')
dropped

dropped['prompts'] = dropped.iloc[:10].apply(lambda x: prompt_template(f'''Premise: {x['premise']} Hypothesis: {x['hypothesis']} Label: '''), axis=1)
dropped['completions'] = dropped['prompts'].apply(generate_text)