# Get Dependencies

In [2]:
import os
import torch
from datasets import load_dataset
import bitsandbytes as bnb
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, logging, Trainer, DataCollatorForSeq2Seq, pipeline
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import Accelerator, PartialState
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


# Get model and prepare for finetuning

In [3]:
################################################################################
# Model label
################################################################################

# Pre-trained base model that is fine-tuned in this notebook
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Name under which the fine-tuned model is stored
new_model = "Schema-Links-mistral-7B-v0.1"

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so that no secret is stored inside the notebook. If the variable is not set,
# this is None and the Hugging Face libraries fall back to the locally cached
# login (e.g. from `huggingface-cli login`).
token = os.environ.get("HF_TOKEN")

# Map the entire model ("" = root module) onto the device whose index equals
# the index of the current process
device_map = {"" : PartialState().process_index}

# Trust remote code for loading model
trust_remote_code = True

# Whether the model's cache should be used (passed on to from_pretrained)
use_cache = False

################################################################################
# bitsandbytes parameters
################################################################################

# Load the base model in 4-bit precision
load_in_4bit = True

# 4-bit quantization data type: either "fp4" or "nf4"
bnb_4bit_quant_type = "nf4"

# Nested (double) quantization for the 4-bit base model; switched off here
bnb_4bit_use_double_quant = False

# dtype in which the computations are carried out
bnb_4bit_compute_dtype = torch.bfloat16

In [4]:
# Configuring the BitsAndBytes quantization for the model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype
)

# Initialize tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_name, token = token)
# The tokenizer has no pad token of its own. Reuse the existing <unk> token for
# padding: a new string such as "<PAD>" is not part of the vocabulary and would
# need to be added (plus an embedding resize) to get an id of its own.
# Padding with <unk> instead of EOS also keeps the EOS token distinguishable
# from padding when labels are masked.
tokenizer.pad_token = tokenizer.unk_token
# Pad on the right-hand side of each sequence
tokenizer.padding_side = "right"

# Load the pre-trained model for causal language modeling, quantized according
# to bnb_config and placed according to device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token = token,
    device_map = device_map,
    trust_remote_code = trust_remote_code,
    quantization_config = bnb_config,
    use_cache = use_cache,
)


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
################################################################################
# QLoRA parameters
################################################################################

# Rank r of the LoRA adapters (an earlier run used 8)
lora_r = 32

# LoRA scaling parameter alpha
lora_alpha = 64

# Rank-Stabilized LoRA: the adapter scaling factor becomes lora_alpha/math.sqrt(r)
use_rslora = True

# Dropout probability inside the LoRA layers
lora_dropout = 0.15

# Bias setting handed to LoraConfig
bias = "none"

# Modules that receive LoRA adapters
target_modules = "all-linear"

# Task type (Causal Language Modeling)
task_type = "CAUSAL_LM"

In [None]:
# Configuring the LoraConfig for the model
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    task_type=task_type,
    target_modules=target_modules,
    use_rslora=use_rslora
)

# Prepare the quantized base model for training. According to the PEFT
# documentation this freezes the base weights, casts the layer norms to fp32
# and makes the inputs require gradients. It also switches on gradient
# checkpointing (activations are recomputed in the backward pass, which lowers
# VRAM usage), so no separate model.gradient_checkpointing_enable() call is
# needed. use_reentrant=False matches the setting used in TrainingArguments.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# Wrap the base model with the LoRA adapters described by peft_config
model = get_peft_model(model, peft_config)

# Print the number of trainable parameters (only available on a PEFT model)
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.18788152850204565


# Prepare Dataset



The same prompt that was used to chunk the schemas is reused here to prepare the dataset.

In [6]:
def format_instruction_schema_links(sample):
    """Render one dataset row into the training prompt.

    Reads the "question", "schema", "hint" and "schema_linking" fields of
    `sample`, builds the instruction prompt followed by the expected schema
    link, appends the EOS token of the global `tokenizer`, and stores the
    result under `sample["text"]`.

    Returns the same `sample` mapping, now containing the "text" field.
    """
    prompt = (
        "<s>[INST]\n"
        "You are a powerful text-to-SQL model. Generate a schema link for constructing a SQL query that answers the given question, using the information provided below. Select the most relevant columns from the schema.\n"
        "### Question:\n"
        f"{sample['question']}\n"
        f"### Schema:{sample['schema']}### Hint:\n"
        f"{sample['hint']}\n"
        f"[/INST] Schema_link:{sample['schema_linking']}\n"
    )
    # TODO: check whether the EOS token is the same as </s>
    sample["text"] = prompt + tokenizer.eos_token
    return sample

Helper method for loading the datasets; it exists so that different users can load them in Colab.

In [15]:
def get_datasets(file_training, file_eval, data_dir="Prepared_Data"):
    """Load the training and evaluation CSV files as datasets.

    Args:
        file_training: File name of the training CSV inside `data_dir`.
        file_eval: File name of the evaluation CSV inside `data_dir`.
        data_dir: Directory that contains both files. Defaults to
            "Prepared_Data", so existing two-argument calls behave as before.

    Returns:
        A tuple `(train_dataset, eval_dataset)`.
    """
    def _load_csv(file_name):
        # Both files are requested with split="train", including the
        # evaluation file.
        return load_dataset("csv", data_files=f"{data_dir}/{file_name}", split="train")

    train_dataset = _load_csv(file_training)
    eval_dataset = _load_csv(file_eval)

    return train_dataset, eval_dataset

Load the already prepared data.

In [16]:
# Read the prepared training and evaluation CSV files
train_dataset, eval_dataset = get_datasets(file_training="train.csv", file_eval="eval.csv")

Generating train split: 3353 examples [00:00, 22057.18 examples/s]


Construct the new dataset, which consists only of the prompt; the answer is embedded in the prompt itself.

In [17]:
# Render every row into its prompt and drop every column except "text"
train_dataset = train_dataset.map(
    format_instruction_schema_links,
    remove_columns=[name for name in train_dataset.features if name != 'text'],
)
eval_dataset = eval_dataset.map(
    format_instruction_schema_links,
    remove_columns=[name for name in eval_dataset.features if name != 'text'],
)

Map: 100%|██████████████████████| 13075/13075 [00:00<00:00, 14807.96 examples/s]
Map: 100%|████████████████████████| 3353/3353 [00:00<00:00, 15234.34 examples/s]


# Training

In [None]:
################################################################################
# TrainingArguments parameters
################################################################################

# Directory that receives the checkpoints and model predictions
output_dir = f"Models/{new_model}/checkpoints"

# Number of passes over the training data
num_train_epochs = 5

# Per-GPU batch sizes for training and evaluation
per_device_train_batch_size = 3
per_device_eval_batch_size = 3

# Gradients are accumulated over this many steps before each update
gradient_accumulation_steps = 2

# Optimizer and its hyperparameters
optim = "paged_adamw_32bit"
# Initial learning rate
learning_rate = 4e-5
# Weight decay applied to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Maximum gradient norm (gradient clipping)
max_grad_norm = 1

# Learning rate schedule (constant was a bit better than cosine)
lr_scheduler_type = "constant"
# Ratio of steps for a linear warmup (from 0 to learning rate)
# NOTE(review): the plain "constant" schedule appears to ignore warmup;
# "constant_with_warmup" is the variant that uses it - confirm the intent.
warmup_ratio = 0.03

# Mixed precision: bf16 on, fp16 off (bf16 is meant for GPUs such as the A100)
bf16 = True
fp16 = False

# Evaluation step count, handed to TrainingArguments as eval_delay
# (number of update steps before the first evaluation)
eval_delay = 500

# A checkpoint is written every `save_steps` update steps
save_steps = 500

# Metrics are logged every `logging_steps` update steps
logging_steps = 500

# Batch together sequences of similar length;
# saves memory and speeds up training considerably
group_by_length = True

################################################################################
# SFT parameters
################################################################################

# Marker in the prompt after which the expected answer starts
response_template = "Schema_link:"
# Tokenize the marker without special tokens ...
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)
# ... and drop its first token id, which is not needed for locating the marker
response_template_ids = response_template_ids[1:]
# Collator for completion-only language modeling, built from the marker's
# token ids and the tokenizer
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

# Do not pack multiple short examples into one input sequence
packing = False

# Maximum sequence length to use
max_seq_length = 7200

In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir = output_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    # Without this the configured evaluation batch size is never applied and
    # the library default is used instead
    per_device_eval_batch_size = per_device_eval_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps,
    # gradient_checkpointing_kwargs only takes effect when gradient
    # checkpointing itself is switched on
    gradient_checkpointing = True,
    gradient_checkpointing_kwargs= {'use_reentrant':False},
    optim = optim,
    evaluation_strategy = "steps",
    # First evaluation after `eval_delay` steps ...
    eval_delay = eval_delay,
    # ... and then every `eval_delay` steps. Set explicitly instead of relying
    # on the implicit fallback to logging_steps.
    eval_steps = eval_delay,
    save_steps = save_steps,
    logging_steps = logging_steps,
    learning_rate = learning_rate,
    weight_decay = weight_decay,
    max_grad_norm = max_grad_norm,
    bf16 = bf16,
    fp16 = fp16,
    warmup_ratio = warmup_ratio,
    group_by_length = group_by_length,
    lr_scheduler_type = lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset.shuffle(seed=42),
    # Seeded like the training split so that repeated runs are reproducible
    eval_dataset = eval_dataset.shuffle(seed=42),
    dataset_text_field = "text",
    tokenizer = tokenizer,
    data_collator = collator,
    args = training_arguments,
    max_seq_length = max_seq_length,
    packing = packing,
)

Map:   0%|          | 0/13075 [00:00<?, ? examples/s]

Map:   0%|          | 0/3353 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Train model: start the fine-tuning run on the `trainer` object
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.96 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.01 GiB is free. Process 2720 has 13.74 GiB memory in use. Of the allocated memory 13.48 GiB is allocated by PyTorch, and 147.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Saving the Trained Model

In [None]:
# Pre-trained base model that was fine-tuned
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Fine-tuned model name. Kept consistent with the Schema-Links checkpoint that
# is merged in this section, so the merged model is saved and pushed under the
# name of the model it actually contains.
new_model = "Schema-Links-mistral-7B-v0.1"

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so that no secret is stored inside the notebook. If the variable is not set,
# this is None and the Hugging Face libraries fall back to the locally cached
# login (e.g. from `huggingface-cli login`).
token = os.environ.get("HF_TOKEN")

In [None]:
# Checkpoint that holds the trained LoRA adapter, and the target directory for
# the merged model
peft_model_name = "Models/Schema-Links-mistral-7B-v0.1/checkpoints/checkpoint-3000"
output_merged_dir = f"Models/{new_model}-merged"

# Reload the model from the local checkpoint in bfloat16 and merge the LoRA
# weights into it
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_name,
    local_files_only=True,
    torch_dtype = torch.bfloat16,
).merge_and_unload()

# Write the merged model to its own directory (created if it does not exist)
Path(output_merged_dir).mkdir(parents=True, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization = True)

# Store the tokenizer next to the model for easy inference
tokenizer = AutoTokenizer.from_pretrained(peft_model_name, local_files_only=True)
tokenizer.save_pretrained(output_merged_dir)


In [None]:
# Push the model and the tokenizer to the same Hugging Face Hub repository
hub_repo_id = f"BotoxBernd/{new_model}"
model.push_to_hub(hub_repo_id, token=token)
tokenizer.push_to_hub(hub_repo_id, token=token)