# Exploring the creation of a LoRA checkpoint

Based on this excellent blog post: https://brev.dev/blog/fine-tuning-llama-2-your-own-data

In [3]:
# https://brev.dev/blog/fine-tuning-llama-2-your-own-data

## Step 0. Setup the running environment

- Make sure we're using the correct GPU
- Instantiate the accelerator library

In [1]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
import sqlite3
import json
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Switch cuda to second GPU
torch.cuda.set_device(1)

In [3]:
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Step 2. Convert the Sqlite database to `.jsonl` format

This is a one-time operation. We need to get the data in the format that the original Llama model was trained on.

In [6]:
# {"input": "What color is the sky?", "output": "The sky is blue."}
# {"input": "Where is the best place to get cloud GPUs?", "output": "Brev.dev"}
conn = sqlite3.connect('dataset/commits.db')
c = conn.cursor()

In [8]:

# # Open the .jsonl file for writing
# with open('dataset/commits.jsonl', 'w') as jsonl_file:
#     # We will fetch a limited number of rows at a time to avoid memory issues
#     batch_size = 1000  # Adjust batch size as per your system's memory constraints
#     offset = 0
    
#     while True:
#         # Retrieve a batch of rows
#         c.execute(
#             'SELECT git_diff, commit_message FROM Commits LIMIT ? OFFSET ?',
#             (batch_size, offset)
#         )
#         rows = c.fetchall()
        
#         # If no more rows are fetched, break the loop
#         if not rows:
#             break
        
#         # For each row, write the JSON object to the file
#         for git_diff, commit_message in rows:
#             # Construct the JSON object
#             json_obj = json.dumps({
#                 "input": git_diff,
#                 "output": commit_message
#             })
#             # Write the JSON object to the file with a newline
#             jsonl_file.write(json_obj + '\n')
        
#         # Update the offset to fetch the next batch
#         offset += batch_size

# # Close the database connection
# conn.close()


KeyboardInterrupt: 

In [9]:
# Open the .jsonl files for writing in train, validation, and test
train_file = open('dataset/commits_train.jsonl', 'w')
val_file = open('dataset/commits_val.jsonl', 'w')
test_file = open('dataset/commits_test.jsonl', 'w')

# We will fetch a limited number of rows at a time to avoid memory issues
batch_size = 1000  # Adjust batch size as per your system's memory constraints
offset = 0

while True:
    # Retrieve a batch of rows
    c.execute(
        'SELECT git_diff, commit_message FROM Commits LIMIT ? OFFSET ?',
        (batch_size, offset)
    )
    rows = c.fetchall()
    
    # If no more rows are fetched, break the loop
    if not rows:
        break
    
    # Determine the number of rows for each split in this batch
    train_batch_size = int(len(rows) * 0.7)
    val_batch_size = int(len(rows) * 0.1)
    test_batch_size = len(rows) - train_batch_size - val_batch_size
    
    # Split the batch into train, val, and test
    train_rows = rows[:train_batch_size]
    val_rows = rows[train_batch_size:train_batch_size + val_batch_size]
    test_rows = rows[train_batch_size + val_batch_size:]
    
    # Write the rows to their respective files
    for row in train_rows:
        train_file.write(json.dumps({"input": row[0], "output": row[1]}) + '\n')
    for row in val_rows:
        val_file.write(json.dumps({"input": row[0], "output": row[1]}) + '\n')
    for row in test_rows:
        test_file.write(json.dumps({"input": row[0], "output": row[1]}) + '\n')
    
    # Update the offset to fetch the next batch
    offset += batch_size

# Close the files and the database connection
train_file.close()
val_file.close()
test_file.close()
conn.close()


KeyboardInterrupt: 

## Step 3. Time to train!

In [4]:
train_dataset = load_dataset('json', data_files='dataset/commits_train.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='dataset/commits_val.jsonl', split='train')

  table = cls._concat_blocks(blocks, axis=0)


In [5]:
# base_model_id = "meta-llama/Llama-2-7b-hf"
base_model_id = "codellama/CodeLlama-13b-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

Loading checkpoint shards: 100%|██████████| 3/3 [00:09<00:00,  3.08s/it]


In [8]:
def formatting_func(example):
    text = f"""
    The user has requested that you write a commit message based on the diff below.

    Git diff:
    ```
    { example['input'] }
    ```

    Commit message:
    ```
    { example['output'] }
    """
    return text

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

def generate_and_tokenize_prompt(prompt):
    return tokenizer(formatting_func(prompt), return_tensors="pt", padding=True, truncation=True) # TODO: Are these the right params?

In [None]:
def plot_data_lengths(tokenize_train_dataset, tokenized_val_dataset):
    lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
    lengths += [len(x['input_ids']) for x in tokenized_val_dataset]
    print(len(lengths))

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.7, color='blue')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.show()

plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)

In [10]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/147369 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map:   1%|          | 1270/147369 [49:24<17:42:42,  2.29 examples/s]  