In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    set_seed,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from trl import SFTTrainer
from tqdm import tqdm
import torch
import time
import pandas as pd
from functools import partial
import numpy as np
from dotenv import load_dotenv
from huggingface_hub import login
import os

In [2]:
load_dotenv()

RANDOM_SEED = int(os.getenv("RANDOM_SEED"))
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

set_seed(RANDOM_SEED)
login(token=HUGGINGFACE_TOKEN)

In [3]:
print(torch.cuda.get_device_name(torch.cuda.current_device()))

NVIDIA GeForce RTX 4060


In [4]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [5]:
model_name='microsoft/phi-2'
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def generate(model, prompt, max_length=2048, tokenizer=tokenizer):
    """
    Generate text using the provided model and prompt
    
    Args:
        model: The language model
        prompt: Input text prompt
        max_length: Maximum length of generated text
        tokenizer: Pre-instantiated tokenizer (uses global tokenizer by default)
    
    Returns:
        List containing generated text
    """
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    
    # Move to same device as model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_return_sequences=1,
            temperature=0.4,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            max_length=max_length,
            use_cache=False,
        )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(prompt):].strip()
    
    print(f'INPUT [{len(prompt)}]:\n{prompt}')
    print(200*'-')
    print(f'OUTPUT [{len(generated_text)}]:\n{generated_text}')
    print(200*'-')

    return generated_text[len(prompt):].strip()

In [None]:
# prompt = "What does the 's' character in '-rwsr-xr-x' represent in file permissions?"
# output = generate(original_model, prompt)

In [None]:
# prompt = "How can the %n format specifier in printf be misused?"
# output = generate(original_model, prompt)

In [None]:
# prompt = "How does SEV prevent a hypervisor from accessing VM data?"
# output = generate(original_model, prompt)

## Data Preparation

In [None]:
GENERATED_DATA_DIR = os.getenv('GENERATED_DATA_DIR')

def format_text(entry):
    entry['text'] = f'Instruct: {entry["input"]}\nOutput: {entry["output"]}'

def compile_dataset(source_dir, tokenizer=tokenizer):
    source_csv_paths = os.listdir(source_dir)
    df = pd.DataFrame()
    for csv_path in source_csv_paths:
        csv = pd.read_csv(os.path.join(source_dir, csv_path))
        df = pd.concat([df, csv])

    df['text'] = df.apply(lambda entry: f'Instruct: {entry["input"]}\nOutput: {entry["output"]}', 1)
    return df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

full_data = compile_dataset(GENERATED_DATA_DIR)
full_data

Unnamed: 0,input,output,text
0,Describe the fundamental nature of a Trusted P...,A Trusted Platform Module (TPM) is a chip inte...,Instruct: Describe the fundamental nature of a...
1,What is the objective of spyware?,The objective of spyware is to gather informat...,Instruct: What is the objective of spyware?\nO...
2,How can an Access Control Matrix be updated?,An Access Control Matrix can be updated in sev...,Instruct: How can an Access Control Matrix be ...
3,Provide an example of a normal SQL query and a...,"A normal SQL query, given a username 'Bob', wo...",Instruct: Provide an example of a normal SQL q...
4,How do different types of program execution af...,There are two main types of program execution:...,Instruct: How do different types of program ex...
...,...,...,...
1995,How does coverage-guided fuzzing work?,Coverage-guided fuzzing uses traditional fuzzi...,Instruct: How does coverage-guided fuzzing wor...
1996,What is the purpose of overwriting EIP in a st...,"In a stack smashing attack, EIP (the Instructi...",Instruct: What is the purpose of overwriting E...
1997,"Explain the concept of ""widthness overflow"" as...",Widthness overflow occurs when a value is assi...,"Instruct: Explain the concept of ""widthness ov..."
1998,Who is typically responsible for granting or r...,Granting permissions is typically done by the ...,Instruct: Who is typically responsible for gra...


In [12]:
full_tokenized_data = [tokenizer(entry) for entry in full_data['text']]
full_tokenized_data

[{'input_ids': [50256, 43993, 25, 39373, 4892, 262, 7531, 3450, 286, 257, 833, 8459, 19193, 19937, 357, 51, 5868, 737, 198, 26410, 25, 317, 833, 8459, 19193, 19937, 357, 51, 5868, 8, 318, 257, 11594, 11521, 656, 257, 3859, 11, 15025, 355, 257, 11266, 763, 12, 41341, 13, 6363, 1994, 16704, 318, 326, 663, 1181, 2314, 307, 19066, 416, 17412, 2583, 1080, 3788, 11, 13359, 257, 5713, 2858, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [50256, 43993, 25, 1867, 318, 262, 9432, 286, 13997, 1574, 30, 198, 26410, 25, 383, 9432, 286, 13997, 1574, 318, 284, 6431, 1321, 546, 262, 2836, 290, 3758, 340, 284, 262, 15250, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [50256, 43993, 25, 1374, 460, 281, 8798, 6779, 24936

In [13]:
TRAINING_PROPORTION = float(os.getenv("TRAINING_PROPORTION"))
TRAINING_ENTRIES = int(len(full_tokenized_data) * TRAINING_PROPORTION)

TESTING_PROPORTION = int((1 - TRAINING_PROPORTION) * 10) / 10
TESTING_ENTRIES = len(full_tokenized_data) - TRAINING_ENTRIES

print(f"""Training Proportion = {TRAINING_PROPORTION}
Training Entries = {TRAINING_ENTRIES}

Testieng Proportion = {TESTING_PROPORTION}
Testieng Entries = {TESTING_ENTRIES}""")

Training Proportion = 0.7
Training Entries = 1400

Testieng Proportion = 0.3
Testieng Entries = 600


In [14]:
training_data = full_tokenized_data[:TRAINING_ENTRIES].copy()
testing_data = full_tokenized_data[TRAINING_ENTRIES:].copy()

## Training Base Model

In [15]:
original_model = prepare_model_for_kbit_training(original_model)

In [16]:
config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)
peft_model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 2,800,655,360 || trainable%: 0.7488


In [None]:
TRAINING_OUTPUT_DIR = os.getenv('TRAINING_OUTPUT_DIR')
TRAINING_STEPS = int(os.getenv('TRAINING_STEPS'))

peft_training_args = TrainingArguments(
    output_dir = TRAINING_OUTPUT_DIR,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=TRAINING_STEPS,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

peft_model.config.use_cache = False

peft_trainer = Trainer(
    model=peft_model,
    train_dataset=training_data,
    eval_dataset=testing_data,
    args=peft_training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [18]:
peft_trainer.train()

Step,Training Loss
25,2.0868
50,1.8732
75,1.7934
100,1.7648
125,1.7678
150,1.6658
175,1.7312
200,1.6895
225,1.7358
250,1.5926


TrainOutput(global_step=1000, training_loss=1.5210958309173583, metrics={'train_runtime': 1548.8026, 'train_samples_per_second': 2.583, 'train_steps_per_second': 0.646, 'total_flos': 5246708369909760.0, 'train_loss': 1.5210958309173583, 'epoch': 2.857142857142857})

In [27]:
eval_tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               add_bos_token=True,
                                               trust_remote_code=True,
                                               use_fast=False)
eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [35]:
tuned_model = PeftModel.from_pretrained(original_model, 
                                        f"{TRAINING_OUTPUT_DIR}/checkpoint-{TRAINING_STEPS}",
                                        torch_dtype=torch.float16,
                                        is_trainable=False)

## Manual Testing

In [None]:
prompt = "How can the %n format specifier in printf be misused?"
generate(tuned_model, prompt)

## ROUGE Testing

In [None]:
original_model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)

In [None]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    human_baseline_text_output = human_baseline_summaries[idx]
    prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"
    
    original_model_res = generate(original_model,prompt,100,)
    original_model_text_output = original_model_res[0].split('Output:\n')[1]
    
    peft_model_res = generate(tuned_model,prompt,100,)
    peft_model_output = peft_model_res[0].split('Output:\n')[1]
    print(peft_model_output)
    peft_model_text_output, success, result = peft_model_output.partition('###')

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df