# Getting Started
#### To start this notebook, you must have a huggingface account and request access from Meta to use Llama 2.
https://huggingface.co/

#### In huggingface, create an access token
https://huggingface.co/docs/hub/security-tokens

#### Inside your home directory access .apikeys and create a huggingface_api_key.txt and paste you access token inside the file
path - /home/{your username}/.apikeys
#### Using the link below request access from Meta
https://huggingface.co/meta-llama/Llama-2-7b-hf

#### Once you recieve access from Meta inside terminal create a conda environment using
conda create --name {environment_name} python=3.10

#### Then Install ipykernel using
conda install ipykernel

#### To allow your environment to be used in the notebook run the following line and select your environment on the top right besides the debugging symbol
python -m ipykernel install --user --name={environment_name}

#### Go back to terminal and install all the packages with
pip install -r packages.txt

#### Edit the data set, test set, and validation set under Load Datasets with the path and you are good to go!
##### All imported data must be a csv

## Access Huggingface API Key

In [1]:
# Set cache directory and load Huggingface api key
# Clean up notebook when creating new repo
import os

username = os.getenv('USER')
directory_path = os.path.join('/scratch', username)

# Set Huggingface cache directory to be on scratch drive
if os.path.exists(directory_path):
    hf_cache_dir = os.path.join(directory_path, 'hf_cache')
    if not os.path.exists(hf_cache_dir):
        os.mkdir(hf_cache_dir)
    print(f"Okay, using {hf_cache_dir} for huggingface cache. Models will be stored there.")
    assert os.path.exists(hf_cache_dir)
    os.environ['TRANSFORMERS_CACHE'] = f'/scratch/{username}/hf_cache/'
else:
    error_message = f"Are you sure you entered your username correctly? I couldn't find a directory {directory_path}."
    raise FileNotFoundError(error_message)

# Load Huggingface api key
api_key_loc = os.path.join('/home', username, '.apikeys', 'huggingface_api_key.txt')

if os.path.exists(api_key_loc):
    print('Huggingface API key loaded.')
    with open(api_key_loc, 'r') as api_key_file:
        huggingface_api_key = api_key_file.read().strip()  # Read and store the contents
else:
    error_message = f'Huggingface API key not found. You need to get an HF API key from the HF website and store it at {api_key_loc}.\n' \
                    'The API key will let you download models from Huggingface.'
    raise FileNotFoundError(error_message)

# Now you can use the `huggingface_api_key` variable wherever you need it.


Okay, using /scratch/kwamea/hf_cache for huggingface cache. Models will be stored there.
Huggingface API key loaded.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from configs import fsdp_config, train_config
from peft import get_peft_model, prepare_model_for_int8_training
from utils.dataset_utils import get_preprocessed_dataset
from utils.train_utils import (
    train,
    freeze_transformer_layers,
    setup,
    setup_environ_flags,
    clear_gpu_cache,
    print_model_size,
    #get_policies
)
from utils.config_utils import (
    update_config,
    generate_peft_config,
    generate_dataset_config,
)
from datasets import load_dataset
from datasets import Dataset
from pathlib import Path
import sys
import csv
from configs.datasets import samsum_dataset, alpaca_dataset, grammar_dataset
from ft_datasets.utils import Concatenator

## Load model from cache

In [3]:
# Assuming you have already loaded the huggingface_api_key variable
'''
model = AutoModelForCausalLM.from_pretrained(
        train_config.model_name,
        load_in_8bit=True if train_config.quantization else None,
        device_map="auto" if train_config.quantization else None,
    )
    
#LlamaTokenizer.from_pretrained
tokenizer = AutoTokenizer.from_pretrained(train_config.model_name)
tokenizer.add_special_tokens(
    {
        "pad_token": "<PAD>",
    }
)
'''
tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        cache_dir=os.path.join('/scratch', username),
        load_in_8bit=True if train_config.quantization else None,
        token=huggingface_api_key,
)

tokenizer.add_special_tokens(
    {
        "pad_token": "<PAD>",
    }
)

model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        load_in_8bit=True if train_config.quantization else None,
        device_map="auto" if train_config.quantization else None,
        cache_dir=os.path.join('/scratch', username),
        token=huggingface_api_key
)
#the code will output "Error displaying widget: model not found" it is not an error just the code failing to create a loading bar

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## Load in your dataset

In [6]:
#add testset and rename current test set to validation set 

#edit file path to your unique dataset
dataset = load_dataset('csv', data_files='samsum-data/samsum-train.csv',split = 'train')
valset = load_dataset('csv', data_files='samsum-data/samsum-validation.csv',split = 'train')
testset = load_dataset('csv', data_files='samsum-data/samsum-test.csv',split = 'train')

sample_fraction = 0.01

num_samples = int(len(dataset) * sample_fraction)

# Sample 1/10 of the data randomly
dataset = dataset.shuffle(seed=42).select(list(range(num_samples)))
valset = valset.shuffle(seed=42).select(list(range(num_samples)))
testset = testset.shuffle(seed=42).select(list(range(num_samples)))

#Edit the prompt to tell the model what to do
prompt = (
    #f"Guess the score based on the text:\n{{text}}\n---\nScore:\n{{score}}{{eos_token}}"
    f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n{{summary}}{{eos_token}}"
)

#edit the variables in prompt.format to match your data: essentially what you what the model to read
def apply_prompt_template(sample):
    return {
        "text": prompt.format(
            dialog = sample["dialogue"],
            summary = sample["summary"],
            eos_token=tokenizer.eos_token,
        )
    }

dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
valset = valset.map(apply_prompt_template, remove_columns=list(valset.features))
testset = testset.map(apply_prompt_template, remove_columns=list(testset.features))

dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,
    remove_columns=list(dataset.features), #dataset['train'].features
).map(Concatenator(), batched=True)
valset = valset.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,
    remove_columns=list(valset.features), #dataset['train'].features
).map(Concatenator(), batched=True)
testset = testset.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,
    remove_columns=list(testset.features), #dataset['train'].features
).map(Concatenator(), batched=True)


train_dataset = dataset
val_dataset = valset
test_dataset = testset

Found cached dataset csv (/home/kwamea/.cache/huggingface/datasets/csv/default-04ed86c0369d6106/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Found cached dataset csv (/home/kwamea/.cache/huggingface/datasets/csv/default-98e7740e620c6c6f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Found cached dataset csv (/home/kwamea/.cache/huggingface/datasets/csv/default-ed9c43e8e741b026/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached shuffled indices for dataset at /home/kwamea/.cache/huggingface/datasets/csv/default-04ed86c0369d6106/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-cf14eefb8d952c92.arrow
Loading cached shuffled indices for dataset at /home/kwamea/.cache/huggingface/datasets/csv/default-98e7740e620c6c6f/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-801b7b4a51fce617.arrow
Loading cached shuffled indices for dataset at /home/kwamea/.c

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

Map:   0%|          | 0/147 [00:00<?, ? examples/s]

## Test the model before finetuning

In [7]:
#Edit eval_prompt to match your data
eval_prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))


Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
- I'm pretty sure I am.
- I want to get a puppy for my son.
- Raising a dog is a tough issue.
- One that won't grow up too big;-)
- Do you know which one he would like?
- Oh, 

## Enables Parameter Efficient Finetuning (PEFT)

In [8]:
#reduces the parameters needed to train
model.train()
def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_int8_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        bias= "none",
        target_modules = ["q_proj", "v_proj"]
    )
 
    kwargs = {
        'use_peft': True, 
        'peft_method': 'lora', 
        'quantization': True, 
        'use_fp16': True, 
        'model_name': os.path.join('/scratch', username, 'models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9'), 
        'output_dir': os.path.join('/scratch', username)
    }
    
    update_config((train_config, fsdp_config), **kwargs)
    
    model = prepare_model_for_int8_training(model)
    peft_config = generate_peft_config(train_config, kwargs)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config
model, lora_config = create_peft_config(model)



trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


In [9]:
torch.cuda.empty_cache()
from transformers import TrainerCallback
from contextlib import nullcontext
enable_profiler = False
output_dir = os.path.join('/scratch', username ,'llama-output')
#set up the configurations for training
config = {
    'lora_config': lora_config,
    'learning_rate': 1e-4,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 2,
    'per_device_train_batch_size': 2,
    'gradient_checkpointing': False,
}

# Set up profiler
if enable_profiler:
    wait, warmup, active, repeat = 1, 1, 2, 1
    total_steps = (wait + warmup + active) * (1 + repeat)
    schedule =  torch.profiler.schedule(wait=wait, warmup=warmup, active=active, repeat=repeat)
    profiler = torch.profiler.profile(
        schedule=schedule,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{output_dir}/logs/tensorboard"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True)
    
    class ProfilerCallback(TrainerCallback):
        def __init__(self, profiler):
            self.profiler = profiler
            
        def on_step_end(self, *args, **kwargs):
            self.profiler.step()

    profiler_callback = ProfilerCallback(profiler)
else:
    profiler = nullcontext()

## Defines training arguments and trains the model

In [10]:
torch.cuda.empty_cache()
from transformers import default_data_collator, Trainer, TrainingArguments

# Define training args
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    bf16=True, 
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    evaluation_strategy="steps",
    logging_steps=5,  # 10
    save_strategy="no",
    optim="adamw_torch_fused",
    auto_find_batch_size = True, 
    max_steps=total_steps if enable_profiler else -1,
    **{k:v for k,v in config.items() if k != 'lora_config'},
    remove_unused_columns=False
)

with profiler:
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
        callbacks=[profiler_callback] if enable_profiler else [],
    )
    
# Start training
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=2.0321404933929443, metrics={'train_runtime': 39.7157, 'train_samples_per_second': 0.403, 'train_steps_per_second': 0.101, 'total_flos': 1299881247375360.0, 'train_loss': 2.0321404933929443, 'epoch': 1.0})

## Save the model to output directory

In [11]:
model.save_pretrained(output_dir)

## Test model on the same input as before

In [12]:
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))



Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?
B: What do you want to do?
A: I want to get a pu

In [13]:
print(output_dir)

/scratch/kwamea/llama-output


In [19]:
#scratch
model.config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": false,
    "load_in_8bit": true,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.32.1",
  "use_cache": true,
  "vocab_size": 32000

## Load Saved Model

In [17]:
#load and test- get metrics and automatically show random 5 model input and output
#LlamaForCausalLM
#AutoModelForCausalLM

model = LlamaForCausalLM.from_pretrained(output_dir)

model.eval


OSError: /scratch/kwamea/llama-output does not appear to have a file named config.json. Checkout 'https://huggingface.co//scratch/kwamea/llama-output/main' for available files.

In [None]:
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))
