In [1]:
# Set cache directory and load Huggingface api key
#
# Defines, for the rest of the notebook:
#   username            - current user, taken from the USER environment variable
#   directory_path      - the user's scratch directory (/scratch/<username>)
#   output_dir          - /scratch/<username>/llama-output
#   hf_cache_dir        - Huggingface cache directory on the scratch drive
#   huggingface_api_key - API token read from ~/.apikeys/huggingface_api_key.txt
import os

username = os.getenv('USER')
if not username:
    # Without this guard os.path.join fails with an opaque TypeError on None,
    # and an empty string would silently resolve to '/scratch/'.
    raise EnvironmentError(
        "The USER environment variable is not set, so the scratch and home paths cannot be determined."
    )

directory_path = os.path.join('/scratch', username)
output_dir = os.path.join(directory_path, 'llama-output')

# Set Huggingface cache directory to be on scratch drive
if not os.path.exists(directory_path):
    error_message = f"Are you sure you entered your username correctly? I couldn't find a directory {directory_path}."
    raise FileNotFoundError(error_message)

hf_cache_dir = os.path.join(directory_path, 'hf_cache')
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(hf_cache_dir, exist_ok=True)
print(f"Okay, using {hf_cache_dir} for huggingface cache. Models will be stored there.")
# Reuse the directory that was just created instead of rebuilding the path by hand.
os.environ['TRANSFORMERS_CACHE'] = hf_cache_dir

# Load Huggingface api key
api_key_loc = os.path.join('/home', username, '.apikeys', 'huggingface_api_key.txt')

if not os.path.exists(api_key_loc):
    error_message = f'Huggingface API key not found. You need to get an HF API key from the HF website and store it at {api_key_loc}.\n' \
                    'The API key will let you download models from Huggingface.'
    raise FileNotFoundError(error_message)

with open(api_key_loc, 'r') as api_key_file:
    huggingface_api_key = api_key_file.read().strip()  # Read and store the contents

if not huggingface_api_key:
    raise ValueError(f'The Huggingface API key file at {api_key_loc} is empty.')

# Only report success once the key has actually been read.
print('Huggingface API key loaded.')

Okay, using /scratch/kwamea/hf_cache for huggingface cache. Models will be stored there.
Huggingface API key loaded.


In [2]:
import torch
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline, DataCollatorWithPadding, default_data_collator, Trainer, TrainingArguments
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from configs import fsdp_config, train_config
from peft import get_peft_model, prepare_model_for_int8_training, PeftModelForCausalLM, LoraConfig, TaskType, prepare_model_for_int8_training, PeftModel
from utils.dataset_utils import get_preprocessed_dataset
from utils.train_utils import (
    train,
    freeze_transformer_layers,
    setup,
    setup_environ_flags,
    clear_gpu_cache,
    print_model_size,
)
from utils.config_utils import (
    update_config,
    generate_peft_config,
    generate_dataset_config,
)
from datasets import Dataset,load_dataset, ClassLabel, Features, Array2D
from pathlib import Path
import sys
import csv
import random
import json
from configs.datasets import samsum_dataset, alpaca_dataset, grammar_dataset
from ft_datasets.utils import Concatenator
import huggingface_hub
from huggingface_hub import notebook_login, Repository, HfApi, create_repo, delete_repo
# Authenticate this session with the Huggingface Hub using the API key that the
# first cell read from disk; the key is what allows models to be downloaded.
huggingface_hub.login(token = huggingface_api_key)



Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/kwamea/.cache/huggingface/token
Login successful


In [3]:
# Load the tokenizer for the base checkpoint.
# Alternative checkpoint ids kept for reference:
#   mistralai/Mistral-7B-v0.1, mistralai/Mixtral-8x7B-v0.1, meta-llama/Llama-2-70b-hf
# `load_in_8bit` is intentionally not passed here: it is a model-weight
# quantization option and does not apply to loading a tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    resume_download=True,
    cache_dir=os.path.join('/scratch', username),
    token=huggingface_api_key,
)

# Register a dedicated padding token; the call returns how many tokens were added.
tokenizer.add_special_tokens(
    {
        "pad_token": "<PAD>",
    }
)

1

In [8]:
#loads plain model
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Request 8-bit weights only when the training config asks for quantization,
# mirroring the condition already used for `device_map` below.
# `float16_bits` / `float32_bits` are not BitsAndBytesConfig options, so the
# previous config never enabled 8-bit loading.
plain_model_quantization_config = (
    BitsAndBytesConfig(load_in_8bit=True) if train_config.quantization else None
)

plain_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    resume_download=True,
    quantization_config=plain_model_quantization_config,
    device_map="auto" if train_config.quantization else None,
    # NOTE(review): models are cached directly under /scratch/<username>, not in
    # the hf_cache directory created in the first cell - confirm this is intended.
    cache_dir=os.path.join('/scratch', username),
    trust_remote_code=True,
    token=huggingface_api_key,
)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [None]:
# Loads the datasets and produces three splits: `dataset` (train), `valset`
# (validation) and `testset` (test).
#
# Edit the file paths below to point at your own data.

# Fixed seed so the random split is the same after a kernel restart.
SPLIT_SEED = 42

user_input = input("Do you want to input your own test splits? (yes/no): ")

if user_input.strip().lower() == "yes":
    # Pre-split CSV files; each file is read through the 'train' key that
    # load_dataset uses for a single data file.
    dataset = load_dataset('csv', data_files='samsum-data/samsum-train.csv',split = 'train')
    valset = load_dataset('csv', data_files='samsum-data/samsum-validation.csv',split = 'train')
    testset = load_dataset('csv', data_files='samsum-data/samsum-test.csv',split = 'train')

#creates train, test, and validation splits
else:
    # Load the dataset from a CSV file
    full_dataset = load_dataset('csv', data_files='combined_info.csv')

    # Get the number of examples in the dataset
    num_examples = len(full_dataset["train"])

    # Define the split ratios
    train_ratio = 0.8
    validation_ratio = 0.1
    test_ratio = 0.1

    # Calculate the number of examples for each split
    num_train_examples = int(num_examples * train_ratio)
    num_validation_examples = int(num_examples * validation_ratio)
    num_test_examples = int(num_examples * test_ratio)

    if min(num_train_examples, num_validation_examples, num_test_examples) < 1:
        raise ValueError(
            f"combined_info.csv has only {num_examples} rows, which is too few to "
            "build non-empty train/validation/test splits."
        )

    # First carve off the training rows; everything else is held out.
    splits = full_dataset["train"].train_test_split(
        test_size=num_validation_examples + num_test_examples,
        train_size=num_train_examples,
        shuffle=True,
        seed=SPLIT_SEED,
    )

    # Then divide the held-out rows into validation and test. Splitting the
    # held-out part (rather than re-splitting the full dataset) guarantees the
    # test rows never overlap with the training rows.
    held_out_splits = splits["test"].train_test_split(
        test_size=num_test_examples,
        train_size=num_validation_examples,
        shuffle=True,
        seed=SPLIT_SEED,
    )

    # Assign the splits to variables
    dataset = splits["train"]
    valset = held_out_splits["train"]
    testset = held_out_splits["test"]

# `dataset`, `valset` and `testset` are now available for training, validation and testing.

#####TODO change the prompt to ask based on the given cat what theme best fits
# Template used for the train/validation samples. `{text}` and `{label}` are
# placeholders that are filled in later through `prompt.format(...)`; edit the
# wording to tell the model what to do.
prompt = (
    "Generate labels that best fit the following text with respect topic:\n"
    "{text}\n"
    "---\n"
    "Label:{label}\n"
    "Labels:\n"
)

# Template used for the test samples; it only has a `{text}` placeholder.
test_prompt = (
    "Tell me which labels best fit the following text:\n"
    "{text}\n"
    "---\n"
    "Label:\n"
)

# Edit the keyword arguments of prompt.format to match your data: they decide
# which fields of a sample the model gets to read.
def apply_prompt_template(sample):
    """Render the training prompt for one sample.

    Reads ``sample["text"]`` and ``sample["label"]``, substitutes them into the
    module-level ``prompt`` template and returns the result under the single
    key ``"text"``.
    """
    rendered = prompt.format(text=sample["text"], label=sample["label"])
    return {"text": rendered}

# Only include what you want the model to see during testing.
def apply_prompt_template_TEST(sample):
    """Render the test prompt for one sample.

    Only ``sample["text"]`` is substituted into the module-level ``test_prompt``
    template; the result is returned under the single key ``"text"``.
    """
    rendered = test_prompt.format(text=sample["text"])
    return {"text": rendered}

def _tokenize_batch(batch):
    """Run the notebook's tokenizer over the "text" column of one batch."""
    return tokenizer(batch["text"])


def _tokenize_and_concatenate(split):
    """Tokenize `split` in batches, dropping its existing columns, then map
    Concatenator over the tokenized result in batches."""
    tokenized = split.map(
        _tokenize_batch,
        batched=True,
        remove_columns=list(split.features),
    )
    return tokenized.map(Concatenator(), batched=True)


# Render every sample into a single prompt string stored under "text".
data = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
val = valset.map(apply_prompt_template, remove_columns=list(valset.features))
test = testset.map(apply_prompt_template_TEST, remove_columns=list(testset.features))

# Tokenize each rendered split with the same two-step pipeline.
train_dataset = _tokenize_and_concatenate(data)
val_dataset = _tokenize_and_concatenate(val)
test_dataset = _tokenize_and_concatenate(test)

In [7]:
#prompt for plain model
eval_prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
"""

# Tokenize the prompt and move the resulting tensors to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# NOTE(review): the recorded execution counts show the adapter-loading cell
# (In [6]) ran before this one (In [7]). If loading the adapter modifies
# plain_model in place, this output may not reflect the untouched base model -
# confirm by running the cells top to bottom.
plain_model.eval()
with torch.no_grad():
    # Generate with `plain_model`; the previous code referenced an undefined
    # name `model`, which only worked through leftover kernel state.
    generated_ids = plain_model.generate(**model_input, max_new_tokens=100)[0]
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))


Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
A: A wants to get a puppy for her son.
B: B suggests to go to the animal shelter.
A: A asks B to go with her to the animal shelter.
B: B asks what A wants to do.
A: A wants to

In [6]:
#loads finetuned model
# Builds a PEFT causal-LM model from `plain_model` plus whatever is stored in
# `output_dir` (/scratch/<username>/llama-output, set in the first cell) -
# presumably the adapter written by a fine-tuning run; verify the directory contents.
# NOTE(review): PEFT may inject the adapter layers into `plain_model` itself, so
# `plain_model` might no longer behave as the untouched base model after this
# call - confirm before comparing the two models.
finetuned_model = PeftModelForCausalLM.from_pretrained(plain_model, output_dir)

In [None]:
#prompt for finetuned model
eval_prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow’s afternoon?
B: I’m pretty sure I am. What’s up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we’ve discussed it many times. I think he’s ready now.
B: That’s good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he’d name it after his dead hamster – Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
"""

# Tokenize the prompt and move the resulting tensors to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

finetuned_model.eval()
with torch.no_grad():
    # Generate with `finetuned_model`; the previous code referenced an undefined
    # name `model`, so this cell did not evaluate the model it claims to.
    generated_ids = finetuned_model.generate(**model_input, max_new_tokens=100)[0]
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))