In [84]:
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Load libraries

In [162]:
import xml.etree.ElementTree as ET
import urllib.request
import pandas as pd
import re
import os, torch, logging
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments, pipeline, HfArgumentParser
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from datasets import Dataset
import os

## Fetch training data

In [163]:
def fetch_papers(url='http://export.arxiv.org/api/query?search_query=ti:llama&start=0&max_results=70',
                 timeout=30):
    """Fetch paper titles and summaries from an Atom feed (the arXiv API by default).

    Args:
        url: Feed URL to query. The default asks the arXiv API for up to 70
            papers whose title matches "llama".
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        A list of strings, one per Atom ``<entry>``, each formatted as
        ``"Title: <title>\\nSummary: <summary>\\n"``. A missing title or
        summary element contributes an empty string for that field.

    Raises:
        OSError: If the request fails or times out (``urllib.error.URLError``
            is a subclass).
        xml.etree.ElementTree.ParseError: If the response is not well-formed XML.
    """
    atom_ns = '{http://www.w3.org/2005/Atom}'

    # Use a context manager so the response (and its socket) is always closed.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        data = response.read().decode('utf-8')
    root = ET.fromstring(data)

    papers_list = []
    for entry in root.findall(f'{atom_ns}entry'):
        # findtext() tolerates absent elements, whereas .find(...).text would
        # raise AttributeError on None.
        title = entry.findtext(f'{atom_ns}title', default='')
        summary = entry.findtext(f'{atom_ns}summary', default='')
        papers_list.append(f"Title: {title}\nSummary: {summary}\n")

    return papers_list

In [164]:
# Prefer live data from arXiv; fall back to the cached CSV when the fetch fails.
try:
    paper_list = fetch_papers()
except Exception as fetch_error:
    # `except Exception` (not a bare `except:`) lets Ctrl-C / SystemExit
    # propagate, and the reason for the fallback is reported instead of hidden.
    logging.warning("Fetching papers failed (%r); falling back to df.csv", fetch_error)
    cached_papers = pd.read_csv('df.csv')
    # The paper text is taken from the second column of the cached frame
    # (presumably the first column is a saved index — confirm against df.csv).
    paper_list = cached_papers.iloc[:, 1].tolist()

## Clean/prepare training data

In [165]:
def clean_text(text):
    """Convert a ``Title: ... Summary: ...`` record into a single prompt line.

    The substitutions run in order: the ``Title:`` / ``Summary:`` labels become
    ``<s>[INST]`` / ``[/INST]``, blank-line breaks are deleted, remaining
    newlines become spaces, ``http...`` links and ``@...`` tokens are removed,
    and whitespace runs are collapsed to one space. `` </s>`` is then appended
    and finally every ``^`` followed by non-space characters is stripped.

    Args:
        text: Raw record string.

    Returns:
        The cleaned string, ending in `` </s>``.
    """
    # (pattern, replacement) pairs; the order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        (r'Title:', '<s>[INST]'),
        (r'Summary:', '[/INST]'),
        (r'\n\n', ''),
        (r'\n', ' '),
        # One pass is enough: each match runs to the next whitespace, so a
        # second pass over the result can never find another link.
        (r"http\S+", ""),
        (r'@[^\s]+', ""),
        (r'\s+', " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    terminated = text + ' </s>'
    return re.sub(r'\^[^ ]+', "", terminated)

In [166]:
# Clean every record, writing the results back into the same list object.
# NOTE: this overwrites the raw text, so re-running the cell cleans the
# already-cleaned strings again (clean_text appends another ' </s>').
paper_list[:] = [clean_text(raw_paper) for raw_paper in paper_list]

In [167]:
# Wrap the cleaned strings in a one-column frame; the column name 'Text' is
# the one the trainer cell passes as dataset_text_field.
paper_df = pd.DataFrame({'Text':paper_list})

In [185]:
# pandas -> Hugging Face: convert the frame into a datasets.Dataset, which is
# what the trainer receives as train_dataset
paper_hf = Dataset.from_pandas(paper_df)

## Load base model (Llama 2, 7B params)
In order to load the base model (Llama 2 7B), we need the following:
1. Tokenizer
2. Quantization config

In [186]:
# Name of the pre-trained checkpoint; passed to both the tokenizer and the
# model `from_pretrained` calls
base_model_name = "NousResearch/Llama-2-7b-chat-hf"

In [187]:
# Download the tokenizer (vocab) for the base checkpoint from Hugging Face.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repo to run locally — confirm it is actually required for this checkpoint.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token, and pad on the right.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"

As per QLoRA, we load the model in 4bit (`load_in_4bit=True`), specifically using NF4 quantization (`bnb_4bit_quant_type="nf4"`). Our model would suffer greatly from information loss if we were to also run the computations in 4bit. For this reason, the weights are stored in 4bit but the computation is done in 16bit (`bnb_4bit_compute_dtype=torch.bfloat16`).

In [189]:
# Quantization Config (4-bit loading as per QLoRA)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # Select NF4 quantization. The keyword is `bnb_4bit_quant_type`; the
    # previously used `bnb_4bit_quant_dtype` is not a BitsAndBytesConfig
    # argument, so NF4 was never actually requested.
    bnb_4bit_quant_type="nf4",
    # dtype for the forward / backward computation (16 or 32 bit).
    # TODO: compare against torch.float16.
    bnb_4bit_compute_dtype=torch.bfloat16,
    # if True, uses a second quantization to save an additional 0.4 bits per param
    bnb_4bit_use_double_quant=False,
)

In [196]:
# Load the base model with the quantization config from the previous cell.
# device_map={"": 0} targets device 0, so this will fail if no GPU is available.
# NOTE(review): the recorded run hit CUDA OOM with the GPU already almost fully
# allocated — possibly an earlier copy of the model still held in memory;
# confirm and restart the kernel / free memory before re-running.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config = quant_config,
    device_map = {"": 0},
    use_safetensors=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 9.00 MiB is free. Process 2963607 has 21.97 GiB memory in use. Of the allocated memory 21.18 GiB is allocated by PyTorch, and 487.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Turn off `use_cache` on the model config before training
# (presumably the generation cache, which is not needed while training — confirm).
base_model.config.use_cache = False

# Setting this to a value different from 1 -> more accurate but slower computation of linear layers
base_model.config.pretraining_tp = 1

## Training

In order to finetune with LoRA, we first need to set the config. Two hyper-parameters worth experimenting with are `lora_alpha` and `r`.

`r` is the rank for our decomposed matrices. For example, suppose the original weight matrix in our base model has dimensions 100x100. Then, if we go with `r = 8`, our B and A matrices will be 100x8 and 8x100, so that their product $BA$ has the same 100x100 shape as the original weight matrix.

`lora_alpha` ($\alpha$) is used to scale our learned weight matrix by a factor of $\dfrac{\alpha}{r}$. By setting $\alpha$ to a value greater (less) than our rank $r$, we are in effect putting more (less) importance on our learned weights relative to the original weights. Note that this can also be achieved by changing the learning rate with a fixed $\alpha$.

In [192]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=8, # scales the learned update by lora_alpha / r (here 8 / 8 = 1); experiment with different values
    lora_dropout=0.1,
    r=8, # rank of the decomposed matrices; experiment with different values
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = ["q_proj","k_proj","v_proj"]
    # NOTE(review): a commented-out alternative list used to sit here
    # ("dense", "dense_h_to_4h", "dense_4h_to_h", labelled "uncomment for
    # maximum performance"). Those names do not follow the q_proj/k_proj/v_proj
    # naming targeted above — presumably copied from a different architecture;
    # verify the module names against this model before enabling them.
)

In [193]:
# Training Params
# NOTE(review): the trainer cell reports 59 examples; with a batch size of 4
# that is about 15 steps for the single epoch, which is fewer than
# save_steps / logging_steps = 25 — confirm whether intermediate logs or
# checkpoints are expected.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,  # both mixed-precision flags are left off
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # presumably "no step cap, use num_train_epochs" — confirm
    # NOTE(review): a warmup ratio is combined with a "constant" scheduler;
    # confirm the warmup actually takes effect with this scheduler type.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant"
)

In [194]:
# Trainer: supervised fine-tuning of base_model on the 'Text' column of paper_hf
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=paper_hf,
    peft_config=peft_parameters, # without this arg, we finetune entire base model
    dataset_text_field="Text", # must match the column name used when building the training frame
    tokenizer=llama_tokenizer,
    args=train_params
)



Map:   0%|          | 0/59 [00:00<?, ? examples/s]

In [198]:
# Run the fine-tuning job configured on the trainer.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory
# error, so the saved run did not finish training.
fine_tuning.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 34.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 9.00 MiB is free. Process 2963607 has 21.97 GiB memory in use. Of the allocated memory 21.18 GiB is allocated by PyTorch, and 487.93 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [179]:
# Save the fine-tuned model into `llama-7b-alg` under the current working directory.
# os.path.join builds the path with the platform's separator instead of a
# hard-coded '/'.
path = os.path.join(os.getcwd(), 'llama-7b-alg')
fine_tuning.model.save_pretrained(path)

In [181]:
# Load the weights saved at `path` on top of base_model as a PeftModel
model_peft = PeftModel.from_pretrained(base_model, path)
# Bare last expression: display the wrapped model's module tree
model_peft

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
 

## Inference

Let's ask our model a question and see its response, both from the base model as well as our finetuned model.


In [182]:
# Ask the fine-tuned model a question, wrapping it in the same
# `<s>[INST] ... [/INST]` markers that the training text uses.
query = "For which tasks has Llama-2 already been used successfully?"
prompt = f"<s>[INST] {query} [/INST]"

# Text-generation pipeline around the PEFT model; max_length=200 is passed
# through as the generation length limit.
text_gen = pipeline(
    task="text-generation",
    model=model_peft,
    tokenizer=llama_tokenizer,
    max_length=200,
)
output = text_gen(prompt)
print(output[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

<s>[INST] For which tasks has Llama-2 already been used successfully? [/INST]  Llama-2 has been used successfully for a wide range of natural language processing (NLP) tasks, including but not limited to:
 Unterscheidung von Textsorten: Llama-2 has been used to classify text into different categories, such as news articles, social media posts, and product reviews.

 Sentiment Analysis: Llama-2 has been used to analyze the sentiment of text, such as determining whether a piece of text is positive, negative, or neutral.

 Named Entity Recognition: Llama-2 has been used to identify and classify named entities in text, such as people, organizations, and locations.

 Part-of-Speech Tagging: Llama-2 has been used to assign part-of-speech tags to words in text, such as determining whether
