In [84]:
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [162]:
import xml.etree.ElementTree as ET
import urllib.request
import pandas as pd
import re
import os, torch, logging
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM, TrainingArguments, pipeline, HfArgumentParser
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from datasets import Dataset
import os

In [163]:
def fetch_papers(url=None, timeout=30):
    """Fetch paper titles and abstracts from the arXiv Atom API.

    Args:
        url: Full arXiv API query URL. When None, the notebook's original
            query is used (title contains "llama", first 70 results).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of strings, one per paper. Each string is a "Title: ..." line
        followed by a "Summary: ..." line, each terminated by a newline.
        Entries that lack a title or a summary are skipped.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    if url is None:
        url = 'http://export.arxiv.org/api/query?search_query=ti:llama&start=0&max_results=70'
    atom_ns = '{http://www.w3.org/2005/Atom}'

    # The context manager closes the HTTP connection even if reading fails,
    # and the timeout keeps the cell from hanging on a stalled request.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        data = response.read().decode('utf-8')
    root = ET.fromstring(data)

    papers_list = []
    for entry in root.findall(f'{atom_ns}entry'):
        # findtext returns None instead of raising when an element is absent.
        title = entry.findtext(f'{atom_ns}title')
        summary = entry.findtext(f'{atom_ns}summary')
        if not title or not summary:
            # An incomplete entry is useless as a training example.
            continue
        papers_list.append(f"Title: {title}\nSummary: {summary}\n")

    return papers_list

In [164]:
# Prefer live arXiv data; fall back to the local snapshot in df.csv when the
# request or the XML parsing fails (for example, when there is no network).
try:
    paper_list = fetch_papers()
except Exception as fetch_error:
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate, so the cell can still be interrupted.
    logging.warning("arXiv fetch failed (%r); loading papers from df.csv instead", fetch_error)
    fallback_df = pd.read_csv('df.csv')
    # The paper texts are taken from the second column of the CSV.
    paper_list = list(fallback_df.iloc[:, 1])

In [165]:
def clean_text(text):
    """Turn one "Title: ... Summary: ..." record into a Llama-2 chat sample.

    The leading "Title:" marker becomes "<s>[INST]" and the leading "Summary:"
    marker becomes "[/INST]". URLs, @-mentions and caret-prefixed tokens
    (e.g. "^2") are removed, all whitespace runs are collapsed to a single
    space, and " </s>" is appended.

    Args:
        text: Raw record as a string.

    Returns:
        The cleaned single-line string ending in " </s>".
    """
    # Only the first marker of each kind is structural; later occurrences
    # inside an abstract are ordinary text and must not become chat tags.
    text = re.sub(r'Title:', '<s>[INST]', text, count=1)
    text = re.sub(r'Summary:', '[/INST]', text, count=1)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'@[^\s]+', "", text)
    text = re.sub(r'\^\S+', "", text)
    # Collapse whitespace last so the removals above leave no double spaces,
    # and so paragraph breaks become a space rather than gluing words together.
    text = re.sub(r'\s+', " ", text).strip()
    return text + ' </s>'

In [166]:
# Replace every raw record with its cleaned, chat-formatted version in place.
for idx, raw_paper in enumerate(paper_list):
    paper_list[idx] = clean_text(raw_paper)

In [167]:
# Wrap the cleaned samples in a one-column DataFrame; the column name 'Text'
# is the one later passed to the trainer as dataset_text_field.
paper_df = pd.DataFrame({'Text':paper_list})

In [168]:
# Hugging Face Hub identifier of the pre-trained model; the tokenizer and the
# base model weights are both loaded from this name.
base_model_name = "NousResearch/Llama-2-7b-chat-hf"

In [169]:
# Download the tokenizer that matches the base checkpoint from the Hugging Face Hub.
# trust_remote_code is deliberately left at its default (off): it would let
# code hosted in the model repository run locally, which this load does not need.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Reuse the end-of-sequence token for padding and pad on the right.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"

In [170]:
# Quantization config: load the base model weights in 4-bit.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # Select NF4 quantization. The keyword is `bnb_4bit_quant_type`; the
    # earlier spelling `bnb_4bit_quant_dtype` is not a BitsAndBytesConfig
    # parameter, so it did not select NF4.
    bnb_4bit_quant_type="nf4",
    # dtype used for the forward / backward computation.
    bnb_4bit_compute_dtype=torch.bfloat16,
    # If True, a second quantization saves roughly an additional 0.4 bits per parameter.
    bnb_4bit_use_double_quant=False,
)

In [None]:
# Load the quantized base model from safetensors weights.
# device_map {"": 0} targets device index 0, so this cell fails without a GPU.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    use_safetensors=True,
    device_map={"": 0},
    quantization_config=quant_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# pretraining_tp: a value other than 1 gives a more accurate but slower
# computation of the linear layers, so it is kept at 1 here.
base_model.config.pretraining_tp = 1

# Switch off the use_cache flag on the model config.
base_model.config.use_cache = False

In [None]:
# LoRA adapter configuration.
# r and lora_alpha are starting values; experiment with different ones.
# An alternative target-module list ("dense", "dense_h_to_4h", "dense_4h_to_h")
# was suggested in the original notes "for maximum performance".
# NOTE(review): confirm those module names exist in this model before using them.
peft_parameters = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj"],
)

In [None]:
# Training hyper-parameters, grouped by concern.
train_params = TrainingArguments(
    # Where checkpoints go, and how often to save / log (in steps).
    output_dir="./results_modified",
    save_steps=25,
    logging_steps=25,
    # Schedule length and batching.
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Mixed-precision training is disabled for both half-precision formats.
    fp16=False,
    bf16=False,
)

In [None]:
# Convert the pandas DataFrame into a Hugging Face Dataset for the trainer.
paper_hf = Dataset.from_pandas(paper_df)

In [None]:
# Supervised fine-tuning trainer over the 'Text' column of the paper dataset.
# Passing peft_config trains LoRA adapters; without it the entire base model
# would be fine-tuned.
fine_tuning = SFTTrainer(
    model=base_model,
    tokenizer=llama_tokenizer,
    peft_config=peft_parameters,
    train_dataset=paper_hf,
    dataset_text_field="Text",
    args=train_params,
)

In [None]:
# Generate an answer before training, as a baseline for later comparison.
query = "For which tasks has Llama-2 already been used successfully?"
prompt = f"<s>[INST] {query} [/INST]"
text_gen = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=llama_tokenizer,
    max_length=200,
)
output = text_gen(prompt)
first_result = output[0]
print(first_result['generated_text'])

In [None]:
# Run the fine-tuning loop with the trainer configured above.
fine_tuning.train()

In [None]:
# Directory under the current working directory where the adapter is saved.
# os.path.join builds the path portably and avoids a doubled separator when
# the working directory already ends with one (e.g. the filesystem root).
path = os.path.join(os.getcwd(), 'llama-7b-alg')

In [None]:
# Persist the trainer's model to `path` so it can be reloaded later.
fine_tuning.model.save_pretrained(path)

In [None]:
# Reload the saved adapter from `path` on top of base_model.
# NOTE(review): base_model is the same object that was handed to SFTTrainer
# together with a peft_config; confirm it has not already been modified with
# adapter layers before loading the saved adapter onto it again.
model_peft = PeftModel.from_pretrained(base_model, path)
# The bare expression displays the model structure as the cell output.
model_peft

In [None]:
# Ask the same question again, this time through the adapter-loaded model.
query = "For which tasks has Llama-2 already been used successfully?"
prompt = f"<s>[INST] {query} [/INST]"
text_gen = pipeline(
    task="text-generation",
    model=model_peft,
    tokenizer=llama_tokenizer,
    max_length=200,
)
output = text_gen(prompt)
first_result = output[0]
print(first_result['generated_text'])