In [None]:
!pip install torch==2.0.1
!pip install transformers @ git+https://github.com/huggingface/transformers@de9255de27abfcae4a1f816b904915f0b1e23cd9
!pip install tokenizers==0.13.3
!pip install peft
!pip install jsonargparse[signatures]  # CLI
!pip install bitsandbytes==0.39.1 # quantize
!pip install accelerate @ git+https://github.com/huggingface/accelerate@e0f5e030098aada5e112708eee3537475dea3a83
!pip install datasets==2.13.1  # quantize/gptq.py
!pip install zstandard==0.19.0  # prepare_redpajama.py
!pip install scipy
!pip install loralib==0.1.1
!pip install einops==0.6.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint name shared by the model and tokenizer loads below.
base_checkpoint = "tiiuae/falcon-7b-instruct"

# Load the causal LM with 4-bit loading switched on; trust_remote_code allows the
# checkpoint's own modelling code to be executed.
model = AutoModelForCausalLM.from_pretrained(
    base_checkpoint, load_in_4bit=True, trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

In [None]:
import torch
# Release cached, currently unused GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

In [None]:
# Generation settings, edited in place on the model's own GenerationConfig object.
gen_config = model.generation_config
gen_config.max_new_tokens = 200
# Greedy decoding is requested explicitly rather than via `temperature = 0.0`:
# temperature must be strictly positive and is only consulted when sampling is on.
gen_config.do_sample = False
gen_config.num_return_sequences = 1
# Pad with the end-of-sequence token and stop generation on it.
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id

In [None]:
# prompt = 'a'

# encoding = tokenizer(prompt, return_tensors= "pt").to(model.device)

In [None]:
# Execute for testing purposes
# import torch

# with torch.inference_mode():
#    outputs = model.generate(input_ids = encoding.input_ids, attention_mask = encoding.attention_mask,generation_config = gen_config )
# print(tokenizer.decode(outputs[0], skip_special_tokens = True))

In [None]:
import transformers
from datasets import load_dataset, Dataset
import pandas as pd


def gen_prompt(text_input):
    """Render one question/answer record as a two-line chat-style prompt.

    Args:
        text_input: Mapping with "question" and "answer" entries.

    Returns:
        The prompt text with leading and trailing whitespace stripped. The
        second line keeps its four-space indent because that indent is part
        of the template itself.
    """
    template = "\n    <human>: {question}\n    <assistant>: {answer}\n    "
    rendered = template.format(
        question=text_input["question"],
        answer=text_input["answer"],
    )
    return rendered.strip()

def gen_and_tok_prompt(text_input):
    """Build the prompt for one record and tokenize it.

    Args:
        text_input: Record passed straight through to `gen_prompt`.

    Returns:
        Whatever the notebook-level `tokenizer` returns for the prompt when
        called with padding and truncation turned on.
    """
    return tokenizer(gen_prompt(text_input), padding=True, truncation=True)

# Small in-notebook table of question/answer pairs.
X = dict(
    question=['a', 'c', 'e'],
    answer=['b', 'd', 'f'],
)

df = pd.DataFrame(X)

# Only the two prompt columns are carried over into the Hugging Face dataset.
data = Dataset.from_pandas(df.loc[:, ['question', 'answer']])

In [None]:
# Use the end-of-sequence token as the padding token; `gen_and_tok_prompt`
# calls the tokenizer with padding=True.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize every record by mapping `gen_and_tok_prompt` over the dataset
# (no batched=True is passed to map).
data = data.map(gen_and_tok_prompt)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the 4-bit-loaded model through
# PEFT's k-bit training preparation helper and keep the returned model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()``; each yielded parameter
            must provide ``numel()`` and a ``requires_grad`` flag.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model with no parameters reports 0.0% instead of
    raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so a parameterless model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Release cached, currently unused GPU memory before the LoRA adapters are added.
torch.cuda.empty_cache()

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-16 updates with alpha=32 and 5% dropout, attached to
# the modules named "query_key_value"; no bias parameters are trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
training_args = transformers.TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir="output_dir", # give the location where you want to store checkpoints
    save_strategy='epoch',
    save_total_limit=4,
    logging_steps=25,
    # Length of the run and effective batch size.
    num_train_epochs=20,
    gradient_accumulation_steps=4,
    # Optimizer, precision and learning-rate schedule.
    optim="paged_adamw_8bit",
    fp16=True,
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
)

# mlm=False selects the causal (non-masked) language-modelling collator.
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=lm_collator,
)

In [None]:
# `use_cache` is switched off on the model config before the training run starts;
# set it back to True before using this model object for inference.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Write the PEFT-wrapped model to disk; the reload cell further down reads the
# adapter config and weights back from this same path.
model.save_pretrained('output_dir/llm_concrete_model')

In [None]:
# Release cached, currently unused GPU memory before the model is loaded again for inference.
torch.cuda.empty_cache()

In [None]:
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory the fine-tuned adapter was saved to.
adapter_dir = 'output_dir/llm_concrete_model'

# The saved adapter config names the base checkpoint to load underneath it.
config = PeftConfig.from_pretrained(adapter_dir)
base_name = config.base_model_name_or_path

model = AutoModelForCausalLM.from_pretrained(
    base_name,
    load_in_4bit=True,
#     device_map='auto',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_name)

# Attach the saved adapter to the freshly loaded base model.
model_inf = PeftModel.from_pretrained(model, adapter_dir)


In [None]:
import torch

# Same two-line template that `gen_prompt` produces (including the indent before
# <assistant>), with the answer left empty for the model to complete.
prompt = """
    <human>: a
    <assistant>:
    """.strip()

# encode the prompt and move the tensors to the adapter-wrapped model's device
encoding = tokenizer(prompt, return_tensors="pt").to(model_inf.device)

gen_config = model_inf.generation_config
gen_config.max_new_tokens = 200
# Greedy decoding is requested explicitly rather than via `temperature = 0.0`:
# temperature must be strictly positive and is only consulted when sampling is on.
gen_config.do_sample = False
gen_config.num_return_sequences = 1
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id

# do the inference through the adapter-wrapped model (`model_inf`), the same
# object whose generation config was configured above
with torch.inference_mode():
    outputs = model_inf.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=gen_config,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens = True))