In [1]:
import transformers
from datasets import load_dataset, Dataset
import pandas as pd
from typing import Dict

Get Question/Answer pairs

## Prepare dataset for Instruction Fine-Tuning
Each training record has the form:
{"prompt": "A user on Stack Overflow asked the following question <QUESTION>, which of the following answers were the accepted answer? <ANSWER 1> <ANSWER 2>, . . ", "response": "Answer 3 was the accepted answer"}

In [2]:
import os
from ACL2024.modules.util.get_root_dir import get_project_root

# Load the question/response table from the project's dataset folder and show it.
df = pd.read_csv(
    os.path.join(
        get_project_root(),
        "modules",
        "dataset",
        "test.csv",
    )
)
df

Unnamed: 0,STACKOVERFLOW_QUESTION:,STACKOVERFLOW_RESPONSE,accepted_index
0,I am about to build a piece of a project that ...,"RESPONSE 0 Personally, I've played with severa...",0
1,I am using the Photoshop's javascript API to f...,RESPONSE 0 open up a terminal (Applications->U...,2
2,I am starting to work on a hobby project with ...,RESPONSE 0 One possibility is Hudson. It's wr...,1
3,I don't remember whether I was dreaming or not...,"RESPONSE 0 No, you were not dreaming. Python ...",6
4,"Django view points to a function, which can be...",RESPONSE 1 If you're simply displaying data fr...,5
...,...,...,...
95,That's it. If you want to document a function ...,"RESPONSE 0 It's easy, you just add a docstring...",1
96,What is the best way to layout a large django ...,RESPONSE 0 This page does a good job of addres...,1
97,"In Python, given a module X and a class Y, how...",RESPONSE 0 Here's one way to do it:\nimport in...,0
98,What would be your preferred way to concatenat...,"RESPONSE 0 my_list = ['a', 'b', 'c', 'd']\nmy_...",0


In [3]:

def gen_prompt(text_input: Dict):
    """Format one question/response record as a ``<human>``/``<assistant>`` prompt.

    Args:
        text_input: Mapping that holds the question under
            ``"STACKOVERFLOW_QUESTION"`` or ``"STACKOVERFLOW_QUESTION:"`` (the
            loaded CSV's header carries the trailing colon) and the answer text
            under ``"STACKOVERFLOW_RESPONSE"``.

    Returns:
        The two-line prompt string with leading/trailing whitespace stripped.

    Raises:
        KeyError: If neither question key, or the response key, is present.
    """
    # The CSV column is named with a trailing colon, so a lookup on the
    # colon-less name alone fails for every row. Prefer the colon-less key when
    # it exists (previous behaviour) and fall back to the header as loaded.
    if "STACKOVERFLOW_QUESTION" in text_input:
        question = text_input["STACKOVERFLOW_QUESTION"]
    else:
        question = text_input["STACKOVERFLOW_QUESTION:"]
    return f"""
    <human>: {question}
    <assistant>: {text_input["STACKOVERFLOW_RESPONSE"]}
    """.strip()

def gen_and_tok_prompt(text_input):
    """Build the prompt for one record and return the tokenizer's output for it.

    Uses the notebook-level ``tokenizer``, so that name has to be defined
    before this function is called.
    """
    return tokenizer(gen_prompt(text_input), padding=True, truncation=True)


# Wrap the pandas frame in a `datasets.Dataset` so it can be mapped over later.
data = Dataset.from_pandas(df=df)

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
# The model weights and the tokenizer are loaded from the same checkpoint name.
model_name = "tiiuae/falcon-7b-instruct"

# NOTE: the recorded output of this cell is a RuntimeError saying quantization
# needs a GPU, so the 8-bit load below requires one to be available.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    load_in_8bit=True,  # if you want to load the 8-bit model
    # device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(model_name)





RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:
# Reuse the end-of-sequence token as the padding token, then run the
# prompt-building/tokenizing function over every record.
tokenizer.pad_token = tokenizer.eos_token
data = data.map(function=gen_and_tok_prompt)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the model through peft's
# `prepare_model_for_kbit_training` and keep working with the returned object.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Walks ``model.named_parameters()`` once and prints, on a single line, the
    trainable element count, the total element count, and the trainable share
    as a percentage.
    """
    # (element count, requires_grad) for every parameter, gathered in one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share}")

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for a causal-LM task, targeting the
# `query_key_value` modules.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapter and report the trainable-parameter share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

Step 5: Fine-tune

In [None]:
# Optimisation, logging and checkpointing options for the run.
training_args = transformers.TrainingArguments(
    output_dir="output_dir",  # give the location where you want to store checkpoints
    num_train_epochs=3,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=25,
    save_strategy="epoch",
    save_total_limit=4,
)

# Language-modeling collator with masked-LM turned off (mlm=False).
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    data_collator=collator,
)

In [None]:
# Disable `use_cache` on the model config before starting the training run.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step 6: Save model

In [None]:
# Save the model to disk. The path below is a placeholder: replace it with a
# real directory before running this cell.
model.save_pretrained('location where you  want the model to be stored')

Step 7: Inference

In [None]:
# Imported here because no earlier cell brings these two names into scope;
# without this the cell fails with NameError.
from peft import PeftConfig, PeftModel

# Placeholder: point this at the directory the adapter was saved to.
adapter_dir = "location where new model is stored"

# Read the saved adapter config; its `base_model_name_or_path` names the
# checkpoint to load the model and tokenizer from.
config = PeftConfig.from_pretrained(adapter_dir)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    # load_in_8bit=True,
    # device_map='auto',
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the saved adapter on top of the freshly loaded model.
model_inf = PeftModel.from_pretrained(model, adapter_dir)


In [None]:
# `torch` is needed for `inference_mode` below and no earlier cell imports it.
import torch

# create your own prompt
prompt = """
    <human>: How can i use BDB Data Science LAB?
    <assistant>:
    """.strip()

# encode the prompt and move the tensors to the model's device
encoding = tokenizer(prompt, return_tensors="pt").to(model.device)

# set the generation configuration params
gen_config = model_inf.generation_config
gen_config.max_new_tokens = 200
# NOTE(review): `temperature` / `top_p` normally only apply when sampling is
# enabled (`do_sample=True`), which is not set here. Confirm whether
# non-sampling decoding is intended.
gen_config.temperature = 0.2
gen_config.top_p = 0.7
gen_config.num_return_sequences = 1
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id

# do the inference through the adapter-wrapped model, i.e. the same object the
# generation config above was taken from
with torch.inference_mode():
    outputs = model_inf.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=gen_config,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))