## Base Model Loading


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
# Hub id of the base checkpoint (a Mistral 7B model) that is fine-tuned below.
base_model = 'davidkim205/komt-mistral-7b-v1'

# Quantization settings: 4-bit NF4 weights, bfloat16 compute dtype,
# double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized checkpoint; the device map sends the whole model to device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=bnb_config,
)

# Training-time switches on the loaded model: KV cache off,
# pretraining_tp forced to 1, gradient checkpointing on.
model_config = model.config
model_config.use_cache = False
model_config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Tokenizer of the same checkpoint: the unk token doubles as the pad token,
# padding goes on the right, and add_eos_token is enabled.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


## Train Dataset 전처리, Prompt 만들기

In [None]:
import pandas as pd
# Read the raw training table from the local data directory.
train_csv_path = "./data/train.csv"
train = pd.read_csv(train_csv_path)

In [None]:
from itertools import product

# Each raw row holds two phrasings of a question (질문_1..질문_2) and five
# reference answers (답변_1..답변_5). Pair every question with every answer,
# so one raw row becomes 2 x 5 = 10 (question, answer) training samples.
question_cols = [f"질문_{x}" for x in range(1, 3)]
answer_cols = [f"답변_{x}" for x in range(1, 6)]

# The column pairs are the same for every row, so build them once up front.
qa_column_pairs = list(product(question_cols, answer_cols))

# Iterate over row records rather than label lookups (`train.at[i, ...]`), so the
# expansion does not rely on `train` carrying a default 0..n-1 index.
raw_records = train[question_cols + answer_cols].to_dict("records")
train_data = [
    [record[q], record[a]]
    for record in raw_records
    for q, a in qa_column_pairs
]

# NOTE: this rebinds `train` to the expanded frame, so the raw columns are gone
# afterwards; re-run the CSV loading cell before re-running this one.
train = pd.DataFrame(train_data, columns=['질문', '답변'])

In [None]:
# Summary statistics (count / unique / top / freq) of the question and answer columns.
train.describe()

Unnamed: 0,질문,답변
count,6440,6440
unique,1286,3219
top,오리지널징크의 장점은 무엇인가요?,누수에 의해 도배지가 젖어 있는 상태가 지속되면 곰팡이가 발생할 수 있습니다.
freq,10,4


In [None]:
import pandas as pd

# Prompt layout for every sample: <instruction sentence> [INST] <question> [/INST] <answer>
# NOTE(review): the instruction sentence sits before the [INST] tag; prompts built at
# inference time need to follow the same layout.
prompt_prefix = "너는 실내 인테리어 회사 상담직원이야 다음 질문에 대답해줘 [INST] "
prompt_separator = " [/INST] "

train['chat_sample'] = prompt_prefix + train['질문'] + prompt_separator + train['답변']


In [None]:
from datasets import Dataset, DatasetDict

# Convert the pandas frame (including the `chat_sample` column) into a
# `datasets.Dataset`, which is what the trainer below receives as train_dataset.
train_dataset = Dataset.from_pandas(train)

## Training with LoRA

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
# Run PEFT's k-bit preparation helper on the 4-bit model before LoRA adapters are attached.
# Note: rebinds `model`, so this cell is meant to run once per freshly loaded model.
model = prepare_model_for_kbit_training(model)

In [None]:
# Print the module tree to inspect layer names and types (the Linear4bit
# projections listed here are the candidates for LoRA target modules).
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
   

In [None]:
import bitsandbytes as bnb
def find_all_linear_names(model):
    """Collect the leaf names of all 4-bit linear layers in ``model``.

    Walks ``model.named_modules()`` and, for every module that is an instance
    of ``bnb.nn.Linear4bit``, keeps the last dot-separated component of its
    qualified name (e.g. ``"model.layers.0.self_attn.q_proj"`` -> ``"q_proj"``).
    The name ``"lm_head"`` is excluded from the result.

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.

    Returns:
        list[str]: Unique leaf module names, sorted alphabetically so the
        result is stable from run to run.
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, cls)
    }
    # Drop the output head once, after the scan, instead of on every iteration.
    lora_module_names.discard('lm_head')
    return sorted(lora_module_names)

In [None]:
# Gather the names of the 4-bit linear layers; they become the LoRA target modules.
modules = find_all_linear_names(model)
print(modules)

['up_proj', 'k_proj', 'v_proj', 'q_proj', 'gate_proj', 'down_proj', 'o_proj']


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling: rank 16 with alpha 16,
# dropout 0.05, bias set to "none", applied to every name collected in `modules`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapters described by `lora_config`.
model = get_peft_model(model, lora_config)

In [None]:
# Report how many parameters are trainable versus the total, plus the trainable share in percent.
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")


Trainable: 41943040 | total: 7283675136 | Percentage: 0.5758%


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
# Free cached GPU memory before the trainer is built.
torch.cuda.empty_cache()

# Hyper-parameters of the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    # --- output / checkpointing / logging ---
    output_dir="./results",
    save_strategy="epoch",
    logging_steps=1,
    # --- run length and batching (8 samples x 2 accumulation steps per device) ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # --- optimizer ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- learning-rate schedule ---
    # NOTE(review): the plain "constant" schedule presumably has no warmup phase, in
    # which case warmup_ratio has no effect; "constant_with_warmup" would apply it — confirm.
    lr_scheduler_type="constant",
    warmup_ratio=0.3,
    # --- mixed precision: both fp16 and bf16 are switched off ---
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer; the training text comes from the "chat_sample" column.
# NOTE(review): `model` was already wrapped by get_peft_model in an earlier cell, so
# passing peft_config again is likely redundant — confirm against the installed trl version.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="chat_sample",
    max_seq_length=None,
    packing=False,
    peft_config=lora_config,
)



Map:   0%|          | 0/6440 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Start the fine-tuning run with the arguments configured above (one epoch, logging every step).
trainer.train()

In [None]:
# Save the trained model held by the trainer into the local directory named by `new_model`.
new_model = 'beta_1'
trainer.model.save_pretrained(new_model)

In [None]:
# Upload the model to the Hugging Face Hub as a public repository.
# NOTE(review): the recorded output below reports a commit to 'jjookim/qa_beta_2', not
# 'jjookim/beta_1', so it appears to come from a different run — re-run to refresh it.
model.push_to_hub('jjookim/beta_1', private=False)

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jjookim/qa_beta_2/commit/c3a762346517e3fc14e21d2ab45d6c496fd9081f', commit_message='Upload model', commit_description='', oid='c3a762346517e3fc14e21d2ab45d6c496fd9081f', pr_url=None, pr_revision=None, pr_num=None)