## Import

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1,2' # nvidia-smi로 비어있는 gpu 확인하고 여기서 선택할것!

import pandas as pd
import numpy as np
import torch
import transformers
import bitsandbytes as bnb



from transformers import AutoTokenizer, AdamW, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model,PeftModel
from tqdm import tqdm

#os.environ["TOKENIZERS_PARALLELISM"] = "true" 
#torch.backends.cuda.matmul.allow_tf32=True
#torch.set_float32_matmul_precision('medium')
#torch.backends.cudnn.benchmark = True

## Data Preprocessing

In [2]:
# 데이터 로드
model_id = "LDCC/LDCC-SOLAR-10.7B"
tokenizer = AutoTokenizer.from_pretrained(model_id,  eos_token='</s>')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # 패딩을 문장 뒤에 추가
max_length = 1024
dataset = load_dataset("jojo0217/korean_safe_conversation",split="train")


formatted_data = []
for row in dataset:
    input_text = f'''### User:\n{row['instruction']}\n\n### Assistant:\n{row['output']}'''
    input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    formatted_data.append(input_ids)
print('Done.')


Done.


In [3]:
formatted_data = torch.cat(formatted_data, dim=0)

## Model Fine-tuning

In [4]:
# 모델 로드



bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")


model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    #target_modules=["query_key_value"], 
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=formatted_data,
    args=transformers.TrainingArguments(
        num_train_epochs=2,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
      #  max_steps=50,
        learning_rate=1e-4,
        fp16=True,
        logging_steps=1000,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False 
trainer.train()

Loading checkpoint shards:   0%|          | 0/23 [00:00<?, ?it/s]

trainable params: 31457280 || all params: 5645930496 || trainable%: 0.5571673264891711


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
1000,1.7326
2000,1.6979
3000,1.6936
4000,1.7057
5000,1.6938
6000,1.7039
7000,1.6981
8000,1.6844
9000,1.6829
10000,1.6721




TrainOutput(global_step=26980, training_loss=1.5341458172865317, metrics={'train_runtime': 105690.4626, 'train_samples_per_second': 0.511, 'train_steps_per_second': 0.255, 'total_flos': 3.5441356411911537e+18, 'train_loss': 1.5341458172865317, 'epoch': 2.0})

In [5]:
new_model = "solar-ft"
trainer.model.save_pretrained(new_model)
# trainer.tokenizer.save_pretrained(new_model)

AttributeError: 'NoneType' object has no attribute 'save_pretrained'

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             #torch_dtype=torch.float32,
                                             )
#model = PeftModel.from_pretrained(model, new_model,device_map="auto")

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/23 [00:00<?, ?it/s]

In [12]:
formatted_data = []
qs = []
gts = []
for row in dataset:
    input_text = f'''### User:\n{row['instruction']}\n\n### Assistant:\n'''
    input_ids = tokenizer.encode(input_text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    formatted_data.append(input_ids)
    qs.append(row['instruction'])
    gts.append(row['output'])
print('Done.')

Done.


In [None]:
with open('bigdata/out.txt', 'w', encoding='utf-8') as file:
    pass


terminators = [
    tokenizer.eos_token_id,
    #tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

for idx in range(len(formatted_data)):
    
    print(f"{idx}/{len(formatted_data)}")
    with torch.no_grad():

        outputs = model.generate(
            formatted_data[idx],
            max_new_tokens=512,
            eos_token_id=terminators,
        #     do_sample=True,
        #     temperature=1,
        #     top_p=0.9,
        )
        full_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
        answer_start = full_text.find("\n\n### Assistant:") + len("\n\n### Assistant:")
        answer_only = full_text[answer_start:].strip()
        dic = {"question":qs[idx],"answer":answer_only,"goldanswer":gts[idx]}
        dic_str = '\n'.join([f"{key}: {value}" for key, value in dic.items()])
        with open('bigdata/out.txt', 'a', encoding='utf-8') as file:
            file.write(dic_str)

0/26979




1/26979
2/26979
3/26979
4/26979
5/26979
6/26979
7/26979
8/26979
9/26979
10/26979
11/26979
12/26979
13/26979
14/26979
15/26979
16/26979
17/26979
18/26979
19/26979
20/26979
21/26979
22/26979
23/26979
24/26979
25/26979
26/26979
27/26979
28/26979
29/26979
30/26979
31/26979
32/26979
33/26979
34/26979
35/26979
36/26979
37/26979
38/26979
39/26979
40/26979
41/26979
42/26979
43/26979
44/26979
45/26979
46/26979
47/26979
48/26979
49/26979
50/26979
51/26979
52/26979
53/26979
54/26979
55/26979
56/26979
57/26979
58/26979
59/26979
60/26979
61/26979
62/26979
63/26979
64/26979
65/26979
66/26979
67/26979
68/26979
69/26979
70/26979
71/26979
72/26979
73/26979
74/26979
75/26979
76/26979
77/26979
78/26979
79/26979
80/26979
81/26979
82/26979
83/26979
84/26979
85/26979
86/26979
87/26979
88/26979
89/26979
90/26979
91/26979
92/26979
93/26979
94/26979
95/26979
96/26979
97/26979
98/26979
99/26979
100/26979
101/26979
102/26979
103/26979
104/26979
105/26979
106/26979
107/26979
108/26979
109/26979
110/26979
111/2697