In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
     

from peft import PeftModel, PeftConfig

In [4]:
peft_model_id = "beomi/qlora-koalpaca-polyglot-12.8b-50step"
config = PeftConfig.from_pretrained(peft_model_id)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, quantization_config=bnb_config, device_map={"":0})
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model.eval()
     

Loading checkpoint shards:   0%|          | 0/28 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(30080, 5120)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-39): 40 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((5120,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (rotary_emb): GPTNeoXRotaryEmbedding()
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=5120, out_features=15360, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                

In [5]:
def gen(x):
    q = f"### 질문: {x}\n\n### 답변:"
    # print(q)
    gened = model.generate(
        **tokenizer(
            q, 
            return_tensors='pt', 
            return_token_type_ids=False
        ).to('cuda'), 
        max_new_tokens=50,
        early_stopping=True,
        do_sample=True,
        eos_token_id=2,
    )
    print(tokenizer.decode(gened[0]))
     

In [6]:
gen('노하림 알아?')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### 질문: 노하림 알아?

### 답변: 노하림은 가수도 유명하지만 탤런트의 배우를 맡는것으로도 유명합니다.<|endoftext|>


# 1. 나만의 데이터셋 만들기

In [7]:
# 행과열을 핸들링하는 라이브러리
import pandas as pd
# json 포맷 데이터를 핸들링하는 라이브러리
import json
import jsonlines
import jsonlines
# hugging face 데이터 관리 라이브러리
from datasets import Dataset

In [8]:
dataPath = "./dataset/"

In [9]:
datasetName = "data.CSV"
jsonFileName = "data.jsonl"

In [11]:
def csv_to_json(csv_file_path, json_file_path):
    df = pd.read_csv(csv_file_path,encoding='cp949')

    # JSON 파일로 저장
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        # 각 행을 JSON으로 변환하여 바로 파일에 쓰기
        for index, row in df.iterrows():
            data = {'instruction': row['instruction'], 'output': row['output']}
            json.dump(data, json_file, ensure_ascii=False)
            json_file.write('\n')  # 각 행마다 줄바꿈

# CSV 파일 경로와 JSON 파일 경로 설정
csv_file_path = dataPath + datasetName
json_file_path = dataPath + jsonFileName

# 함수 호출
csv_to_json(csv_file_path, json_file_path)


In [12]:
import jsonlines
from datasets import Dataset

# JSON Lines 파일 경로 설정
json_file_path = './dataset/data.jsonl'
dataPath = './dataset/'

# 데이터를 로드하고 저장할 두 리스트 초기화
instructions = []
outputs = []

# JSON Lines 파일을 읽고 각 데이터 포인트를 리스트에 저장
with jsonlines.open(json_file_path) as f:
    for line in f.iter():
        instructions.append(line["instruction"])
        outputs.append(line["output"])

# 데이터셋 생성 및 저장
dataset = Dataset.from_dict({
    "instruction": instructions,
    "output": outputs
})
dataset.save_to_disk(dataPath)


# 데이터셋 확인
print('데이터셋 확인')
print(dataset[:5])



Saving the dataset (0/1 shards):   0%|          | 0/6 [00:00<?, ? examples/s]

데이터셋 확인
{'instruction': ['노하림은 몇살이야?', '노하림은 어디살아?', '노하림은 무슨 학교다녀?', '남서현은 몇살이야?', '남서현은 어디살아?'], 'output': ['노하림은 23살입니다.', '노하림은 서울 창동에 살아요.', '노하림은 덕성여대 컴퓨터공학과야.', '남서현은 24살이다.', '남서현은 강원도 원주 살고 지금은 기숙사 살아']}


In [13]:
dataset.push_to_hub("ho1iday/harimKo")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/302 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ho1iday/harimKo/commit/0fcc3e0aabeb63a6e91222f443826552866c3c26', commit_message='Upload dataset', commit_description='', oid='0fcc3e0aabeb63a6e91222f443826552866c3c26', pr_url=None, pr_revision=None, pr_num=None)

# 2. 데이터셋 로드 하기

In [14]:
from datasets import load_dataset

data = load_dataset("ho1iday/harimKo")

Downloading readme:   0%|          | 0.00/302 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6 [00:00<?, ? examples/s]

In [15]:
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 6
    })
})

In [16]:
data = data.map(
    lambda x: {'text': f"### 질문: {x['instruction']}\n\n### 답변: {x['output']}<|endoftext|>" }
)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [17]:
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [18]:
data['train'][0]['text']

'### 질문: 노하림은 몇살이야?\n\n### 답변: 노하림은 23살입니다.<|endoftext|>'

In [19]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [20]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [21]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 6553600 || all params: 6608701440 || trainable%: 0.09916622894073424


# 3. 학습하기

In [22]:
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        max_steps=5, ## 초소량만 학습: 50 step만 학습. 약 4분정도 걸립니다.
        learning_rate=1e-4,
        fp16=True,
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/kmlee/.netrc




Step,Training Loss


TrainOutput(global_step=5, training_loss=4.379891967773437, metrics={'train_runtime': 26.8924, 'train_samples_per_second': 0.744, 'train_steps_per_second': 0.186, 'total_flos': 40226842583040.0, 'train_loss': 4.379891967773437, 'epoch': 2.5})

# 평가

In [25]:
model.eval()
model.config.use_cache 

False

In [28]:
def gen(x):
    gened = model.generate(
        **tokenizer(
            f"### 질문: {x}\n\n### 답변:", 
            return_tensors='pt', 
            return_token_type_ids=False
        ), 
        max_new_tokens=50,
        early_stopping=True,
        do_sample=True,
        eos_token_id=2,
    )
    print(tokenizer.decode(gened[0]))

In [29]:
gen('노하림은 몇살이야?')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### 질문: 노하림은 몇살이야?

### 답변: 노세연은 5살이에ㅋㅋㅋ이를 속이려고 나이를 안 말햇다 ㅋㅋ ㅋㅋㅋ 세연이 나이 속인거는 노하림이랑 노세연이 5살
