In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
%cd /content/drive/MyDrive/linq

/content/drive/MyDrive/linq


## Korean NER finetuning to LLAMA

In [2]:
!pip install -r requirements.txt
!pip install jsonlines
!pip install datasets
# !pip install torch
!pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [13]:
!pip install jsonlines
!pip install accelerate



In [None]:
!mkdir datasets
!mkdir datasets/NER

In [None]:
!git clone https://github.com/kmounlp/NER.git datasets/NER

In [None]:
# get all of .txt files in NER folder
import os
import glob
import shutil

# get all of .txt files in NER folder (recursively)
txt_files = glob.glob('datasets/NER/**/*_NER.txt', recursive=True)


In [None]:
# sample data
!head datasets/NER/00008_NER.txt

In [None]:
import Korpora

# load ner dataset
from Korpora import Korpora
ner = Korpora.load('naver_changwon_ner')

In [None]:
ner.get_all_words_and_tags()

매핑 규칙은 Spacy의 태그규칙을 최대한 따랐으며, 없는 태그는 원본태그들을 따르도록 하였습니다.

1. PER (PERSON) -> PERSON
2. FLD (FIELD) -> ORG
3. AFW (ARTIFACTS_WORKS) -> PRODUCT
4. ORG (ORGANIZATION) -> ORG  
5. LOC (LOCATION) -> GPE 또는 FAC
   - 지역명칭, 행정구역 명칭 등 -> GPE
   - 건물, 공항, 교량 등 물리적 시설 -> FAC
6. CVL (CIVILIZATION) -> WORK_OF_ART
7. DAT (DATE) -> DATE
8. TIM (TIME) -> TIME
9. NUM (NUMBER) -> CARDINAL (기본값), 필요시 ORDINAL, QUANTITY, PERCENT, MONEY 중 선택
10. EVT (EVENT) -> EVENT
11. ANM (ANIMAL) -> ANM (원본 유지)
12. PLT (PLANT) -> PLT (원본 유지)
13. MAT (MATERIAL) -> PRODUCT
14. TRM (TERM) -> TRM (원본 유지)
15. CORP (Corporations and businesses) -> ORG
16. GRP (All other groups) -> NORP
17. PROD (Consumer products) -> PRODUCT
18. CW (Titles of creative works) -> WORK_OF_ART

위의 매핑 규칙을 요약하면 다음과 같습니다:

- PER, ORG, DATE, TIME, EVENT는 그대로 해당 태그로 매핑
- FLD, CORP는 ORG로 매핑
- AFW, PROD, MAT는 PRODUCT로 매핑
- LOC은 GPE 또는 FAC로 매핑 (문맥에 따라 선택)
- CVL, GRP는 NORP로 매핑
- NUM은 기본적으로 CARDINAL로 두되, 문맥에 따라 ORDINAL, QUANTITY, PERCENT, MONEY 중 선택
- CW는 WORK_OF_ART로 매핑
- ANM, PLT, TRM은 원본 태그 유지

이 매핑 규칙에 따라 원본 코퍼스의 태그를 변환하면, NER 학습에 사용할 수 있는 형식의 데이터를 얻을 수 있습니다.


In [None]:
## load mapper
import json

with open('config/mapping.json') as f:
    mapper = json.load(f)
    tag_mapper = mapper['tag_mapping']

In [None]:
# preprocess dataset to tagged sentences
# ex.) WordTag(text='이 음경동맥의 직경이 8 19mm입니다 . ', words=['이', '음경동맥의', '직경이', '8', '19mm입니다', '.'], tags=['-', '-', '-', 'NUM_B', 'NUM_B', '-'])
# '이 음경동맥의 직경이 <819mm입니다:NUM>.'
# ㄴ NUM_B: 8, + NUM_I: 19mm입니다 = NUM: 819mm입니다

from typing import List, Tuple
import re

# save preprocessed dataset
with open('datasets/naver_changwon_ner.txt', 'w') as f:
    for i in range(len(ner.train)):
        result = []
        for w, tag in zip(ner.train[i].words, ner.train[i].tags):
            if tag[-2:] == '_B':
                tag = tag_mapper[tag[:-2]]
                chunk = '<' +tag+ ":" +  w + '>'
            elif tag[-2:] == '_I':
                try:
                    chunk = result.pop(-1)[:-1]
                except:
                    tag = tag_mapper[tag[:-2]]
                    chunk = '<' +tag+ ":"
                chunk += w + '>'
            else:
                continue
            result.append(chunk)
        f.write("\n### Text:\n" + ner.train[i].text + '\n' + "### Tags: \n" + '\n'.join(result) + '\n')

In [None]:
!mkdir datasets/multiconer2022/
!aws s3 cp --no-sign-request s3://multiconer/multiconer2022/KO-Korean/ datasets/multiconer2022/ --recursive

## Process MultiCoNer

In [None]:
# sample data
!head datasets/multiconer2022/ko_dev.conll

In [None]:
# preprocess dataset to tagged sentences

### example data
# # id b02dfd6c-fa8e-444d-8988-a7e201cd6541	domain=dev
# 김규식 _ _ B-PER
# 그의 _ _ O
# 귀국으로 _ _ O
# 이승만 _ _ O
# , _ _ O
# 김구 _ _ B-PER
# 독주가 _ _ O
# 견제되기를 _ _ O
# 희망하였다 _ _ O
# . _ _ O
#
# # id 5c37bcc2-7ab8-47e2-b90f-be9d03308a5f	domain=dev
# 국립진주박물관은 _ _ O
# 1984년 _ _ O
# 11월 _ _ O
# 2일 _ _ O
# ...

### processed
#
### Text:
### 김규식 그의 귀국으로 이승만, 김구 독주가 견제되기를 희망하였다
### Tags:
### <PER:김규식>
### <PER:김구>

from typing import List, Tuple

# save preprocessed dataset
with open('datasets/multiconer2022/ko_train.txt', 'w') as f:
    with open('datasets/multiconer2022/ko_train.conll', 'r') as f2:
        text = ''
        tags = []
        for line in f2:
            if line == '\n':
                result = []
                for w, tag in zip(text.split(), tags):
                    if tag == 'O':
                        continue
                    elif tag[:2] == 'I-':
                        chunk = result.pop(-1)[:-1]
                        chunk += w + '>'
                        result.append(chunk)
                    else:
                        try:
                            tag = mapper['tag_mapping'][tag[2:]]
                        except:
                            continue
                        chunk = '<' +tag+ ":" +  w + '>'
                        result.append(chunk)

                f.write("\n### Text:\n" + text[2:] + '\n' + "### Tags: \n" + '\n'.join(result) + '\n')
                text = ''
                tags = []
            else:
                text += line.split()[0] + ' '
                tags.append(line.split()[-1])

### Process NAVER NER dataset


In [None]:
!head datasets/NAVER/*.txt

In [None]:
## read all of .txt files in datasets directory
import glob
import os

txt_files = glob.glob('datasets/NER/*.txt')

# preprocess dataset to tagged sentences
from typing import List, Tuple

# save preprocessed dataset
with open('datasets/naver_ner.txt', 'w') as f:
    for file in txt_files:
        with open(file, 'r') as f2:
            text = []
            tags = []
            for line in f2:
                if line == '\n':
                    result = []
                    for w, tag in zip(text, tags):
                        if tag == 'O':
                            continue
                        elif tag[:2] == 'I-':
                            try:
                                chunk = result.pop(-1)[:-1]
                            except:
                                tag = tag_mapper[tag[2:]]
                                chunk = '<' +tag+ ":"
                            chunk += w + '>'
                            result.append(chunk)
                        else:
                            tag = mapper['tag_mapping'][tag[2:]]
                            chunk = '<' +tag+ ":" +  w + '>'
                            result.append(chunk)

                    result = set(result)
                    f.write("\n### Text:\n" + ''.join(text) + '\n' + "### Tags: \n" + '\n'.join(list(result)) + '\n')
                    text = []
                    tags = []
                else:
                    word = line.split()[0]
                    tag = line.split()[-1]
                    if word == "##":
                        continue
                    if word == '_':
                        word = ' '
                    text.append(word)
                    tags.append(line.split()[-1])

In [None]:
### load dataset and split train, test

import random
from typing import List, Tuple


def load_dataset(file_path: str) -> List[Tuple[str, str]]:
    with open(file_path, 'r') as f:
        lines = f.readlines()

    dataset = []
    text = ''
    tags = ''
    ### dataset format
    # ### Text:
    # 강원지역 1위 김화고는 휴전선 접경 지역이라는 열악한 환경에서 학생들이 사교육을 받지 않아도 되도록 교사들이 열정을 쏟았다.
    # ### Tags:
    # <QUANTITY:1위>
    # <GPE:강원>
    # <ORG:김화고>
    #
    # ### Text:
    # 현재 BAT의 미국 점유율은 5.3%에 불과하다.
    # ### Tags:
    # <PERCENT:5.3%>
    # <ORG:BAT>
    # <GPE:미국>
    # ...

    cur_data_form = 'text'
    for line in lines:
        if line.startswith('### Text:'):
            cur_data_form = 'text'
        elif line.startswith('### Tags:'):
            cur_data_form = 'tags'
        else:
            if cur_data_form == 'text':
                text += line
            elif cur_data_form == 'tags':
                tags += line
                if line == '\n':
                    dataset.append((text, tags))
                    text = ''
                    tags = ''

    return dataset


def split_dataset(dataset: List[Tuple[str, str]], ratio: float = 0.8) -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
    random.shuffle(dataset)

    train_size = int(len(dataset) * ratio)
    train_dataset = dataset[:train_size]
    test_dataset = dataset[train_size:]

    return train_dataset, test_dataset


# load 3 datasets and merge them
naver_changwon_ner = load_dataset('datasets/naver_changwon_ner.txt')
multiconer2022 = load_dataset('datasets/multiconer2022/ko_train.txt')
naver_ner = load_dataset('datasets/naver_ner.txt')

# merge 3 datasets
dataset = naver_changwon_ner + multiconer2022 + naver_ner

# split dataset
train_dataset, test_dataset = split_dataset(dataset)


# save train, test datasets to jsonl form
import jsonlines
import json

with open('datasets/train.jsonl', 'w') as f:
    for i in range(len(train_dataset)):
        if train_dataset[i][0] == '':
            continue
        train_dataset[i] = {'text': train_dataset[i][0].strip(), 'tags': [t for t in train_dataset[i][1].split('\n') if t!='']}
        f.write(json.dumps(train_dataset[i], ensure_ascii=False) + '\n')



with open('datasets/test.jsonl', 'w') as f:
    for i in range(len(test_dataset)):
        if test_dataset[i][0] == '':
            continue
        test_dataset[i] = {'text': test_dataset[i][0].strip(), 'tags': [t for t in test_dataset[i][1].split('\n') if t!='']}
        f.write(json.dumps(test_dataset[i], ensure_ascii=False) + '\n')


## Training Model (LLAMA 7B)

- Training data:
-- MultiCoNer (NAVER NER dataset)
-- NAVER NER dataset
-- Changwon NER dataset
- Model: LLAMA 7B (with PEFT)

In [2]:
## training script
from cgi import test
import os
import jsonlines
import argparse
from typing import List, Tuple
from collections import Counter


from datasets import Dataset

# load json datasets
train_dataset = Dataset.from_json('datasets/train.jsonl')
test_dataset = Dataset.from_json('datasets/test.jsonl')

instruction = """다음 아래 문장에서 개체명을 추출하려 합니다. 개체명은 아래 규칙(spacy를 따름)으로 태깅시켜 주세요.
- PER: Person
- ORG: Organization
- GPE: Geopolitical Entity
- LOC: Location
- FAC: Facility
- QUANTITY: Quantity
- ORDINAL: Ordinal
- CARDINAL: Cardinal
- DATE: Date
- TIME: Time
- MONEY: Money
- PERCENT: Percent
- PRODUCT: Product
- EVENT: Event
- WORK_OF_ART: Work of Art
- LANGUAGE: Language
- LAW: Law
- NORP: Nationalities or religious or political groups
- MISC: Miscellaneous
아래 문장들에서 개체명을 추출하세요.
"""


# data
train_dataset = train_dataset.map(
    lambda x: {'text': f"###Instruct:\n{instruction}\n\n### Text:\n{x['text']}\n\n### Tags:\n{' '.join(x['tags'])}<|endoftext|>" }
)

test_dataset = test_dataset.map(
    lambda x: {'text': f"###Instruct:\n{instruction}\n\n### Text:\n{x['text']}\n\n### Tags:\n{' '.join(x['tags'])}<|endoftext|>" }
)

In [3]:
print(train_dataset[10]['text'])

###Instruct:
다음 아래 문장에서 개체명을 추출하려 합니다. 개체명은 아래 규칙(spacy를 따름)으로 태깅시켜 주세요.
- PER: Person
- ORG: Organization
- GPE: Geopolitical Entity
- LOC: Location
- FAC: Facility
- QUANTITY: Quantity
- ORDINAL: Ordinal
- CARDINAL: Cardinal
- DATE: Date
- TIME: Time
- MONEY: Money
- PERCENT: Percent
- PRODUCT: Product
- EVENT: Event
- WORK_OF_ART: Work of Art
- LANGUAGE: Language
- LAW: Law
- NORP: Nationalities or religious or political groups
- MISC: Miscellaneous
아래 문장들에서 개체명을 추출하세요.


### Text:
극의 개전이 빠르고 가볍기 때문에 시정권고는 한순간도 이완을 늦출 수가 없습니다 .

### Tags:
<WORK_OF_ART:시정권고는><|endoftext|>


In [4]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) ㅛ
Invalid input. Must be one of ('y', 'yes', '1', 'n', 'no', '0', '')
Add t

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-2-7b-hf"
# model_id = "beomi/polyglot-ko-12.8b-safetensors"  # safetensors 컨버팅된 레포
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [6]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [7]:
train_dataset = train_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)

from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

Map:   0%|          | 0/99534 [00:00<?, ? examples/s]

In [11]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


In [12]:
import transformers

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=5,
        gradient_accumulation_steps=1,
        # warmup_steps=200,
        max_steps=5000, ## 초소형만 학습: 10 step = 20개 샘플만 학습.
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,0.9736
20,0.4641
30,0.3974


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.06 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1007.06 MiB is free. Process 62530 has 13.76 GiB memory in use. Of the allocated memory 11.92 GiB is allocated by PyTorch, and 1.71 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [13]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): LlamaForCausalLM(
          (model): LlamaModel(
            (embed_tokens): Embedding(32000, 4096)
            (layers): ModuleList(
              (0-31): 32 x LlamaDecoderLayer(
                (self_attn): LlamaSdpaAttention(
                  (q_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4096, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=4096, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
          

In [5]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [21]:
model.push_to_hub("jason9693/linq-assignment")

adapter_model.safetensors:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jason9693/linq-assignment/commit/6094715520b2bb123209600de152232951e09789', commit_message='Upload model', commit_description='', oid='6094715520b2bb123209600de152232951e09789', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
tokenizer.push_to_hub("jason9693/linq-assignment")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jason9693/linq-assignment/commit/04b79386f37f08ee558ec196740338ab7e8f3067', commit_message='Upload tokenizer', commit_description='', oid='04b79386f37f08ee558ec196740338ab7e8f3067', pr_url=None, pr_revision=None, pr_num=None)

## load huggingface model

In [9]:
!pip install peft bitsandbytes

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->pef

In [17]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.ca

In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/accelerate.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

base_model = "meta-llama/Llama-2-7b-hf"
adapter_model = "jason9693/linq-assignment"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, adapter_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# model = model.to("cuda")
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
instruction = """###Instruct:
다음 아래 문장에서 개체명을 추출하려 합니다. 개체명은 아래 규칙(spacy를 따름)으로 태깅시켜 주세요.
- PER: Person
- ORG: Organization
- GPE: Geopolitical Entity
- LOC: Location
- FAC: Facility
- QUANTITY: Quantity
- ORDINAL: Ordinal
- CARDINAL: Cardinal
- DATE: Date
- TIME: Time
- MONEY: Money
- PERCENT: Percent
- PRODUCT: Product
- EVENT: Event
- WORK_OF_ART: Work of Art
- LANGUAGE: Language
- LAW: Law
- NORP: Nationalities or religious or political groups
- MISC: Miscellaneous
아래 문장들에서 개체명을 추출하세요.
"""

def gen(x):
    gened = model.generate(
        **tokenizer(
            f"{instruction}\n\n### Text:\n{x}\n\n### Tags:\n",
            return_tensors='pt',
            return_token_type_ids=False
        ),
        max_new_tokens=20,
        early_stopping=True,
        do_sample=True,
        eos_token_id=2,
    )
    print(tokenizer.decode(gened[0]))

In [16]:
gen("현재 사용 가능한 컴퓨팅 단위가 없습니다. 무료 제공 리소스는 사용이 보장되지 않습니다. 여기에서 더 많은 단위를 구매하세요.")

<s> ###Instruct:
다음 아래 문장에서 개체명을 추출하려 합니다. 개체명은 아래 규칙(spacy를 따름)으로 태깅시켜 주세요.
- PER: Person
- ORG: Organization
- GPE: Geopolitical Entity
- LOC: Location
- FAC: Facility
- QUANTITY: Quantity
- ORDINAL: Ordinal
- CARDINAL: Cardinal
- DATE: Date
- TIME: Time
- MONEY: Money
- PERCENT: Percent
- PRODUCT: Product
- EVENT: Event
- WORK_OF_ART: Work of Art
- LANGUAGE: Language
- LAW: Law
- NORP: Nationalities or religious or political groups
- MISC: Miscellaneous
아래 문장들에서 개체명을 추출하세요.


### Text:
현재 사용 가능한 컴퓨팅 단위가 없습니다. 무료 제공 리소스는 사용이 보장되지 않습니다. 여기에서 더 많은 단위를 구매하세요.

### Tags:
1. 추출된 개체명 �


### Evaluate Spacy vs LLAMA

In [None]:
!pip install spacy
!pip install dataset
!python -m spacy download ko_core_news_md

In [3]:
import spacy
from typing import List, Tuple

nlp = spacy.load("ko_core_news_md")

def extract_entities(text: str) -> List[Tuple[str, str]]:
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))
    return entities

extract_entities("대한민국의 수도는 서울입니다. ")

[('대한민국의', 'LC'), ('서울입니다', 'LC')]

In [4]:
## using huggingface transformers

def extract_llama_entities(predicted: list) -> List[Tuple[str, str]]:
    # predicted = ["<PER:김규식>", "<PER:김구>", "<GPE:미국>"]
    entities = []
    for entity in predicted:
        entities.append((entity.split(':')[1][:-1], entity.split(':')[0][1:]))
    return entities


def extract_llama_with_inference(text: str) -> List[Tuple[str, str]]:
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
    predicted = tokenizer.decode(outputs[0])
    # only get generated part
    predicted = predicted.split("### Tags: ")[1].split(" ")
    return extract_llama_entities(predicted)

extract_llama_entities(["<PER:김규식>", "<PER:김구>", "<GPE:미국>"])

[('김규식', 'PER'), ('김구', 'PER'), ('미국', 'GPE')]

In [None]:
test_dataset = Dataset.from_json('datasets/test.jsonl')

spacy_result = []
goldenset_result = []
model_result = []

for i in range(len(test_dataset)):
    text = test_dataset[i]['text']
    tags = test_dataset[i]['tags']
    spacy_result.append(extract_entities(text))
    goldenset_result.append(extract_llama_entities(tags))
    model_result.append(extract_llama_with_inference(text))

In [None]:
def calc_metric(predicted: List[Tuple[str, str]], golden: List[Tuple[str, str]]) -> Tuple[int, int, int]:
    tp = 0
    fp = 0
    fn = 0
    if len(predicted) == 0:
        predicted = set()
    if len(golden) == 0:
        golden = set()
    predicted = set(predicted)
    golden = set(golden)

    for pred in predicted:
        if pred in golden:
            tp += 1
        else:
            fp += 1
    for gold in golden:
        if gold not in predicted:
            fn += 1
    # zero division
    if tp == 0:
        return 0, 0, 0

    f1 = 2 * tp / (2 * tp + fp + fn)
    acc = tp / (tp + fp + fn)
    return f1, acc, tp


def calc_metric_batch(predicted: List[List[Tuple[str, str]]], golden: List[List[Tuple[str, str]]]) -> Tuple[int, int, int]:
    f1s = []
    accs = []
    tps = []

    for p, g in zip(predicted, golden):
        if len(g) == len(p) == 0:
          continue
        f1, acc, tp = calc_metric(p, g)
        f1s.append(f1)
        accs.append(acc)
        tps.append(tp)

    return sum(f1s) / len(f1s), sum(accs) / len(accs), sum(tps) / len(tps)


llama_result = calc_metric_batch(model_result, goldenset_result)
spcy_result = calc_metric_batch(spacy_result, goldenset_result)

# show results as one pd table
import pandas as pd

df = pd.DataFrame({
    'Model': llama_result,
    'Spacy': spcy_result
}, index=['F1', 'Accuracy', 'TP'])

df