In [None]:
#데이터 없이 NER 모델 학습
#Few-shot 개체명 리스트 작성
#GPT-3를 사용한 개체명 리스트 확장
#GPT-3를 사용한 개체명인식 데이터셋 생성
#NER 모델 학습 - huggingface

In [None]:
#Few-shot 개체명 리스트 작성
#호텔 도메인
#엔티티 클래스는 <hotel name>, <room type>, <person name>, <date>, <hotel supplies>로 총 5개 입니다.

In [None]:
# Hand-written seed (few-shot) entity names for the hotel domain.
# One dict per entity class: 'class_name' is the class label and
# 'entity_names' holds five example surface forms for that class.
real_entities = [
    {
        'class_name': 'hotel name',
        'entity_names': [
            'Ritz-Carlton Hotel',
            'Marriott',
            'The Luxury Collection Hotels & Resorts',
            'St Regis Hotels',
            'Hyatt'
        ]
    },

    {
        'class_name': 'room type',
        'entity_names': [
            'Single room',
            'twin room',
            'Double room',
            'deluxe room',
            'Suites',
        ]
    },
    {
        'class_name': 'person name',
        'entity_names': [
            'Yongsun Yoon',
            'Steve Adams',
            'Donnie K. Schneider',
            'Eleanor Lockhart',
            'Jacqueline R. French'
        ]
    },
    {
        # Dates are deliberately given in several different written formats.
        'class_name': 'date',
        'entity_names': [
            '3/4/2022',
            'November 27th',
            'December 15, 2023',
            'Feb. 8',
            'Saturday, Jul 22'
        ]
    },
    {
        'class_name': 'hotel supplies',
        'entity_names': [
            'shampoo',
            'Coffee kit',
            'towels',
            'Wine glass',
            'fan'
        ]
    }
]

In [None]:
#GPT-3를 사용한 개체명 리스트 확장

In [None]:
def generate(prompts, model='text-davinci-003', n=1, max_tokens=512):
    """Request text completions from the OpenAI Completion endpoint.

    Args:
        prompts: A single prompt string, or a list of prompt strings that is
            sent as one batched request.
        model: Name of the completion model to query.
        n: Number of completions to sample for each prompt.
        max_tokens: Upper bound on the number of tokens per completion.

    Returns:
        A list with the text of every returned choice, stripped of
        surrounding whitespace and ordered by the choice's ``index`` field.
    """
    # No stop sequence is set, so a single completion may span several lines.
    response = openai.Completion.create(
        model=model,
        prompt=prompts,
        echo=False,
        n=n,
        max_tokens=max_tokens,
    )

    # Callers pair the results positionally with their prompts, so order the
    # choices by their index instead of relying on the response order.
    ordered_choices = sorted(response.choices, key=lambda c: c.index)
    return [c.text.strip() for c in ordered_choices]


def construct_entity_prompt(class_name, entity_names, k=10):
    """Build the few-shot prompt asking for ``k`` new names of one class.

    Args:
        class_name: Entity class label, rendered inside angle brackets.
        entity_names: Example names, each listed as a ``- name`` bullet.
        k: Number of new names the prompt asks for.

    Returns:
        The prompt text. It ends with an opening ``-`` so that the completion
        continues the bullet list.
    """
    header = f'These are <{class_name}> entity names. Generate {k} new <{class_name}> entity names.\n\n'
    bullets = ''.join(f'- {name}\n' for name in entity_names)
    return header + 'Entity names:\n' + bullets + '\nGenerated names:\n-'


def postprocess_entities(synthetic_entities):
    """Split raw bullet-list completions into a flat list of entity names.

    Each completion is expected to continue a ``- name`` bullet list whose
    first dash was already part of the prompt, so a dash is prepended before
    the text is split into lines.

    Args:
        synthetic_entities: Iterable of completion strings.

    Returns:
        A list of entity names in order of appearance. Blank lines and bare
        bullets are skipped, lines without a bullet are kept as-is, and names
        that themselves contain ``'- '`` are preserved in full.
    """
    processed = []
    for completion in synthetic_entities:
        for line in f'- {completion}'.split('\n'):
            # Drop only the leading bullet marker; indexing the result of
            # split('- ') would fail on lines without a bullet and would cut
            # off names that contain '- ' themselves.
            name = line.lstrip('- \t').strip()
            if name:
                processed.append(name)
    return processed



# Expand every seed class with GPT-generated entity names.
synthetic_entities = []
for real_ent in tqdm(real_entities):
    class_name = real_ent['class_name']
    entity_names = real_ent['entity_names']
    prompt = construct_entity_prompt(class_name, entity_names)

    # Ten sampled completions per class; set() drops names repeated across them.
    syn_entities = list(set(postprocess_entities(generate(prompt, n=10))))

    synthetic_entities.append({'class_name': class_name, 'entity_names': syn_entities})

NameError: ignored

In [None]:
# Sanity check: show the prompt built in the final iteration of the expansion loop.
print(prompt)

NameError: ignored

In [None]:
# Inspect the generated names of the last processed entity class.
synthetic_entities[-1]

In [None]:
# Merge seed and generated names class by class; set() removes duplicates.
# Both lists are in the same class order, so zip pairs matching classes.
all_entities = [
    {
        'class_name': real['class_name'],
        'entity_names': list(set(real['entity_names'] + synthetic['entity_names'])),
    }
    for real, synthetic in zip(real_entities, synthetic_entities)
]

In [None]:
#GPT-3를 사용한 개체명인식 데이터셋 생성

In [None]:
def sample_entities(all_entities, min_k=1, max_k=3):
    """Randomly pick one entity name from each of a few distinct classes.

    Args:
        all_entities: List of dicts with ``'class_name'`` and
            ``'entity_names'`` keys.
        min_k: Minimum number of classes to draw from.
        max_k: Maximum number of classes to draw from (inclusive).

    Returns:
        A list of ``{'class_name', 'entity_name'}`` dicts, one per sampled
        class. Empty when ``all_entities`` is empty.
    """
    if not all_entities:
        return []

    # Classes are drawn without replacement, so the draw can never be larger
    # than the number of available classes; clamp instead of raising.
    max_k = min(max_k, len(all_entities))
    min_k = min(min_k, max_k)

    k = np.random.randint(min_k, max_k + 1)
    idxs = np.random.choice(range(len(all_entities)), size=k, replace=False)

    entities = []
    for i in idxs:
        ents = all_entities[i]
        name = np.random.choice(ents['entity_names'])
        entities.append({'class_name': ents['class_name'], 'entity_name': name})

    return entities


def construct_sentence_prompt(entities, style='dialog'):
    """Build the prompt asking for one sentence that contains given entities.

    Args:
        entities: List of dicts with ``'entity_name'`` and ``'class_name'``.
        style: Sentence style named in the instruction line.

    Returns:
        The prompt text, ending with ``Sentence:`` for the model to complete.
    """
    tagged = []
    for ent in entities:
        # Each entity is rendered as name(class).
        tagged.append(f"{ent['entity_name']}({ent['class_name']})")
    entities_string = ', '.join(tagged)

    return (
        f'Generate a {style} sentence including following entities.\n\n'
        f'Entities: {entities_string}\n'
        'Sentence:'
    )


def construct_labels(generated, entities, class2idx):
    """Create one label id per character of ``generated``.

    Every case-insensitive occurrence of an entity name is marked: its first
    character gets the class id and the remaining characters get class id + 1.
    All other characters keep the ``'outside'`` id.

    Args:
        generated: The generated sentence.
        entities: List of dicts with ``'class_name'`` and ``'entity_name'``.
        class2idx: Mapping from class name (and ``'outside'``) to label id.

    Returns:
        A list of label ids with the same length as ``generated``.
    """
    labels = [class2idx['outside']] * len(generated)
    for ent in entities:
        name = ent['entity_name']
        if not name:
            # An empty pattern matches at every position, including the end
            # of the string, which would index past the label list.
            continue

        begin_label = class2idx[ent['class_name']]
        # Escape the name so characters such as '.', '(' or '+' are matched
        # literally. IGNORECASE on the untouched text keeps match offsets
        # aligned with `generated` (lower() can change a string's length).
        pattern = re.escape(name)
        for match in re.finditer(pattern, generated, flags=re.IGNORECASE):
            s, e = match.span()
            labels[s] = begin_label
            labels[s+1:e] = [begin_label + 1] * (e - s - 1)
    return labels


# Every class owns two consecutive label ids (even id and the one after it);
# the 'outside' id comes right after the last class pair.
class2idx = {e['class_name']: i*2 for i, e in enumerate(all_entities)}
class2idx['outside'] = len(class2idx) * 2

# Build the synthetic dataset: 100 requests of 10 prompts each.
data = []
for _ in tqdm(range(100)):
    batch_entities = [sample_entities(all_entities) for _ in range(10)]
    batch_prompts = [construct_sentence_prompt(ents) for ents in batch_entities]
    batch_generated = generate(batch_prompts, model='text-davinci-002')

    # Pair each generated sentence with the entities it was prompted with and
    # derive its character-level labels.
    data.extend(
        {'text': generated, 'labels': construct_labels(generated, entities, class2idx)}
        for generated, entities in zip(batch_generated, batch_entities)
    )

    # Pause between requests.
    time.sleep(10)

In [None]:
#NER 모델 학습
# Huggingface에 공개된 roberta-base 모델을 Token classification 방법으로 학습

In [None]:
# Tag name for every label id. Positions line up with the ids produced by
# class2idx / construct_labels: each class has a B- tag at its even id and an
# I- tag at the following id, and 'O' (outside) is the last entry.
LABELS = ['B-HT', 'I-HT', 'B-RT', 'I-RT', 'B-PS', 'I-PS', 'B-DT', 'I-DT', 'B-SP', 'I-SP', 'O']


def pad_sequences(seqs, pad_val, max_length):
    """Truncate and right-pad lists to a common length.

    The common length is the longest sequence in ``seqs``, capped at
    ``max_length``.

    Args:
        seqs: Collection of lists.
        pad_val: Value appended to sequences shorter than the common length.
        max_length: Upper bound on the common length.

    Returns:
        A list of new lists, all of the common length.
    """
    longest = max(len(s) for s in seqs)
    target_len = min(max_length, longest)

    def _fit(seq):
        # Clip first, then fill whatever is still missing with pad_val.
        clipped = seq[:target_len]
        return clipped + [pad_val] * (target_len - len(clipped))

    return [_fit(seq) for seq in seqs]


class Dataset(torch.utils.data.Dataset):
    """Token-classification dataset built from character-labelled sentences.

    Each item of ``data`` is a dict with ``'text'`` and a per-character
    ``'labels'`` list. Items are tokenized on access and every token takes
    the label of its first character.
    """

    def __init__(self, data, tokenizer, max_length, split='train'):
        """Store the samples, the tokenizer and the batch length limit."""
        self.data, self.tokenizer = data, tokenizer
        self.max_length, self.split = max_length, split

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize one sample and project its character labels onto tokens.

        Returns:
            A tuple ``(input_ids, attention_mask, labels)`` of equally long
            lists.
        """
        sample = self.data[idx]
        char_labels = sample['labels']

        encoded = self.tokenizer(sample['text'])
        input_ids = encoded.input_ids
        attention_mask = encoded.attention_mask

        token_labels = []
        for token_idx, _ in enumerate(input_ids):
            char_span = encoded.token_to_chars(token_idx)
            if char_span is not None:
                # The token inherits the label of its first character.
                token_labels.append(char_labels[char_span.start])
            else:
                # Tokens without a character span are labelled 'O'.
                token_labels.append(len(LABELS)-1)

        return input_ids, attention_mask, token_labels

    def collate_fn(self, batch):
        """Pad a list of samples to a common length and convert to tensors.

        Input ids are padded with the tokenizer's pad id, the attention mask
        with 0 and the labels with -100.
        """
        ids_batch, mask_batch, label_batch = zip(*batch)

        padded_ids = pad_sequences(ids_batch, self.tokenizer.pad_token_id, self.max_length)
        padded_mask = pad_sequences(mask_batch, 0, self.max_length)
        padded_labels = pad_sequences(label_batch, -100, self.max_length)

        return (
            torch.tensor(padded_ids),
            torch.tensor(padded_mask),
            torch.tensor(padded_labels),
        )



tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Shuffle the sample indices; the first 100 become the validation split and
# the remainder the training split.
rand_idxs = np.random.permutation(range(len(data)))
valid_idxs, train_idxs = rand_idxs[:100], rand_idxs[100:]

train_data = [data[i] for i in train_idxs]
valid_data = [data[i] for i in valid_idxs]


def _build_loader(samples, shuffle):
    """Wrap samples in a Dataset and a batch-size-16 DataLoader."""
    dataset = Dataset(samples, tokenizer, 256)
    loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=shuffle, collate_fn=dataset.collate_fn)
    return dataset, loader


# Only the training loader reshuffles its samples.
train_dataset, train_loader = _build_loader(train_data, True)
valid_dataset, valid_loader = _build_loader(valid_data, False)

In [None]:
#KLUE 벤치마크 참고
#entity F1 / character F1 사용

In [None]:
def train(model, loader, device, outside_weight=0.9):
    """Train ``model`` for one pass over ``loader``.

    Parameters are updated through the module-level ``optimizer``.

    Args:
        model: Token-classification model exposing ``num_labels`` and
            returning an output with ``logits``.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the batches and the loss weights are moved to.
        outside_weight: Loss weight of the last label id; every other label
            has weight 1.
    """
    model.train()

    # Per-label loss weights: all ones except for the last label id.
    label_weight = torch.ones(model.num_labels)
    label_weight[-1] = outside_weight
    label_weight = label_weight.to(device)

    pbar = tqdm(loader)
    for batch in pbar:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Flatten so every token position is one classification example.
        flat_logits = model(input_ids, attention_mask).logits.view(-1, model.num_labels)
        flat_labels = labels.view(-1)
        loss = F.cross_entropy(flat_logits, flat_labels, weight=label_weight)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'loss': loss.item()})


def predict(model, loader, device):
    """Run ``model`` over ``loader`` and collect predictions and labels.

    Args:
        model: Model returning an output with ``logits``.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        A tuple ``(total_preds, total_labels)`` of nested Python lists, one
        inner list per sequence. Padding positions are still included.
    """
    model.eval()

    total_preds, total_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            outputs = model(input_ids, attention_mask, labels=labels)

            # Highest-scoring label id at every token position.
            preds = outputs.logits.argmax(dim=-1)
            total_preds.extend(preds.cpu().tolist())
            total_labels.extend(labels.cpu().tolist())

    return total_preds, total_labels


def remove_padding(preds, labels):
    """Cut every prediction/label pair at the first padding label (-100).

    Args:
        preds: List of predicted label-id lists.
        labels: List of gold label-id lists, padded with -100.

    Returns:
        A tuple ``(removed_preds, removed_labels)`` with one entry per input
        sequence, each truncated to its unpadded length.
    """
    removed_preds, removed_labels = [], []
    for p, l in zip(preds, labels):
        # A row without -100 was the longest in its batch and is already
        # unpadded; keep it whole instead of dropping it from evaluation.
        end = l.index(-100) if -100 in l else len(l)
        removed_preds.append(p[:end])
        removed_labels.append(l[:end])

    return removed_preds, removed_labels


def entity_f1_func(preds, targets):
    """Compute the macro entity-level F1 as a percentage.

    Label ids are mapped to their tag names in ``LABELS`` and scored with
    ``ner_f1_score`` in strict IOB2 mode.

    Args:
        preds: List of predicted label-id sequences.
        targets: List of gold label-id sequences.

    Returns:
        The F1 score scaled to 0-100 and rounded to two decimals.
    """
    pred_tags = [[LABELS[i] for i in seq] for seq in preds]
    target_tags = [[LABELS[i] for i in seq] for seq in targets]
    score = ner_f1_score(target_tags, pred_tags, average="macro", mode="strict", scheme=IOB2)
    return round(score * 100.0, 2)

def char_f1_func(preds, targets):
    """Compute the macro F1 over all individual label positions, in percent.

    All sequences are flattened and scored position by position with
    ``f1_score`` across every label id in ``LABELS``.

    Args:
        preds: List of predicted label-id sequences.
        targets: List of gold label-id sequences.

    Returns:
        The F1 score scaled to 0-100 and rounded to two decimals.
    """
    all_label_ids = list(range(len(LABELS)))
    flat_preds = list(itertools.chain.from_iterable(preds))
    flat_targets = list(itertools.chain.from_iterable(targets))
    score = f1_score(flat_targets, flat_preds, labels=all_label_ids, average='macro', zero_division=True)
    return round(score * 100.0, 2)


def evaluate(model, loader, device):
    """Score ``model`` on ``loader``.

    Predictions are collected, padding is stripped, and both F1 variants
    are computed on the result.

    Returns:
        A tuple ``(entity_f1, char_f1)``.
    """
    raw_preds, raw_labels = predict(model, loader, device)
    trimmed_preds, trimmed_labels = remove_padding(raw_preds, raw_labels)
    return (
        entity_f1_func(trimmed_preds, trimmed_labels),
        char_f1_func(trimmed_preds, trimmed_labels),
    )



# Label-id <-> tag-name mappings passed to the model configuration.
num_labels = len(LABELS)
id2label = {i:l for i,l in enumerate(LABELS)}
label2id = {l:i for i,l in enumerate(LABELS)}

# Use the first GPU when one is available, otherwise fall back to the CPU so
# the script does not crash on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

model = AutoModelForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels, id2label=id2label, label2id=label2id)
_ = model.train().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Train for five epochs, validating after each one.
best_score = 0.
for ep in range(5):
    train(model, train_loader, device)
    entity_f1, char_f1 = evaluate(model, valid_loader, device)
    print(f'ep: {ep:02d} | entity f1: {entity_f1:.2f} | char f1: {char_f1:.2f}')

    # Keep only the checkpoint with the best validation entity F1 so far.
    if entity_f1 > best_score:
        model.save_pretrained('checkpoint')
        tokenizer.save_pretrained('checkpoint')
        best_score = entity_f1

In [None]:
#5 epoch 학습 결과, 검증 데이터에서 94.83 entity F1, 96.58 character F1의 성능

In [None]:
#테스트

# 새 섹션

In [None]:
# Load the model and tokenizer saved in the 'checkpoint' directory into a
# token-classification pipeline.
nlp = pipeline(task='token-classification', model="checkpoint", aggregation_strategy='simple')

# Try the model on a few hand-written sentences.
# NOTE(review): the sentences contain misspellings ("Frbruary", "Cann") —
# presumably on purpose, to probe the model on noisy input; confirm.
sentence = "Can i make a reservation at the Chosun Hotel on Frbruary 3rd?"
nlp(sentence)

sentence = "i made a reservation for a basis room under Jung Hyung-joon."
nlp(sentence)

sentence = "Cann I refill the ice cups?"
nlp(sentence)