In [11]:
from datasets import load_dataset
from transformers import *

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


## NER with an LLM

Using LLMs to do NER is hard, as the model often fails to generate coherent output but interesting as it might enable a few-shot setting. This is, for example, outlined in [GPT-NER](https://arxiv.org/abs/2304.10428). On the other hand, for sequence classification tasks, LLMs such as GPT-4 exhibit human-level performance as outlined in [this paper]()

Here, we try to take one step further by forcing our LLM to generate coherent output using a grammar

#### Get our dataset

In [2]:
dataset = load_dataset('conll2003')
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
tagset = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
tagset = {v: k for k, v in tagset.items()}

Add map function to convert rows to custom text format

In [4]:
def get_tagged_text(tokens, ner_tags): 
    output = ""

    needs_close = False
    first = True

    for token, tag in zip(tokens, ner_tags):

        if not first and not needs_close and not token in ['.', ',', '!', '?', '"']:
            output += " "
        first = False

        if needs_close and (tag == "O" or tag.startswith("B-")):
            output += "</mark> "
            needs_close = False

        if tag == "O":
            output += token
        elif tag.startswith("B-"):
            output += f'<mark tag="{tag[2:]}">{token}'
            needs_close = True
        elif tag.startswith("I-"):
            output += f" {token}"

    if needs_close:
        output += "</mark>"

    return output

def apply_tagged_text(example):
    example['tagged_text'] = get_tagged_text(example['tokens'], [tagset[tag] for tag in example['ner_tags']])
    return example

dataset = dataset.map(apply_tagged_text)

Map:  17%|█▋        | 2357/14041 [00:00<00:02, 5761.55 examples/s]

Map: 100%|██████████| 14041/14041 [00:02<00:00, 5702.45 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 5682.58 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 5862.32 examples/s]


In [5]:
def apply_raw_text(example): 
    sentence = ""
    for token in example['tokens']:
        if token in ['.', ',', '!', '?', '"']:
            sentence += token
        else:
            sentence += " " + token
    return {'raw_text': sentence.strip()}

dataset = dataset.map(apply_raw_text)

Map: 100%|██████████| 14041/14041 [00:02<00:00, 6336.84 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 6498.82 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 6125.93 examples/s]


In [6]:
for i in range(3):
    print(dataset['test'][i]['raw_text'])
    print(dataset['test'][i]['tagged_text'])
    print()

SOCCER - JAPAN GET LUCKY WIN, CHINA IN SURPRISE DEFEAT.
SOCCER - <mark tag="LOC">JAPAN</mark> GET LUCKY WIN, <mark tag="PER">CHINA</mark> IN SURPRISE DEFEAT.

Nadim Ladki
<mark tag="PER">Nadim Ladki</mark>

AL-AIN, United Arab Emirates 1996-12-06
<mark tag="LOC">AL-AIN</mark> , <mark tag="LOC">United Arab Emirates</mark> 1996-12-06



#### Raw LLM output

In [7]:
instructions = {
    "PER": "A person's name",
    "ORG": "An organization name",
    "LOC": "A location",
    "MISC": "A miscellaneous entity",
}
instructions = "\n".join([f"{tag}: {text}" for tag, text in instructions.items()])
print(instructions)

PER: A person's name
ORG: An organization name
LOC: A location
MISC: A miscellaneous entity


A very simple prompt asking our model to perform the sequence classification task

In [8]:
examples = dataset['train'][:3]
dataset['train'] = dataset['train'].select(range(3, len(dataset['train'])))

In [9]:
examples_str = ''
for raw, tagged in zip(examples['raw_text'], examples['tagged_text']):
    examples_str += f'Input: {raw}\n'
    examples_str += f'Output: {tagged}\n'
    examples_str += '\n'
examples_str = examples_str.strip()

print(examples_str)

Input: EU rejects German call to boycott British lamb.
Output: <mark tag="ORG">EU</mark> rejects <mark tag="MISC">German</mark> call to boycott <mark tag="MISC">British</mark> lamb.

Input: Peter Blackburn
Output: <mark tag="PER">Peter Blackburn</mark>

Input: BRUSSELS 1996-08-22
Output: <mark tag="LOC">BRUSSELS</mark> 1996-08-22


In [10]:
def get_prompt(raw_text): 
    return '\n'.join([
        "This is a sequence classification task.",
        "",
        "We want to classify the following text into one of the following categories:",
        f"{instructions}",
        "", 
        "Here are some examples:",
        "", 
        f"{examples_str}",
        '',
        f"Input: {raw_text}",
        "Output:",
    ])

print(get_prompt(dataset['train'][0]['raw_text']))

This is a sequence classification task.

We want to classify the following text into one of the following categories:
PER: A person's name
ORG: An organization name
LOC: A location
MISC: A miscellaneous entity

Here are some examples:

Input: EU rejects German call to boycott British lamb.
Output: <mark tag="ORG">EU</mark> rejects <mark tag="MISC">German</mark> call to boycott <mark tag="MISC">British</mark> lamb.

Input: Peter Blackburn
Output: <mark tag="PER">Peter Blackburn</mark>

Input: BRUSSELS 1996-08-22
Output: <mark tag="LOC">BRUSSELS</mark> 1996-08-22

Input: The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep.
Output:


In [13]:
model_name = "meta-llama/Llama-2-7b-chat"
model_name = 'meta-llama/Llama-2-7b-chat-hf'

model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipeline = pipeline('text-generation', model=model, tokenizer=tokenizer)

loading configuration file config.json from cache at /home/jvh/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/08751db2aca9bf2f7f80d2e516117a53d7450235/config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.32.0",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file model.safetensors from cache at /home/jvh/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/08751db2aca9bf2f7f80d2e516117a53d7450235/m

ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [None]:
pipeline(get_prompt(dataset['train'][0]['raw_text']))