<a href="https://colab.research.google.com/github/SaiSakethGK/Reinforcement-Learning/blob/main/ner_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import re
def get_tokens_with_entities(raw_text: str):
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities


class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        from datasets import Dataset, Features, Value, ClassLabel, Sequence
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=dm.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

# usage
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
dm = NERDataMaker(["I come from [Kathmanduu valley,](location) [Nepal](location)"])
dm.as_hf_dataset(tokenizer=tokenizer)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'ner_tags', 'tokens', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})

In [1]:
!pip install -q datasets transformers



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/471.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m471.0/471.6 kB[0m [31m25.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import re
def get_tokens_with_entities(raw_text: str):

    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

In [3]:
print(get_tokens_with_entities("I am looking for a book on [Python Programming](Topic) by [Mihaela Sabin](Author)"))



[('I', 'O'), ('am', 'O'), ('looking', 'O'), ('for', 'O'), ('a', 'O'), ('book', 'O'), ('on', 'O'), ('Python', 'B-Topic'), ('Programming', 'I-Topic'), ('by', 'O'), ('Mihaela', 'B-Author'), ('Sabin', 'I-Author')]


In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

sample_input = "I am looking for a book on [Pithon Programming](Topic) by [Mihaela Sabin](Author)"
tokens, entities = list(zip(*get_tokens_with_entities(sample_input)))
tokenized_input = tokenizer(tokens, is_split_into_words=True)
print("Original tokens           : ", tokens)
print("After subword tokenization: ", tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Original tokens           :  ('I', 'am', 'looking', 'for', 'a', 'book', 'on', 'Pithon', 'Programming', 'by', 'Mihaela', 'Sabin')
After subword tokenization:  ['[CLS]', 'i', 'am', 'looking', 'for', 'a', 'book', 'on', 'pit', '##hon', 'programming', 'by', 'mi', '##hae', '##la', 'sa', '##bin', '[SEP]']




In [7]:
raw_text = """
I am looking for a book on [Python Programming](Topic) by [Mihaela Sabin](Author).
Find me a book on [Java](Topic) which is published this year [2024](Year).
Do you have any books on [Machine Learning](Topic) by [Andrew Ng](Author)?
I need a book on [Data Science](Topic) published in [2021](Year).
Can you recommend a book about [Artificial Intelligence](Topic) by [Stuart Russell](Author)?
I’m searching for a book on [Deep Learning](Topic) that was published in [2019](Year).
Show me a book on [Web Development](Topic) by [David Flanagan](Author) from [2018](Year).
Do you have any books about [Cybersecurity](Topic) released in [2022](Year)?
Find me a book on [Cloud Computing](Topic) by [Thomas Erl](Author).
Looking for a book on [Software Engineering](Topic) by [Robert C. Martin](Author) from [2015](Year).
Show me the latest book on [Blockchain Technology](Topic) published in [2023](Year).
I’m looking for a book on [Big Data](Topic) by [Viktor Mayer-Schönberger](Author).
Do you have a book on [Computer Networks](Topic) written in [2020](Year)?
Find me a book on [Operating Systems](Topic) by [Abraham Silberschatz](Author).
Can you recommend a book about [Quantum Computing](Topic) published in [2022](Year)?
I need a book on [Natural Language Processing](Topic) by [Daniel Jurafsky](Author).
Do you have any books on [Robotics](Topic) published in [2017](Year)?
Show me a book on [Data Structures](Topic) by [Thomas H. Cormen](Author) from [2013](Year).
Find a book on [Algorithms](Topic) by [Sanjay Dasgupta](Author) from [2021](Year).
I’m searching for a book on [Computer Architecture](Topic) by [David Patterson](Author) from [2018](Year).
"""

dm = NERDataMaker(raw_text.split("\n"))
print(f"total examples = {len(dm)}")
print(dm[0:3])



total examples = 22
[{'id': 0, 'ner_tags': [0], 'tokens': ['']}, {'id': 1, 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 2, 5, 0, 1, 4], 'tokens': ['I', 'am', 'looking', 'for', 'a', 'book', 'on', 'Python', 'Programming', 'by', 'Mihaela', 'Sabin']}, {'id': 2, 'ner_tags': [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3], 'tokens': ['Find', 'me', 'a', 'book', 'on', 'Java', 'which', 'is', 'published', 'this', 'year', '2024']}]


In [8]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    weight_decay=0.01,
)

train_ds = dm.as_hf_dataset(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=train_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()



Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,1.395695
2,No log,1.18908
3,No log,1.009429
4,No log,0.866498
5,No log,0.765555
6,No log,0.687674
7,No log,0.612836
8,No log,0.541449
9,No log,0.477687
10,No log,0.422501


TrainOutput(global_step=80, training_loss=0.3297114849090576, metrics={'train_runtime': 221.6601, 'train_samples_per_second': 3.97, 'train_steps_per_second': 0.361, 'total_flos': 4123015917552.0, 'train_loss': 0.3297114849090576, 'epoch': 40.0})

In [25]:
from transformers import pipeline
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
pipe("""I am looking for a C Programming book by Sai Saketh from 2016""")

[{'entity_group': 'Topic',
  'score': 0.77586806,
  'word': 'c programming',
  'start': 19,
  'end': 32},
 {'entity_group': 'Author',
  'score': 0.682893,
  'word': 'sai saketh',
  'start': 41,
  'end': 51},
 {'entity_group': 'Year',
  'score': 0.91401,
  'word': '2016',
  'start': 57,
  'end': 61}]