In [None]:
from datasets import load_dataset, Sequence, ClassLabel
import re

dataset = load_dataset("conll2003",trust_remote_code=True)


In [None]:
DATE_PATTERNS = [
    r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',  # Format MM/DD/YYYY or similar
    r'\b\d{1,2}-\d{1,2}-\d{2,4}\b',  # Format MM-DD-YYYY
    r'\b\d{4}-\d{2}-\d{2}\b',        # ISO 8601
    r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2},? \d{4}',  # Format "March 3, 1985"
    r'\b\d{4}-\d{2}-\d{2}\b',  # Format YYYY-MM-DD
    r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b'
]

In [81]:
def is_date(word):
    for pattern in DATE_PATTERNS:
        if re.match(pattern, word):
            return True
    return False

In [82]:

def add_date_labels(example):
    tokens = example["tokens"]
    labels = example["ner_tags"]
    new_labels = []

    for token, label in zip(tokens, labels):
        if is_date(token):
            new_labels.append(9) 
        else:
            new_labels.append(label)
    
    return {"tokens": tokens, "ner_tags": new_labels}

In [83]:
old_label_names = dataset["train"].features["ner_tags"].feature.names
new_label_names = old_label_names + ["DATE"]


new_ner_tags_feature = Sequence(
    feature=ClassLabel(names=new_label_names)
)

# Aktualizacja metadanych w zbiorze danych
dataset = dataset.cast_column("ner_tags", new_ner_tags_feature)

In [84]:
dataset = dataset.map(add_date_labels)

Map: 100%|██████████| 14041/14041 [00:02<00:00, 5919.78 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 6359.95 examples/s]
Map: 100%|██████████| 3453/3453 [00:00<00:00, 6914.75 examples/s]


In [85]:
print("Zaktualizowane etykiety:", dataset["train"].features["ner_tags"].feature.names)
 


Zaktualizowane etykiety: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'DATE']


In [86]:
print(dataset["train"]["tokens"][:8])


[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn'], ['BRUSSELS', '1996-08-22'], ['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.'], ['"', 'We', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'Commission', "'s", 'chief', 'spokesman', 'Nikolaus', 'van', 'der', 'Pas', 'told', 'a', 'news', 'briefing', '.'], ['He', 'said

In [87]:
for essa in dataset["train"]:
    print(essa["tokens"])

print(dataset["train"][2]["tokens"])

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['Peter', 'Blackburn']
['BRUSSELS', '1996-08-22']
['The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.']
['Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.']
['"', 'We', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'Commission', "'s", 'chief', 'spokesman', 'Nikolaus', 'van', 'der', 'Pas', 'told', 'a', 'news', 'briefing', '.']
['He', 'said', 'fur

In [88]:
tokens_with_labels = []
for obj in dataset["train"]:
    example = obj
    tokens_with_labels.append(
        [(token, new_label_names[label]) for token, label in zip(example["tokens"], example["ner_tags"])]
    )
for obj in tokens_with_labels:
    for token, label in obj:
        if label == "DATE":
            print(f"{token}: {label}")

1996-08-22: DATE
March: DATE
1996-08-22: DATE
January: DATE
1996-08-22: DATE
1996-08-22: DATE
July: DATE
1996-08-22: DATE
July: DATE
July: DATE
July: DATE
July: DATE
July: DATE
July: DATE
1996-08-22: DATE
1996-08-22: DATE
1-10-100: DATE
January: DATE
1996-08-22: DATE
1-10-100: DATE
1996-08-22: DATE
June: DATE
February: DATE
1996-08-22: DATE
1996-08-22: DATE
July: DATE
1996-08-22: DATE
1996-08-22: DATE
1996-08-22: DATE
May: DATE
1996-08-22: DATE
August: DATE
1996-08-22: DATE
October: DATE
1996-08-22: DATE
November: DATE
1996-08-22: DATE
1996-08-22: DATE
1996-08-22: DATE
June: DATE
1996-08-22: DATE
1996-08-22: DATE
1996-08-22: DATE
1996-08-22: DATE
April: DATE
1996-08-22: DATE
April: DATE
March: DATE
April: DATE
April: DATE
March: DATE
January-April: DATE
April: DATE
1996-08-22: DATE
September: DATE
1996-08-22: DATE
March: DATE
June: DATE
1996-08-22: DATE
1996-08-22: DATE
1996-08-22: DATE
June: DATE
1996-08-22: DATE
1996-08-21: DATE
1996-08-21: DATE
1996-08-22: DATE
1996-08-22: DATE
1996