In [1]:
import json
import spacy
import pandas as pd
from datasets import Dataset, Features, ClassLabel, Sequence, Value

In [2]:
with open('data/ioi.jsonl', 'r') as file:
    for line in file:
        json_data = json.loads(line)

In [3]:
json_data

{'id': 1,
 'text': "Quantum Capital Partners\n100 Investment Blvd, Suite 3000\nNew York, NY 10020\nJuly 16, 2024\n\nCONFIDENTIAL\n\nBoard of Directors\nZephyr Technologies Inc.\n400 Innovation Way\nSan Jose, CA 95134\n\nDear Members of the Board,\n\nSubject: Indication of Interest in Acquiring Zephyr Technologies Inc.\n\nWe, Quantum Capital Partners, are pleased to submit this non-binding indication of interest with the intention of acquiring Zephyr Technologies Inc. Following a thorough review of publicly available information, we believe that a strategic acquisition by Quantum would significantly enhance our portfolio and deliver substantial value to our respective shareholders and customers.\n\nIntroduction of the Bidding Company:\nQuantum Capital Partners, based in New York, is a prominent investment firm focusing on technology and innovation-driven enterprises. With over $10 billion in managed assets and a significant presence in the tech investment landscape, our firm is well-pos

In [4]:
# start offsets and end offsets determine the span in the text for where the label is located
json_data["text"][291:315]

'Quantum Capital Partners'

In [5]:
 # will be using spacy to tokenize the data
nlp = spacy.blank("en")
doc = nlp(json_data["text"])

# getting the tokens
tokens = [token for token in doc]

# spacy's tokens have an idx attribute that tells us the character index of the tokens first character
# we can can calculate the span of the text by just adding the tokens length
# note that this will result in native python [inclusive, exclusive) indexing
token_spans = [(token.idx, token.idx + len(token)) for token in tokens]

# we are instantiating all tags as O tokens
tags = ["O"] * len(tokens)

In [6]:
list(zip(tokens, token_spans))

[(Quantum, (0, 7)),
 (Capital, (8, 15)),
 (Partners, (16, 24)),
 (, (24, 25)),
 (100, (25, 28)),
 (Investment, (29, 39)),
 (Blvd, (40, 44)),
 (,, (44, 45)),
 (Suite, (46, 51)),
 (3000, (52, 56)),
 (, (56, 57)),
 (New, (57, 60)),
 (York, (61, 65)),
 (,, (65, 66)),
 (NY, (67, 69)),
 (10020, (70, 75)),
 (, (75, 76)),
 (July, (76, 80)),
 (16, (81, 83)),
 (,, (83, 84)),
 (2024, (85, 89)),
 (
  ,
  (89, 91)),
 (CONFIDENTIAL, (91, 103)),
 (
  ,
  (103, 105)),
 (Board, (105, 110)),
 (of, (111, 113)),
 (Directors, (114, 123)),
 (, (123, 124)),
 (Zephyr, (124, 130)),
 (Technologies, (131, 143)),
 (Inc., (144, 148)),
 (, (148, 149)),
 (400, (149, 152)),
 (Innovation, (153, 163)),
 (Way, (164, 167)),
 (, (167, 168)),
 (San, (168, 171)),
 (Jose, (172, 176)),
 (,, (176, 177)),
 (CA, (178, 180)),
 (95134, (181, 186)),
 (
  ,
  (186, 188)),
 (Dear, (188, 192)),
 (Members, (193, 200)),
 (of, (201, 203)),
 (the, (204, 207)),
 (Board, (208, 213)),
 (,, (213, 214)),
 (
  ,
  (214, 216)),
 (Subject, (216, 

In [7]:
# suppose we opted to tokenize our data this way
tokens_naive = json_data["text"].split()

# Calculate positions of words in the text
positions = []
current_pos = 0
for i, word in enumerate(tokens_naive):
    positions.append((current_pos, current_pos + len(word)))
    current_pos += len(word) + 1  # update position +1 for the space << this is bug

In [8]:
list(zip(tokens_naive, positions))

[('Quantum', (0, 7)),
 ('Capital', (8, 15)),
 ('Partners', (16, 24)),
 ('100', (25, 28)),
 ('Investment', (29, 39)),
 ('Blvd,', (40, 45)),
 ('Suite', (46, 51)),
 ('3000', (52, 56)),
 ('New', (57, 60)),
 ('York,', (61, 66)),
 ('NY', (67, 69)),
 ('10020', (70, 75)),
 ('July', (76, 80)),
 ('16,', (81, 84)),
 ('2024', (85, 89)),
 ('CONFIDENTIAL', (90, 102)),
 ('Board', (103, 108)),
 ('of', (109, 111)),
 ('Directors', (112, 121)),
 ('Zephyr', (122, 128)),
 ('Technologies', (129, 141)),
 ('Inc.', (142, 146)),
 ('400', (147, 150)),
 ('Innovation', (151, 161)),
 ('Way', (162, 165)),
 ('San', (166, 169)),
 ('Jose,', (170, 175)),
 ('CA', (176, 178)),
 ('95134', (179, 184)),
 ('Dear', (185, 189)),
 ('Members', (190, 197)),
 ('of', (198, 200)),
 ('the', (201, 204)),
 ('Board,', (205, 211)),
 ('Subject:', (212, 220)),
 ('Indication', (221, 231)),
 ('of', (232, 234)),
 ('Interest', (235, 243)),
 ('in', (244, 246)),
 ('Acquiring', (247, 256)),
 ('Zephyr', (257, 263)),
 ('Technologies', (264, 276)),
 

In [9]:
# note that our offset is incorrect
json_data["text"][90:102]

'\nCONFIDENTIA'

In [10]:
json_data["entities"]

[{'id': 1, 'label': 'BUYER', 'start_offset': 0, 'end_offset': 24},
 {'id': 2, 'label': 'SELLER', 'start_offset': 124, 'end_offset': 148},
 {'id': 3, 'label': 'SELLER', 'start_offset': 261, 'end_offset': 285},
 {'id': 4, 'label': 'SELLER', 'start_offset': 411, 'end_offset': 435},
 {'id': 5, 'label': 'SELLER', 'start_offset': 999, 'end_offset': 1018},
 {'id': 6, 'label': 'SELLER', 'start_offset': 1093, 'end_offset': 1112},
 {'id': 7, 'label': 'SELLER', 'start_offset': 1490, 'end_offset': 1509},
 {'id': 8, 'label': 'SELLER', 'start_offset': 2433, 'end_offset': 2452},
 {'id': 9, 'label': 'BUYER', 'start_offset': 291, 'end_offset': 315},
 {'id': 10, 'label': 'BUYER', 'start_offset': 706, 'end_offset': 730},
 {'id': 15, 'label': 'VAL_UPPER', 'start_offset': 1542, 'end_offset': 1554},
 {'id': 16, 'label': 'VAL_LOWER', 'start_offset': 1526, 'end_offset': 1538},
 {'id': 17, 'label': 'DD_TIME', 'start_offset': 1878, 'end_offset': 1885},
 {'id': 18, 'label': 'MULT', 'start_offset': 1597, 'end_off

In [11]:
for entity in json_data["entities"]:

    entity_start = entity["start_offset"]
    entity_end = entity["end_offset"]
    entity_label = entity["label"]
    tagged_first_token = False # flag to help us determine whether tag should be B- or I-

    for token_idx, (token, (token_start, token_end)) in enumerate(zip(tokens, token_spans)):
        # the token span span has to be within the entity span (remember that an entity can span multiple tokens)
        if token_start >= entity_start and token_end <= entity_end:
            if not tagged_first_token:
                tags[token_idx] = f"B-{entity_label}"
                tagged_first_token = True
            else:
                tags[token_idx] = f"I-{entity_label}"

In [17]:
tagged_tokens = list(zip(tokens, tags))

In [18]:
for token, tag in tagged_tokens:
    print(repr(f"{token.text} --- {tag}"))

'Quantum --- B-BUYER'
'Capital --- I-BUYER'
'Partners --- I-BUYER'
'\n --- O'
'100 --- O'
'Investment --- O'
'Blvd --- O'
', --- O'
'Suite --- O'
'3000 --- O'
'\n --- O'
'New --- O'
'York --- O'
', --- O'
'NY --- O'
'10020 --- O'
'\n --- O'
'July --- O'
'16 --- O'
', --- O'
'2024 --- O'
'\n\n --- O'
'CONFIDENTIAL --- O'
'\n\n --- O'
'Board --- O'
'of --- O'
'Directors --- O'
'\n --- O'
'Zephyr --- B-SELLER'
'Technologies --- I-SELLER'
'Inc. --- I-SELLER'
'\n --- O'
'400 --- O'
'Innovation --- O'
'Way --- O'
'\n --- O'
'San --- O'
'Jose --- O'
', --- O'
'CA --- O'
'95134 --- O'
'\n\n --- O'
'Dear --- O'
'Members --- O'
'of --- O'
'the --- O'
'Board --- O'
', --- O'
'\n\n --- O'
'Subject --- O'
': --- O'
'Indication --- O'
'of --- O'
'Interest --- O'
'in --- O'
'Acquiring --- O'
'Zephyr --- B-SELLER'
'Technologies --- I-SELLER'
'Inc. --- I-SELLER'
'\n\n --- O'
'We --- O'
', --- O'
'Quantum --- B-BUYER'
'Capital --- I-BUYER'
'Partners --- I-BUYER'
', --- O'
'are --- O'
'pleased --- O'
'to

In [20]:
def create_hf_dataset(sentences, entity_types):
    
    data = {"tokens": [], "ner_tags": []}
    
    for sentence in sentences:
        tokens, tags = zip(*sentence)
        data["tokens"].append(tokens)
        data["ner_tags"].append(tags)

    # Define features
    features = Features({
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=entity_types))
    })

    # Create dataset
    dataset = Dataset.from_dict(data, features=features)
    return dataset

In [21]:
dataset = create_hf_dataset([tagged_tokens], list(set(tags)))

In [22]:
# note how many rows of data we have
# could this be a problem?
dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1
})

In [23]:
dataset[0]

{'tokens': ['Quantum',
  'Capital',
  'Partners',
  '\n',
  '100',
  'Investment',
  'Blvd',
  ',',
  'Suite',
  '3000',
  '\n',
  'New',
  'York',
  ',',
  'NY',
  '10020',
  '\n',
  'July',
  '16',
  ',',
  '2024',
  '\n\n',
  'CONFIDENTIAL',
  '\n\n',
  'Board',
  'of',
  'Directors',
  '\n',
  'Zephyr',
  'Technologies',
  'Inc.',
  '\n',
  '400',
  'Innovation',
  'Way',
  '\n',
  'San',
  'Jose',
  ',',
  'CA',
  '95134',
  '\n\n',
  'Dear',
  'Members',
  'of',
  'the',
  'Board',
  ',',
  '\n\n',
  'Subject',
  ':',
  'Indication',
  'of',
  'Interest',
  'in',
  'Acquiring',
  'Zephyr',
  'Technologies',
  'Inc.',
  '\n\n',
  'We',
  ',',
  'Quantum',
  'Capital',
  'Partners',
  ',',
  'are',
  'pleased',
  'to',
  'submit',
  'this',
  'non',
  '-',
  'binding',
  'indication',
  'of',
  'interest',
  'with',
  'the',
  'intention',
  'of',
  'acquiring',
  'Zephyr',
  'Technologies',
  'Inc.',
  'Following',
  'a',
  'thorough',
  'review',
  'of',
  'publicly',
  'availabl

In [24]:
label_names = dataset.features["ner_tags"].feature.names
label_names

['B-VAL_LOWER',
 'I-BUYER',
 'O',
 'I-VAL_LOWER',
 'B-DD_TIME',
 'B-MULT_BASE',
 'I-DD_TIME',
 'I-MULT_BASE',
 'B-MULT',
 'B-SELLER',
 'I-VAL_UPPER',
 'B-BUYER',
 'B-VAL_UPPER',
 'I-SELLER']

In [25]:
records = zip(dataset[0]["tokens"], dataset[0]["ner_tags"], [label_names[idx] for idx in dataset[0]["ner_tags"]])
pd.DataFrame(records, columns=["tokens", "labels", "label_names"])

Unnamed: 0,tokens,labels,label_names
0,Quantum,11,B-BUYER
1,Capital,1,I-BUYER
2,Partners,1,I-BUYER
3,\n,2,O
4,100,2,O
...,...,...,...
526,212,2,O
527,),2,O
528,555,2,O
529,-,2,O
