In [13]:
from transformers import AutoTokenizer
import json
import numpy as np
from datasets import Dataset
from tokenizers import AddedToken
from tqdm import tqdm
import pandas as pd

In [14]:
df = pd.read_parquet("../data/raw_data.parquet")

In [23]:
df.shape

(6807, 16)

In [16]:
valid_doc_ids = []
for i, row in df.iterrows():
    if row["valid"]:
        document = row["document"]
        valid_doc_ids.append(document)

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

test_string = """
Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection
The tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map. 
...
by Nathalie Sylla
"""

tk = tokenizer(
    test_string, 
    max_length=8, 
    stride=3,
    truncation=True, 
    return_overflowing_tokens=True,
)

for seq in tk['input_ids']:
    print(tokenizer.decode(seq))




[CLS] Design Thinking for innovation reflexion[SEP]
[CLS] innovation reflexion-Avril[SEP]
[CLS]-Avril 2021-N[SEP]
[CLS] 2021-Nathalie S[SEP]
[CLS]athalie Sylla Challenge &[SEP]
[CLS]ylla Challenge & selection The tool[SEP]
[CLS] selection The tool I use to[SEP]
[CLS] I use to help all stakeholders[SEP]
[CLS] help all stakeholders finding their way[SEP]
[CLS] finding their way through the complexity[SEP]
[CLS] through the complexity of a project[SEP]
[CLS] of a project is the mind[SEP]
[CLS] is the mind map..[SEP]
[CLS] map.... by[SEP]
[CLS].. by Nathalie Sylla[SEP]


In [5]:
print(tk["input_ids"])

[[1, 2169, 12103, 270, 3513, 28310, 4593, 2], [1, 3513, 28310, 4593, 271, 57498, 24360, 2], [1, 271, 57498, 24360, 16789, 271, 1609, 2], [1, 16789, 271, 1609, 30065, 12287, 662, 2], [1, 30065, 12287, 662, 86260, 6738, 429, 2], [1, 86260, 6738, 429, 1857, 279, 1637, 2], [1, 1857, 279, 1637, 273, 380, 264, 2], [1, 273, 380, 264, 408, 305, 6998, 2], [1, 408, 305, 6998, 1879, 308, 384, 2], [1, 1879, 308, 384, 390, 262, 6870, 2], [1, 390, 262, 6870, 265, 266, 663, 2], [1, 265, 266, 663, 269, 262, 791, 2], [1, 269, 262, 791, 2269, 260, 323, 2], [1, 2269, 260, 323, 260, 260, 293, 2], [1, 260, 260, 293, 68949, 662, 86260, 2]]


In [2]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")



In [17]:
with open('../data/train.json') as f:
    data = json.load(f)

In [18]:
LABELS = ['B-EMAIL',
        'B-ID_NUM',
        'B-NAME_STUDENT',
        'B-PHONE_NUM',
        'B-STREET_ADDRESS',
        'B-URL_PERSONAL',
        'B-USERNAME',
        'I-ID_NUM',
        'I-NAME_STUDENT',
        'I-PHONE_NUM',
        'I-STREET_ADDRESS',
        'I-URL_PERSONAL',
        'O']

In [19]:
labels_count = {label: 0 for label in LABELS}
for d in data:
    unique_labels = set(d['labels'])
    for label in unique_labels:
        labels_count[label] += 1


print(labels_count)

{'B-EMAIL': 24, 'B-ID_NUM': 33, 'B-NAME_STUDENT': 891, 'B-PHONE_NUM': 4, 'B-STREET_ADDRESS': 2, 'B-URL_PERSONAL': 72, 'B-USERNAME': 5, 'I-ID_NUM': 1, 'I-NAME_STUDENT': 814, 'I-PHONE_NUM': 3, 'I-STREET_ADDRESS': 2, 'I-URL_PERSONAL': 1, 'O': 6807}


In [20]:
labels_count = {label: 0 for label in LABELS}
for d in data:
    if d["document"] % 4 == 0:
        unique_labels = set(d['labels'])
        for label in unique_labels:
            labels_count[label] += 1


print(labels_count)

{'B-EMAIL': 4, 'B-ID_NUM': 10, 'B-NAME_STUDENT': 232, 'B-PHONE_NUM': 0, 'B-STREET_ADDRESS': 0, 'B-URL_PERSONAL': 18, 'B-USERNAME': 1, 'I-ID_NUM': 1, 'I-NAME_STUDENT': 210, 'I-PHONE_NUM': 0, 'I-STREET_ADDRESS': 0, 'I-URL_PERSONAL': 0, 'O': 1698}


In [22]:
print(len(valid_doc_ids))
labels_count = {label: 0 for label in LABELS}
for d in data:
    if d["document"] in valid_doc_ids:
        unique_labels = set(d['labels'])
        for label in unique_labels:
            labels_count[label] += 1


print(labels_count)

1146
{'B-EMAIL': 13, 'B-ID_NUM': 13, 'B-NAME_STUDENT': 124, 'B-PHONE_NUM': 4, 'B-STREET_ADDRESS': 2, 'B-URL_PERSONAL': 26, 'B-USERNAME': 5, 'I-ID_NUM': 1, 'I-NAME_STUDENT': 117, 'I-PHONE_NUM': 3, 'I-STREET_ADDRESS': 2, 'I-URL_PERSONAL': 0, 'O': 1146}


In [10]:
with open("../data/mixtral-8x7b-v1.json", "r") as f:
    external_data = json.load(f)

In [32]:
for i, row in enumerate(data):
    if row["document"] in valid_doc_ids:
        row["valid"] = True
    else:
        row["valid"] = False

In [35]:
total_valid_count = len([row for row in data if row["valid"]])
total_valid_count

1146

In [39]:
with open("train.json", "r") as f:
    data = json.load(f)

type(data[0]["valid"])

bool

In [12]:
labels_count = {label: 0 for label in LABELS}
for d in external_data:
    unique_labels = set(d['labels'])
    for label in unique_labels:
        labels_count[label] += 1


print(labels_count)

{'B-EMAIL': 1590, 'B-ID_NUM': 1565, 'B-NAME_STUDENT': 2355, 'B-PHONE_NUM': 1568, 'B-STREET_ADDRESS': 1526, 'B-URL_PERSONAL': 1842, 'B-USERNAME': 1561, 'I-ID_NUM': 347, 'I-NAME_STUDENT': 2355, 'I-PHONE_NUM': 1154, 'I-STREET_ADDRESS': 1526, 'I-URL_PERSONAL': 0, 'O': 2355}


In [5]:
def tokenize(example, tokenizer, label2id, max_length):
    """tokenize the examples"""
    text = []
    labels = []
    token_map = [] # Character index to spacy token mapping

    token_map_idx = 0
    for t, l, ws in zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"]):
        text.append(t)
        labels.extend([l]*len(t))
        token_map.extend([token_map_idx] * len(t))
        if ws:
            text.append(" ")
            labels.append("O")
            token_map.append(-1)

        token_map_idx += 1


    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation = True, max_length=max_length, return_overflowing_tokens=True, stride = 0)
    
    labels = np.array(labels)
    
    text = "".join(text)
    token_labels = []
    num_sequences = len(tokenized["input_ids"])
    for sequence_idx in range(num_sequences):
        offset_mapping_sequence = tokenized["offset_mapping"][sequence_idx]
        token_labels_sequence = []
        for start_idx, end_idx in offset_mapping_sequence:
            
            # CLS token
            if start_idx == 0 and end_idx == 0: 
                token_labels_sequence.append(label2id["O"])
                continue
            
            # case when token starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1
            
            while start_idx >= len(labels):
                start_idx -= 1
                
            token_labels_sequence.append(label2id[labels[start_idx]])
        
        token_labels.append(token_labels_sequence)
    #preds, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
    token_map = [token_map for _ in range(num_sequences)]
    document = [example["document"] for _ in range(num_sequences)]
    fold = [example["fold"] for _ in range(num_sequences)]
    tokens = [example["tokens"] for _ in range(num_sequences)]
        
    return {
        **tokenized,
        "labels": token_labels,
        "token_map": token_map,
        "document": document,
        "fold": fold,
        "tokens": tokens
    }

In [6]:
with open("../data/train.json") as f:
    data = json.load(f)

ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "provided_labels": [x["labels"] for x in data],
    "fold": [x["document"] % 4 for x in data]
})

label2id = {label: i for i, label in enumerate(LABELS)}
id2label = {i: label for i, label in enumerate(LABELS)}

# lots of newlines in the text
# adding this should be helpful
tokenizer.add_tokens(AddedToken("\n", normalized=False))


ds = ds.map(
    tokenize, 
    fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": 512}, 
    num_proc=4,
).remove_columns(["full_text", "trailing_whitespace", "provided_labels"])

Map (num_proc=4):   0%|          | 0/6807 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [12]:
list(ds.features.keys())

['document',
 'tokens',
 'fold',
 'input_ids',
 'token_type_ids',
 'attention_mask',
 'offset_mapping',
 'overflow_to_sample_mapping',
 'labels',
 'token_map']

In [15]:
from itertools import chain

def build_flatten_dataset(ds):
    features = list(ds.features.keys())
    dataset_dict = {feature: [] for feature in features}

    for example in tqdm(ds, total=len(ds)):
        num_sequences = len(example["input_ids"])

        #Also make sure everything is a list
        for feature in features:
            assert isinstance(example[feature], list), f"Feature {feature} is not a list"
        for feature in features:
            dataset_dict[feature].extend(example[feature])

    return Dataset.from_dict(dataset_dict)

def build_flatten_ds_efficient(ds):
    features = ds.features.keys()
    # Initialize the dataset dictionary with empty lists for each feature
    dataset_dict = {feature: [] for feature in features}

    # Use list comprehension and map to efficiently process and extend all features at once
    for feature in tqdm(features, total=len(features)):
        example_feature = ds[feature]
        dataset_dict[feature] = list(chain.from_iterable(example_feature))

    return Dataset.from_dict(dataset_dict)

In [8]:
final_ds = build_flatten_dataset(ds)

100%|██████████| 6807/6807 [00:35<00:00, 190.37it/s]


In [14]:
len(ds["offset_mapping"])

6807

In [16]:
build_flatten_ds_efficient(ds)

Exception ignored in: <function _xla_gc_callback at 0x10bfea040>
Traceback (most recent call last):
  File "/Users/jashdalvi/miniforge3/envs/ml/lib/python3.9/site-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


In [16]:
final_ds["document"]

[7,
 7,
 10,
 10,
 16,
 16,
 20,
 20,
 20,
 56,
 56,
 56,
 86,
 93,
 104,
 104,
 112,
 112,
 123,
 123,
 123,
 123,
 136,
 136,
 166,
 166,
 204,
 204,
 214,
 214,
 269,
 269,
 288,
 288,
 308,
 308,
 308,
 308,
 317,
 317,
 324,
 324,
 330,
 330,
 330,
 330,
 333,
 333,
 344,
 356,
 356,
 375,
 375,
 379,
 379,
 470,
 470,
 472,
 472,
 472,
 591,
 591,
 607,
 609,
 609,
 616,
 616,
 616,
 651,
 651,
 659,
 659,
 671,
 714,
 730,
 730,
 736,
 736,
 760,
 760,
 760,
 828,
 828,
 1105,
 1105,
 1134,
 1175,
 1175,
 1185,
 1185,
 1185,
 1210,
 1221,
 1221,
 1239,
 1239,
 1277,
 1277,
 1290,
 1290,
 1295,
 1295,
 1309,
 1309,
 1325,
 1325,
 1353,
 1353,
 1437,
 1437,
 1444,
 1444,
 1447,
 1447,
 1472,
 1472,
 1477,
 1477,
 1546,
 1546,
 1546,
 1549,
 1549,
 1549,
 1578,
 1578,
 1613,
 1613,
 1753,
 1753,
 1758,
 1758,
 1763,
 1763,
 1769,
 1769,
 1790,
 1790,
 1795,
 1795,
 1795,
 1798,
 1802,
 1802,
 1802,
 1802,
 1810,
 1810,
 1814,
 1817,
 1817,
 1824,
 1824,
 2054,
 2054,
 2058,
 2058,


In [68]:
data[0]["provided_labels"] = data[0]["labels"].copy()
output = tokenize(data[0], tokenizer, {label: i for i, label in enumerate(LABELS)}, 512)

In [71]:
output["document"]

[7, 7]

In [69]:
data[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'provided_labels'])

In [70]:
output.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'labels', 'length', 'token_map', 'document'])

In [65]:
len(output["token_map"][0])

3709

In [66]:
len(data[0]["full_text"])

3709

In [45]:
output = tokenized = tokenizer(data[0]["full_text"], return_offsets_mapping=True, truncation = True, max_length=512, return_overflowing_tokens=True, stride = 256)

In [46]:
output

{'input_ids': [[1, 2169, 12103, 270, 3513, 28310, 4593, 271, 57498, 24360, 16789, 271, 1609, 30065, 12287, 662, 86260, 6738, 429, 1857, 279, 1637, 273, 380, 264, 408, 305, 6998, 1879, 308, 384, 390, 262, 6870, 265, 266, 663, 269, 262, 791, 2269, 260, 458, 1444, 269, 266, 791, 2269, 302, 1663, 264, 262, 3742, 265, 72791, 1398, 897, 260, 263, 72791, 1398, 736, 260, 287, 15724, 261, 10040, 268, 5152, 271, 92671, 2531, 280, 51388, 260, 3045, 294, 9110, 25247, 42255, 268, 1931, 280, 65426, 7933, 260, 285, 261, 262, 791, 2269, 287, 698, 59729, 6000, 285, 269, 266, 4981, 5190, 3395, 272, 3832, 262, 1008, 7392, 265, 262, 791, 263, 1279, 262, 1959, 280, 268, 1068, 264, 282, 1315, 260, 45110, 30097, 435, 329, 1637, 303, 386, 5228, 294, 1795, 325, 269, 3469, 264, 305, 263, 490, 298, 1449, 1318, 1146, 1578, 263, 295, 282, 619, 1126, 1795, 325, 269, 18440, 1795, 325, 1279, 51669, 263, 9563, 265, 439, 1795, 325, 295, 282, 2312, 264, 356, 810, 265, 1364, 294, 1521, 14850, 261, 735, 7273, 261, 1423, 2

In [49]:
len(output["input_ids"])

2

In [50]:
output['overflow_to_sample_mapping']

[0, 0]

In [36]:
len(output["offset_mapping"][1])

216