In [77]:
import numpy as np 
import json
import os
from transformers import AutoTokenizer
from tqdm import tqdm
from itertools import chain

In [78]:
# Load the training documents. The context manager closes the file handle
# as soon as parsing finishes, and the explicit encoding keeps decoding
# independent of the platform default.
with open('../data/train.json', encoding='utf-8') as f:
    data = json.load(f)

In [79]:
# Every distinct label string that occurs in any document, in sorted order.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))

In [80]:
# Display the sorted list of distinct labels.
all_labels

['B-EMAIL',
 'B-ID_NUM',
 'B-NAME_STUDENT',
 'B-PHONE_NUM',
 'B-STREET_ADDRESS',
 'B-URL_PERSONAL',
 'B-USERNAME',
 'I-ID_NUM',
 'I-NAME_STUDENT',
 'I-PHONE_NUM',
 'I-STREET_ADDRESS',
 'I-URL_PERSONAL',
 'O']

In [75]:
# Inspect which fields the first document provides.
data[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [52]:
# Preview the first ten entries of each per-token field of the first document.
x = data[0]

for field in ("tokens", "labels", "trailing_whitespace"):
    print(x[field][:10])

['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-', 'Nathalie']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT']
[True, True, True, True, False, False, True, False, False, True]


In [53]:
from itertools import chain

# Sorted list of distinct labels, plus lookup tables in both directions:
# label string -> integer id, and integer id -> label string.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

id2label

{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [54]:
# Use the first document as the running example for the cells below.
example = data[0]

In [55]:
# List the fields available on the example document.
example.keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [56]:
# Rebuild the document as a list of string pieces (`text`) together with a
# parallel list holding one label per character (`labels`).
text, labels = [], []

token_fields = zip(
    example["tokens"],
    example["labels"],
    example["trailing_whitespace"],
)
for token, tag, has_space in token_fields:
    # Every character of the token carries the token's label.
    text += [token]
    labels += [tag] * len(token)

    # A trailing space becomes its own piece and is labelled "O".
    if has_space:
        text += [" "]
        labels += ["O"]

In [57]:
# Number of pieces (tokens plus separating spaces) collected in `text`.
len(text)

1320

In [58]:
# Number of character-level labels collected in `labels`.
len(labels)

3709

In [59]:
# Load the tokenizer for the "microsoft/deberta-v3-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")



In [60]:
# Tokenize the whole document in one pass, without truncation, and request the
# offset mapping so each token can be related back to positions in the text.
tokenized = tokenizer(
    "".join(text),
    return_offsets_mapping=True,
    truncation=False,
)

In [61]:
# Peek at the first ten (start, end) offsets produced by the tokenizer.
tokenized["offset_mapping"][:10]

[(0, 0),
 (0, 6),
 (6, 15),
 (15, 19),
 (19, 30),
 (30, 37),
 (37, 40),
 (40, 41),
 (41, 43),
 (43, 46)]

In [62]:
# Collapse the pieces into a single string. Note that `text` is rebound here
# from a list of pieces to a str, so later cells index it by character.
text = "".join(text)

In [63]:
# Length of the joined string in characters; this should equal len(labels),
# since one label was emitted per character.
len(text)

3709

In [64]:
# Give each of the first five tokenizer tokens a label id, taken from the
# character-level label at the token's first non-whitespace character.
token_labels = []
for tok_start, tok_end in tokenized.offset_mapping[:5]:

    # CLS token: its offset is the zero-length span (0, 0); label it "O".
    if (tok_start, tok_end) == (0, 0):
        token_labels.append(label2id["O"])
        continue

    # When the token begins with whitespace, look one character further.
    char_idx = tok_start + 1 if text[tok_start].isspace() else tok_start

    # Never index past the last character-level label.
    char_idx = min(char_idx, len(labels) - 1)

    token_labels.append(label2id[labels[char_idx]])
    print(char_idx, tok_end)
    print(token_labels)

length = len(tokenized.input_ids)

0 6
[12, 12]
7 15
[12, 12, 12]
16 19
[12, 12, 12, 12]
20 30
[12, 12, 12, 12, 12]


In [65]:
# Load a separate copy of the training documents. The context manager closes
# the file handle once parsing finishes, and the explicit encoding keeps
# decoding independent of the platform default.
with open('../data/train.json', encoding='utf-8') as f:
    train_data = json.load(f)

In [66]:
# Collect the `document` field of every training document, in file order.
doc_ids = [doc["document"] for doc in train_data]

In [67]:
# Lookup from a document's full text to its `document` value.
# NOTE(review): despite the name, the keys are the texts and the values are
# the ids. Documents with identical `full_text` collapse into a single entry,
# and the last one seen wins.
doc_id_to_text_mapping = {doc["full_text"]: doc["document"] for doc in train_data}

In [68]:
# Attach a `doc_id` field to every document (mutating it in place) and collect
# the documents in `data_with_id`.
#
# The id is read straight from the document's own `document` field. Recovering
# it through a lookup keyed on `full_text` is unnecessary and assigns the wrong
# id whenever two documents share identical text.
data_with_id = []
for doc in tqdm(data, total=len(data)):
    doc["doc_id"] = doc["document"]
    data_with_id.append(doc)

100%|██████████| 6807/6807 [00:00<00:00, 64711.60it/s]


In [69]:
# Number of folds. A document belongs to fold `document % N_FOLDS`, so the
# loop range and the modulus must always use the same value.
N_FOLDS = 4

# Report how many documents land in each fold and their share of the dataset.
for fold in range(N_FOLDS):
    fold_data = [doc for doc in train_data if doc["document"] % N_FOLDS == fold]
    print(f"Fold {fold} has {len(fold_data)} documents : {len(fold_data)/len(train_data) *  100:.2f}%")



Fold 0 has 1698 documents : 24.94%
Fold 1 has 1714 documents : 25.18%
Fold 2 has 1689 documents : 24.81%
Fold 3 has 1706 documents : 25.06%


In [70]:
# Number of documents in `data`.
len(data)

6807

In [71]:
# Number of documents in `train_data`.
len(train_data)

6807