# TAB Pre-Processing
This notebook contains functions that process the Text Anonymization Benchmark (TAB) data from Hugging Face into the token classification format used by HuggingFace datasets.

- Test code: [eda/TAB EDA](https://colab.research.google.com/drive/1ws_khqnClYYwYzfMG7dIZ40nOGZyCIYq#scrollTo=Edduhm_q_56L)
- JSON output: [/data/tab](https://drive.google.com/drive/folders/1C3h3rXdbr9nVAC3_G_I-72DfKNiDU_Pa)
- HuggingFace dataset output: [/data/tab/longformer]
    - [/data/tab/longformer](https://drive.google.com/drive/folders/1UQfl6oXyYt4Eepudmgi6A9xMAkqBuaHf) - for multiclassification; label names = ner_labels; mask_labels
    - [/data/tab/longformer_mask](https://drive.google.com/drive/folders/1bgkTuZ428fLdnFrtq0BWJcTBT3lpNXbK) - single classification; labels
    - [/data/tab/longformer_ner](https://drive.google.com/drive/folders/1M8KiTXhpdkiMzJRqcLX0X7KY0dCbqw3t) - single classification; labels

## Imports and Functions
- Functions:
    - tab_annotation_duplicates(dataset)
    - tab_data_loader(dataset)
    - ner_processor(doc_id, text, annotation)
- Global Variables:
    - Spacy model: nlp = 'en_core_web_lg'

In [None]:
!pip3 install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the f

In [None]:
from datasets import load_dataset, load_from_disk, Dataset
import pprint
import json
import pandas as pd
import spacy
import spacy.cli
from sklearn.model_selection import train_test_split
from transformers import LongformerTokenizerFast

from google.colab import drive

# Mount Google Drive so the notebook can read and write the project data stored there.
drive.mount('/content/drive')
# Project root on Drive; the cells below build every data path from this prefix.
path = '/content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project'

Mounted at /content/drive


In [None]:
# spaCy pipeline used by ner_processor to split documents into tokens.
# Download the model only when it is not installed yet, so re-running this cell
# does not fetch it again.
try:
    nlp = spacy.load('en_core_web_lg')
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load('en_core_web_lg')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
def create_tab_df(tab_dataset):
    """
    Flattens the entity mentions of one TAB split into a pandas dataframe (one row per mention).
    Input:
        tab_dataset: a single split that exposes 'annotations' and 'doc_id' columns, e.g.
            ds['train'] of "mattmdjaga/text-anonymization-benchmark-train" or
            ds['validation'] / ds['test'] of "mattmdjaga/text-anonymization-benchmark-val-test"
    Output:
        df: every field of each entity mention plus two added columns, 'doc_id' and 'annotator'
            (an empty dataframe when there are no mentions)
    """
    annotations = tab_dataset['annotations']
    doc_ids = tab_dataset['doc_id']

    rows = []
    for doc_id, sample in zip(doc_ids, annotations):
        for annot, annot_data in sample.items():
            # skip annotators whose entry for this document is empty / None
            if not annot_data:
                continue
            for mention in annot_data['entity_mentions']:
                # build a new dict so the caller's annotation records are left untouched
                rows.append({**mention, 'doc_id': doc_id, 'annotator': annot})

    return pd.DataFrame(rows)

def tab_annotation_duplicates(dataset):
    """
    Summarises which annotators labelled each document and flags documents with several annotators,
    for analysis and exclusion from the pipeline.
    Input:
        dataset = a single split, e.g. dataset['train'], dataset['validation']
    Returns:
        df = one row per document: index, doc_id, annotators, annotator_count, is_multi
        mult_annotations = list of doc_ids that have more than one annotation
        annotation_dict = {doc_id: list of annotators whose annotation is not None}
    """
    records = []
    for position, (doc_id, doc_annotation) in enumerate(zip(dataset['doc_id'], dataset['annotations'])):
        # annotators that actually supplied an annotation for this document
        present = [name for name in doc_annotation if doc_annotation[name] is not None]
        n_present = len(present)
        records.append((position, doc_id, present, n_present, 1 if n_present > 1 else 0))

    df = pd.DataFrame(records, columns=['index', 'doc_id', 'annotators', 'annotator_count', 'is_multi'])
    mult_annotations = [doc_id for _, doc_id, _, _, is_multi in records if is_multi == 1]
    annotation_dict = {doc_id: present for _, doc_id, present, _, _ in records}

    return df, mult_annotations, annotation_dict

def tab_data_loader(dataset):
    """
    Pulls the three columns needed for processing out of one dataset split.
    Input:
        dataset: a single split, e.g. dataset['train'], dataset['validation']
    Returns:
        doc_ids = the 'doc_id' column
        doc_texts = the 'text' column
        annotations = the 'annotations' column
    """
    return dataset['doc_id'], dataset['text'], dataset['annotations']

def ner_processor(doc_id, text, annotation):
    """
    Processes TAB data by splitting text into tokens via spacy and applying ner and mask tags to token spans
    to match huggingface datasets.

    A token is tagged B-<type> when its character offset equals a mention's start_offset, I-<type> when
    its offset lies strictly between start_offset and end_offset, and 'O' (0) otherwise. Every token of
    the document is emitted exactly once, including the text after the last mention.
    Input:
        doc_id, text, annotation = individual record; annotation is the list of entity mentions
            (dicts with 'start_offset', 'end_offset', 'entity_type', 'identifier_type') and may be empty or None
    Output:
        processed sample: {'id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text'}
    Raises:
        ValueError: if a matched mention has an entity_type / identifier_type that is not in the label lists
    """
    doc = nlp(text)

    ner_type = ['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
                'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC']
    masking_type = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']

    # process data to match conll / ner_tag format
    ner_tags = []
    mask_tags = []
    text_spans = []
    tokens = []

    def _emit(token, ner_tag, mask_tag):
        # record one token together with its two tag ids and its character offset
        ner_tags.append(ner_tag)
        mask_tags.append(mask_tag)
        text_spans.append(token.idx)
        tokens.append(token.text)

    # The single forward pass below only works when mentions are visited in order of start offset;
    # the sort is stable, so input that is already ordered is processed exactly as given.
    mentions = sorted(annotation or [], key=lambda mention: mention['start_offset'])

    n_tokens = len(doc)
    cursor = 0  # index in doc of the next token that has not been emitted yet
    for ner in mentions:
        while cursor < n_tokens:
            token = doc[cursor]
            if token.idx == ner['start_offset']:
                prefix = 'B'
            elif ner['start_offset'] < token.idx < ner['end_offset']:
                prefix = 'I'
            elif token.idx >= ner['end_offset']:
                # past this mention: leave the token for the next mention (or the tail loop)
                break
            else:
                prefix = None

            if prefix is None:
                _emit(token, 0, 0)
            else:
                _emit(token,
                      ner_type.index(f"{prefix}-{ner['entity_type']}"),
                      masking_type.index(f"{prefix}-{ner['identifier_type']}"))
            cursor += 1

    # finish processing text after last annotation (or the whole text when there are no annotations)
    for index in range(cursor, n_tokens):
        _emit(doc[index], 0, 0)

    # compile sample into row; test how to merge with huggingface dataset structure
    processed_sample = {'id': doc_id, 'ner_tags': ner_tags, 'mask_tags': mask_tags, 'text_spans': text_spans, 'tokens': tokens, 'text': text}

    return processed_sample

def tokenize_and_align_labels(examples, label_all_tokens=True, task='ner'):
    """
    Tokenizes and aligns labels to match longformer tokenizer strategy; function should work as expected for other BERT based models.
    Uses the module-level `tokenizer`.
    Input:
        examples: batch of examples from the dataset (as passed by map(..., batched=True)) with a
            'tokens' column and a '<task>_tags' column for each requested task
        label_all_tokens: when True every sub-token of a word gets the word's label; when False only
            the first sub-token is labelled and the others get -100
        task: 'ner', 'mask' or 'both'
    Output:
        tokenized inputs with word-aligned labels under 'labels'. With task='both' the labels of each
        task are also stored under 'ner_labels' and 'mask_labels'; 'labels' then holds the mask
        labels, as it did before.
    """
    tokenized_inputs = tokenizer(examples["tokens"], padding='max_length', max_length=4096, truncation=True, is_split_into_words=True)

    tasks = ['ner', 'mask'] if task == 'both' else [task]
    for t in tasks:
        labels = []
        for i, label in enumerate(examples[f'{t}_tags']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label[word_idx] if label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        if task == 'both':
            # keep each task's labels; a single 'labels' key would be overwritten by the second task
            tokenized_inputs[f'{t}_labels'] = labels
        tokenized_inputs['labels'] = labels
    return tokenized_inputs

# general functions
def select_data(split, task, size):
    """
    Loads the appropriate dataset per folder structure here: https://drive.google.com/drive/folders/1C3h3rXdbr9nVAC3_G_I-72DfKNiDU_Pa
    Input:
        Split: ['train', 'val', 'test']
        Task: ['ner', 'mask', 'both', 'binary']
        Size: ['testing', 'mini', 'full']
    Returns:
        Huggingface dataset loaded from {path}/data/tab/<task folder>/<split folder>
    Raises:
        ValueError: if split, task or size is not one of the accepted values
    """
    if split not in ['train', 'val', 'test']:
        raise ValueError("Split value must be in ['train', 'val', 'test']")
    if task not in ['ner', 'mask', 'both', 'binary']:
        raise ValueError("Task value must be in ['ner', 'mask', 'both', 'binary']")
    if size not in ['testing', 'mini', 'full']:
        raise ValueError("Size value must be in ['testing', 'mini', 'full']")

    path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_mask', 'binary': 'longformer_binary'}

    if size == 'testing':
        folder = f'lf_{split}_testing'
    elif size == 'mini':
        # NOTE(review): the tokenization cells in this notebook save the mini sets as
        # lf_train_160 / lf_val_20 / lf_test_20 - confirm the _400 / _50 folders exist on Drive.
        folder = f'lf_{split}_400' if split == 'train' else f'lf_{split}_50'
    else:
        folder = f'lf_{split}'

    return load_from_disk(f'{path}/data/tab/{path_label[task]}/{folder}')

def convert_to_binary_class(sample_list):
    """
    Rewrites mask label ids so that QUASI tags reuse the DIRECT tag ids.
    Input:
        sample_list: list of documents, each a list of integer label ids indexing masking_type
            (negative ids such as -100 are allowed)
    Output:
        new list of lists in which B-QUASI becomes 3 and I-QUASI becomes 4; negative ids and all
        other ids are copied unchanged
    """
    masking_type = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']
    quasi_replacement = {'B-QUASI': 3, 'I-QUASI': 4}

    def _remap(label_id):
        # negative ids are not tag ids, so they pass through untouched
        if label_id < 0:
            return label_id
        return quasi_replacement.get(masking_type[label_id], label_id)

    return [[_remap(label_id) for label_id in doc] for doc in sample_list]

## Create TAB dataframe

In [None]:
# Build one mention-level dataframe per split. Only the last expression of a cell is rendered,
# so the shapes are printed and a single preview is shown at the end.
tab_dataset = load_dataset("mattmdjaga/text-anonymization-benchmark-train")
df_train = create_tab_df(tab_dataset['train'])
print(df_train.shape)

tab_dataset = load_dataset("mattmdjaga/text-anonymization-benchmark-val-test")
df_val = create_tab_df(tab_dataset['validation'])
print(df_val.shape)

df_test = create_tab_df(tab_dataset['test'])
print(df_test.shape)
df_test.head()

In [None]:
# Stack the train, validation and test mention frames into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_val, df_test], ignore_index=True)
df.shape

In [None]:
# Number of distinct annotators per document, attached to every mention row.
# assign() replaces an existing 'num_annotators' column, so re-running this cell
# does not create a duplicate column.
num_annotators = df.groupby('doc_id').annotator.nunique()
df = df.assign(num_annotators=df['doc_id'].map(num_annotators))

In [None]:
# Persist the combined dataframe to Drive and read it back from the same file.
df_path = f'{path}/data/tab/tab_dataframe.csv'

df.to_csv(df_path)
# read the CSV that was just written (df_path), not the project directory (path)
df = pd.read_csv(df_path, index_col=0)

## HuggingFace Dataset Transformation

### Train

In [None]:
# Load the TAB train dataset from the Hugging Face Hub (its 'train' split holds 1014 examples per the output below).
tab_dataset = load_dataset("mattmdjaga/text-anonymization-benchmark-train")

README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.68M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1014 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): `processed_list` must already hold the processed train records. It is built by the
# "# train" cell further down, so on a fresh kernel that cell has to be run before this one.
# Split the record indices 84% / 16%, then halve the 16% into test and validation indices.
X_train, X_test, y_train, y_test = train_test_split(range(len(processed_list)), range(len(processed_list)), test_size = 0.16, random_state=1234)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1234)

print(len(X_train))
print(len(X_val))
print(len(X_test))
# Replace the index lists with the processed records they point to.
X_train = [processed_list[i] for i in X_train]
X_val = [processed_list[i] for i in X_val]
X_test = [processed_list[i] for i in X_test]



795
76
76


In [None]:
# Write the train records as JSON lines: a 40-record smoke-test file, the 160-record subset and
# the full split. The tokenization cells below read all three files, so all three are written here.
train_subsets = {
    'train_tab_model_testing': X_train[:40],
    'train_tab_160': X_train[:160],
    'train_tab': X_train,
}
for file_stem, records in train_subsets.items():
    with open(f'{path}/data/tab/{file_stem}.json', 'w') as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

In [None]:
# Sanity check: reload the JSON-lines file written above and display the resulting DatasetDict.
ds = load_dataset('json', data_files=f'{path}/data/tab/train_tab_160.json')
ds

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text'],
        num_rows: 160
    })
})

### Validation and Test

In [None]:
# Load the TAB validation/test dataset; this rebinds `tab_dataset`, which no longer holds the train dataset.
tab_dataset = load_dataset("mattmdjaga/text-anonymization-benchmark-val-test")

README.md:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/127 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/127 [00:00<?, ? examples/s]

In [None]:
# validation
df, mult_annotations, annotation_dict = tab_annotation_duplicates(tab_dataset['validation'])
print('doc ids with multiple annotations:', len(mult_annotations))

doc_ids, doc_texts, annotations = tab_data_loader(tab_dataset['validation'])

# Process only the documents labelled by a single annotator, using that annotator's entity mentions.
processed_list = []
for i, d_id in enumerate(doc_ids):
    if d_id in mult_annotations:
        continue
    if i % 100 == 0:
        print('progress', i)
    text = doc_texts[i]
    annotator = annotation_dict[d_id][0]
    annotation = annotations[i][annotator]['entity_mentions']
    processed_list.append(ner_processor(d_id, text, annotation))

print('orig_val_samples', len(processed_list))
# Put the original validation records in front of the existing X_val records.
# Re-running this cell prepends them again.
X_val = processed_list + X_val
print('val_samples', len(X_val))

doc ids with multiple annotations: 102
orig_val_samples 25
val_samples 101


In [None]:
# train
# Load the train split explicitly instead of reading `tab_dataset`: by this point in the notebook
# `tab_dataset` holds the val/test dataset, which has no 'train' split.
# NOTE: the train/val/test split cell above consumes the `processed_list` built here, so run this
# cell before it on a fresh kernel.
train_split = load_dataset("mattmdjaga/text-anonymization-benchmark-train")['train']

df, mult_annotations, annotation_dict = tab_annotation_duplicates(train_split)
print('doc ids with multiple annotations:', len(mult_annotations))

doc_ids, doc_texts, annotations = tab_data_loader(train_split)

# set lookup: membership is tested once per document
multi_annotated_ids = set(mult_annotations)

processed_list = []
for i, d_id in enumerate(doc_ids):
    if d_id not in multi_annotated_ids:
        if i % 100 == 0:
            print('progress', i)
        text = doc_texts[i]
        # single-annotator document: use that annotator's entity mentions
        annotation = annotations[i][annotation_dict[d_id][0]]['entity_mentions']
        processed_list.append(ner_processor(d_id, text, annotation))

print('train_samples', len(processed_list))

doc ids with multiple annotations: 67
progress 100
progress 200
progress 300
progress 400
progress 500
progress 600
progress 700
progress 800
progress 900
progress 1000
train_samples 947


In [None]:
# Write the validation records as JSON lines: a 5-record smoke-test file, the 20-record subset and
# the full split. The tokenization cells below read all three files, so all three are written here.
val_subsets = {
    'val_tab_model_testing': X_val[:5],
    'val_tab_20': X_val[:20],
    'val_tab': X_val,
}
for file_stem, records in val_subsets.items():
    with open(f'{path}/data/tab/{file_stem}.json', 'w') as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

In [None]:
# Sanity check: reload the validation JSON-lines file written above and display the resulting DatasetDict.
ds = load_dataset('json', data_files=f'{path}/data/tab/val_tab_20.json')
ds

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text'],
        num_rows: 20
    })
})

In [None]:
# test
df, mult_annotations, annotation_dict = tab_annotation_duplicates(tab_dataset['test'])
print('doc ids with multiple annotations:', len(mult_annotations))

doc_ids, doc_texts, annotations = tab_data_loader(tab_dataset['test'])

# Process only the documents labelled by a single annotator, using that annotator's entity mentions.
processed_list = []
for i, d_id in enumerate(doc_ids):
    if d_id in mult_annotations:
        continue
    if i % 100 == 0:
        print('progress', i)
    text = doc_texts[i]
    annotator = annotation_dict[d_id][0]
    annotation = annotations[i][annotator]['entity_mentions']
    processed_list.append(ner_processor(d_id, text, annotation))

print('orig_test_samples', len(processed_list))
# Put the original test records in front of the existing X_test records.
# Re-running this cell prepends them again.
X_test = processed_list + X_test
print('test_samples', len(X_test))

doc ids with multiple annotations: 105
progress 0
orig_test_samples 22
test_samples 98


In [None]:
# Write the test records as JSON lines: a 5-record smoke-test file, the 20-record subset and
# the full split. The tokenization cells below read all three files, so all three are written here.
test_subsets = {
    'test_tab_model_testing': X_test[:5],
    'test_tab_20': X_test[:20],
    'test_tab': X_test,
}
for file_stem, records in test_subsets.items():
    with open(f'{path}/data/tab/{file_stem}.json', 'w') as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

In [None]:
# Sanity check: reload the test JSON-lines file written above and display the resulting DatasetDict.
ds = load_dataset('json', data_files=f'{path}/data/tab/test_tab_20.json')
ds

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text'],
        num_rows: 20
    })
})

## Longformer Tokenization

In [None]:
# Fast Longformer tokenizer, read as the global `tokenizer` by tokenize_and_align_labels.
# add_prefix_space=True is passed because that function feeds it pre-split words (is_split_into_words=True).
model_checkpoint = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(model_checkpoint, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
task = 'ner' # update to change which labels are included in the hf ds and the folder where the ds is saved
# maps each task to the sub-folder of {path}/data/tab/ that the tokenized datasets are saved under
path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_mask', 'binary': 'longformer_binary'}
# previous mapping, kept for reference:
# path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_4096'}

In [None]:
# train, smoke-test subset: tokenize and align labels, save to Drive, then reload to confirm the save
ds = load_dataset('json', data_files=f'{path}/data/tab/train_tab_model_testing.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_train_testing')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_train_testing')
ds

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/40 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 40
    })
})

In [None]:
# train, 160-record subset: tokenize and align labels, save to Drive, then reload to confirm the save
# NOTE(review): select_data(size='mini') loads lf_train_400, not lf_train_160 - confirm which folder is intended
ds = load_dataset('json', data_files=f'{path}/data/tab/train_tab_160.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_train_160')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_train_160')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/160 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 160
    })
})

In [None]:
# train, full split: tokenize and align labels, save to Drive, then reload to confirm the save
ds = load_dataset('json', data_files=f'{path}/data/tab/train_tab.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_train')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_train')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/795 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 795
    })
})

In [None]:
# validation, smoke-test subset: tokenize and align labels, save to Drive, then reload to confirm the save
ds = load_dataset('json', data_files=f'{path}/data/tab/val_tab_model_testing.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_val_testing')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_val_testing')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})

In [None]:
# validation, 20-record subset: tokenize and align labels, save to Drive, then reload to confirm the save
# NOTE(review): select_data(size='mini') loads lf_val_50, not lf_val_20 - confirm which folder is intended
ds = load_dataset('json', data_files=f'{path}/data/tab/val_tab_20.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_val_20')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_val_20')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
})

In [None]:
# validation, full split: tokenize and align labels, save to Drive, then reload to confirm the save
ds = load_dataset('json', data_files=f'{path}/data/tab/val_tab.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_val')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_val')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 101
    })
})

In [None]:
# test, smoke-test subset: tokenize and align labels, save to Drive, then reload to confirm the save
ds = load_dataset('json', data_files=f'{path}/data/tab/test_tab_model_testing.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_test_testing')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_test_testing')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5
    })
})

In [None]:
# test, 20-record subset: tokenize and align labels, save to Drive, then reload to confirm the save
# NOTE(review): select_data(size='mini') loads lf_test_50, not lf_test_20 - confirm which folder is intended
ds = load_dataset('json', data_files=f'{path}/data/tab/test_tab_20.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_test_20')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_test_20')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
})

In [None]:
# test, full split: tokenize and align labels, save to Drive, then reload to confirm the save
ds = load_dataset('json', data_files=f'{path}/data/tab/test_tab.json')
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(x, task=task), batched=True)

tokenized_datasets.save_to_disk(f'{path}/data/tab/{path_label[task]}/lf_test')
ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_test')
ds

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/98 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 98
    })
})

# Change to Binary Label

In [None]:
# Settings for the binary conversion below.
task = 'mask'         # task folder the source dataset is loaded from
size = 'full'         # dataset size passed to select_data
split = 'test'        # split to convert
task_save = 'binary'  # task folder the converted dataset is saved under

In [None]:
# Load the tokenized dataset selected by the settings above.
ds_orig = select_data(split=split, task=task, size=size)
ds_orig

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 98
    })
})

In [None]:
# Map the QUASI tag ids onto the DIRECT tag ids in both the token-aligned labels and the word-level mask tags.
modified_labels = convert_to_binary_class(ds_orig['train']['labels'])
modified_mask_tags = convert_to_binary_class(ds_orig['train']['mask_tags'])

In [None]:
# Columns of the converted dataset: everything is copied from the source split except
# 'mask_tags' and 'labels', which take the converted values.
source_split = ds_orig['train']
dictionary = {
    'id': source_split['id'],
    'ner_tags': source_split['ner_tags'],
    'mask_tags': modified_mask_tags,
    'text_spans': source_split['text_spans'],
    'tokens': source_split['tokens'],
    'text': source_split['text'],
    'input_ids': source_split['input_ids'],
    'attention_mask': source_split['attention_mask'],
    'labels': modified_labels,
}

In [None]:
# Build a single Dataset (not a DatasetDict) from the column dictionary and display it.
ds = Dataset.from_dict(dictionary)
ds

Dataset({
    features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 98
})

In [None]:
# Show where the source dataset lives and where the converted one will be saved.
# The target folder follows `split`, so it stays correct when a different split is converted.
print(f'Original Path: {path}/data/tab/{path_label[task]}/lf_{split}')
print(f'New Path: {path}/data/tab/{path_label[task_save]}/lf_{split}')


Original Path: /content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/longformer_mask/lf_test
New Path: /content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/longformer_binary/lf_val


In [None]:
# Save the converted dataset under the folder of the split being converted (not a hard-coded lf_test).
ds.save_to_disk(f'{path}/data/tab/{path_label[task_save]}/lf_{split}')

Saving the dataset (0/1 shards):   0%|          | 0/98 [00:00<?, ? examples/s]