# Data Exploration and Tokenization Preprocessing
In this notebook we: 
1. Explore the distribution of labels to answer questions such as: 
    - How many documents contain special labels
    - How many instances of special labels do we have
2. Tokenize the data with each candidate model's tokenizer and analyze the tokenized lengths against each model's TRAINING_MAX_LENGTH
3. Standardize training and testing data for fine-tuning, including
    - Choosing standard train/test split
    - Partitioning the data for models that split the tokenized text into chunks
    - Choosing standard set of files to downsample (if downsampling)
    - Saving .csv and .txt files of all model-standardized features for easy data loading

# Load data and packages
This part will need to be changed to run locally. 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv
/kaggle/input/pii-detection-removal-from-educational-data/train.json
/kaggle/input/pii-detection-removal-from-educational-data/test.json


In [37]:
import json
from itertools import chain

from transformers import AutoTokenizer,AutoModelForTokenClassification,Trainer, TrainingArguments, AutoConfig, DataCollatorForTokenClassification, TrainerCallback
from datasets import Dataset, features
from sklearn.model_selection import train_test_split
import random

In [3]:
# Load the competition data. The training JSON is read inside a context
# manager so the file handle is closed deterministically instead of being
# left open for the garbage collector.
with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json') as train_file:
    data_train = json.load(train_file)
data_test = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')

# One row per document for both splits.
df_train = pd.DataFrame(data_train)
df_test = pd.DataFrame(data_test)

In [4]:
# Report how many documents each split contains.
print(f"Length of Training Data is {len(df_train)}")
print(f"Length of Testing Data is {len(df_test)}")

Length of Training Data is 6807
Length of Testing Data is 10


In [5]:
# Preview the first rows of the training frame (displayed as the cell output).
df_train.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


# Distribution of Labels

In [6]:
# Collect every distinct label that appears in any training document,
# in sorted order so that label ids are stable between runs.
LABELS = sorted(set(chain.from_iterable(df_train.labels)))
num_labels = len(LABELS)

print(LABELS)
print('Length:', num_labels)

['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']
Length: 13


In [7]:
# Bidirectional lookup between label strings and their integer ids
# (the id is the label's position in the sorted LABELS list).
label2id = dict(zip(LABELS, range(len(LABELS))))
id2label = dict(enumerate(LABELS))

In [8]:
# Functions to find occurrences of each label
def find_label_instances(df, label):
    """Count how many times `label` occurs across all rows of `df`.

    Parameters
    ----------
    df : DataFrame with a ``labels`` column whose cells are lists.
    label : the value to count (a label string, or a label id when the
        lists hold ids).

    Returns
    -------
    int : total number of occurrences summed over every row.
    """
    # Iterate the column directly: the previous `df.iloc[i]` lookup built a
    # full row Series (including every large tokenized column) per document
    # just to read a single cell.
    return sum(labels.count(label) for labels in df["labels"])
    
def find_label_number(df,label):
    """Count how many rows of `df` contain `label` at least once.

    Parameters
    ----------
    df : DataFrame with a ``labels`` column whose cells are lists.
    label : the value to look for (a label string, or a label id when the
        lists hold ids).

    Returns
    -------
    int : number of rows whose ``labels`` list includes `label`.
    """
    # Iterate the column directly instead of materialising a row Series
    # with `df.iloc[i]` for every document.
    return sum(1 for labels in df["labels"] if label in labels)

In [9]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# First table: total number of labelled tokens per label.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(df_train, label)
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Second table: number of documents in which each label appears at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(df_train, label)
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES ---------- 

B-EMAIL              : 39
B-ID_NUM             : 78
B-NAME_STUDENT       : 1365
B-PHONE_NUM          : 6
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 110
B-USERNAME           : 6
I-ID_NUM             : 1
I-NAME_STUDENT       : 1096
I-PHONE_NUM          : 15
I-STREET_ADDRESS     : 20
I-URL_PERSONAL       : 1
O                    : 4989794


---------- NUMBER OF LABEL OCCURENCES ---------- 

B-EMAIL              : 24
B-ID_NUM             : 33
B-NAME_STUDENT       : 891
B-PHONE_NUM          : 4
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 72
B-USERNAME           : 5
I-ID_NUM             : 1
I-NAME_STUDENT       : 814
I-PHONE_NUM          : 3
I-STREET_ADDRESS     : 2
I-URL_PERSONAL       : 1
O                    : 6807


# Tokenized Length

In [10]:
def reconstruct(batch):
    """Rebuild a document's text and a per-character label array.

    `batch` provides parallel sequences under "tokens", "labels" and
    "trailing_whitespace". Each token contributes its own characters, all
    carrying the token's label; a token flagged with trailing whitespace is
    followed by one space character labelled "O".

    Returns a tuple ``(text, labels)`` where ``text`` is the joined string
    and ``labels`` is a numpy array with one entry per character of ``text``.
    """
    pieces = []
    char_labels = []

    triples = zip(batch["tokens"], batch["labels"], batch["trailing_whitespace"])
    for token, label, has_space in triples:
        pieces.append(token)
        char_labels += [label] * len(token)

        if not has_space:
            continue

        # The separating space never carries a PII label.
        pieces.append(" ")
        char_labels.append("O")

    return "".join(pieces), np.array(char_labels)

def tokenize(batch, tokenizer):
    """Tokenize one document and assign a label id to every produced token.

    The document text is rebuilt with `reconstruct`, tokenized without
    truncation, and each token takes the label of the first non-whitespace
    character it covers (looked up through the module-level `label2id`).

    Parameters
    ----------
    batch : row/mapping with "tokens", "labels" and "trailing_whitespace".
    tokenizer : callable accepting ``(text, return_offsets_mapping=True)``
        and returning an object exposing ``input_ids``, ``attention_mask``
        and ``offset_mapping``.

    Returns
    -------
    tuple : ``(input_ids, attention_mask, offset_mapping, token_labels, length)``
        where ``length`` is the number of input ids.
    """
    text, labels = reconstruct(batch)

    # No max_length / truncation here: the full token sequence is kept.
    tokenized = tokenizer(text, return_offsets_mapping=True)

    # Create labels for each token
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # Tokens with an empty (0, 0) span (e.g. the CLS token) map to no
        # characters of the text and are labelled "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # Case when the token starts with whitespace: use the label of the
        # following character. The bounds check keeps a whitespace token at
        # the very end of the text from indexing past the label array.
        if text[start_idx].isspace() and start_idx + 1 < len(text):
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return tokenized.input_ids, tokenized.attention_mask, tokenized.offset_mapping, token_labels, length

In [11]:
# RoBERTa settings from Hugging Face: checkpoint path and maximum sequence length.
TRAINING_MODEL_PATH = "FacebookAI/roberta-base"
TRAINING_MAX_LENGTH = 512
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
# Tokenize every document and store the five results in roberta_* columns.
df_train[[
    f"roberta_{field}"
    for field in ("input_ids", "attention_mask", "offset_mapping", "token_labels", "length")
]] = df_train.apply(tokenize, axis='columns', result_type='expand', args=(tokenizer,))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (835 > 512). Running this sequence through the model will result in indexing errors


In [12]:
# DeBERTa settings from Hugging Face: checkpoint path and maximum sequence length.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
TRAINING_MAX_LENGTH = 1024
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
# Tokenize every document and store the five results in deberta_* columns.
df_train[[
    f"deberta_{field}"
    for field in ("input_ids", "attention_mask", "offset_mapping", "token_labels", "length")
]] = df_train.apply(tokenize, axis='columns', result_type='expand', args=(tokenizer,))

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [13]:
# Count how many training documents tokenize to more than each candidate
# maximum length, for each model's tokenizer (counts, not percentages).
lengths = [512, 1024, 2048]
tot_space = 10

print(f"{'-' * 10} RoBerta Stats {'-' * 10} \n")
roberta_lengths = df_train['roberta_length']
for l in lengths:
    count = (roberta_lengths > l).sum()
    print(f"Length {l:<{tot_space}} : {count}")

print('\n')
print(f"{'-' * 10} DeBerta Stats {'-' * 10} \n")
deberta_lengths = df_train['deberta_length']
for l in lengths:
    count = (deberta_lengths > l).sum()
    print(f"Length {l:<{tot_space}} : {count}")


---------- RoBerta Stats ---------- 

Length 512        : 5495
Length 1024       : 1372
Length 2048       : 87


---------- DeBerta Stats ---------- 

Length 512        : 4907
Length 1024       : 762
Length 2048       : 14


# Documents for downsampling
If downsampling, we choose to eliminate a (standardized) random half of all documents with no PII labels

In [14]:
# Find documents with no "special" labels
def find_documents(df):
    """Return the `document` value of every row whose labels are all "O".

    `df` needs a ``document`` column and a ``labels`` column holding lists;
    the result preserves row order.
    """
    return [
        doc_id
        for doc_id, doc_labels in zip(df["document"], df["labels"])
        if all(tag == "O" for tag in doc_labels)
    ]

In [1]:
# Choose a reproducible random half of the PII-free documents to drop.
# `random.sample` draws WITHOUT replacement, so every selected document is
# distinct; the previous `random.choices` draws with replacement and could
# pick the same document several times, removing fewer than half. A
# dedicated, seeded generator makes the selection identical on every run.
bad_documents = find_documents(df_train)
n = len(bad_documents)
to_cut = random.Random(42).sample(bad_documents, k=n // 2)
print(len(to_cut))

NameError: name 'find_documents' is not defined

In [43]:
# Persist the selected document ids, one per line, for the training notebooks.
with open('docs2cut.txt', 'w') as file:
    file.writelines(f"{number}\n" for number in to_cut)

# Standardized Train/Test Split


In [16]:
# Add one boolean column per listed label, telling whether that label
# appears anywhere in the document.
for flag in ('B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT'):
    df_train[flag] = df_train.apply(
        lambda row: flag in set(row.labels), axis='columns', result_type='expand'
    )

In [17]:
# Stratified 75/25 split on the presence flags of the more frequent labels.
# B-EMAIL is deliberately left out of the stratification: too few documents
# contain an email for the split to work when it is included.
train, test = train_test_split(
    df_train,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=df_train[['B-URL_PERSONAL', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM']],
)

In [18]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# Total number of labelled tokens per label in the training split.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES: TRAINING {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(train, label)
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Number of training documents containing each label at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES: TRAINING {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(train, label)
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES: TRAINING ---------- 

B-EMAIL              : 30
B-ID_NUM             : 59
B-NAME_STUDENT       : 1032
B-PHONE_NUM          : 6
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 89
B-USERNAME           : 3
I-ID_NUM             : 0
I-NAME_STUDENT       : 807
I-PHONE_NUM          : 15
I-STREET_ADDRESS     : 20
I-URL_PERSONAL       : 1
O                    : 3734808


---------- NUMBER OF LABEL OCCURENCES: TRAINING ---------- 

B-EMAIL              : 19
B-ID_NUM             : 25
B-NAME_STUDENT       : 667
B-PHONE_NUM          : 4
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 54
B-USERNAME           : 3
I-ID_NUM             : 0
I-NAME_STUDENT       : 610
I-PHONE_NUM          : 3
I-STREET_ADDRESS     : 2
I-URL_PERSONAL       : 1
O                    : 5105


In [19]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# Total number of labelled tokens per label in the held-out split.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES: TESTING {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(test, label)
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Number of held-out documents containing each label at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES: TESTING {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(test, label)
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES: TESTING ---------- 

B-EMAIL              : 9
B-ID_NUM             : 19
B-NAME_STUDENT       : 333
B-PHONE_NUM          : 0
B-STREET_ADDRESS     : 0
B-URL_PERSONAL       : 21
B-USERNAME           : 3
I-ID_NUM             : 1
I-NAME_STUDENT       : 289
I-PHONE_NUM          : 0
I-STREET_ADDRESS     : 0
I-URL_PERSONAL       : 0
O                    : 1254986


---------- NUMBER OF LABEL OCCURENCES: TESTING ---------- 

B-EMAIL              : 5
B-ID_NUM             : 8
B-NAME_STUDENT       : 224
B-PHONE_NUM          : 0
B-STREET_ADDRESS     : 0
B-URL_PERSONAL       : 18
B-USERNAME           : 2
I-ID_NUM             : 1
I-NAME_STUDENT       : 204
I-PHONE_NUM          : 0
I-STREET_ADDRESS     : 0
I-URL_PERSONAL       : 0
O                    : 1702


In [20]:
# Save the standardized document-level split (row index omitted).
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

In [21]:
# Show the columns carried by the training split (displayed as the cell output).
train.columns

Index(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels',
       'roberta_input_ids', 'roberta_attention_mask', 'roberta_offset_mapping',
       'roberta_token_labels', 'roberta_length', 'deberta_input_ids',
       'deberta_attention_mask', 'deberta_offset_mapping',
       'deberta_token_labels', 'deberta_length', 'B-EMAIL', 'B-ID_NUM',
       'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT'],
      dtype='object')

# Partitioning Data for each base language model


In [22]:
def split_files(df, MAX_LENGTH):
    """Partition each tokenized document into consecutive chunks of tokens.

    Every row of `df` is expanded into ``ceil(length / MAX_LENGTH)`` rows.
    The per-token columns (``input_ids``, ``attention_mask``,
    ``offset_mapping``, ``labels``) are sliced into non-overlapping windows
    of at most `MAX_LENGTH` entries, while the document-level columns are
    repeated unchanged on every chunk.

    Parameters
    ----------
    df : DataFrame with columns ``document``, ``full_text``, ``tokens``,
        ``trailing_whitespace``, ``provided_labels``, ``input_ids``,
        ``attention_mask``, ``offset_mapping``, ``labels`` and ``length``.
    MAX_LENGTH : maximum number of tokens per chunk.

    Returns
    -------
    DataFrame with one row per chunk and the same columns as `df` minus
    ``length``, on a fresh 0..n-1 index.
    """
    document_columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels']
    token_columns = ['input_ids', 'attention_mask', 'offset_mapping', 'labels']

    # Collect plain dicts and build the frame once at the end: appending to
    # a DataFrame inside the loop copies the whole frame on every chunk and
    # relied on the private `DataFrame._append`.
    records = []
    for _, row in df.iterrows():
        # Read from the row yielded by iterrows rather than `df.loc[index]`,
        # which returns several rows when index labels are duplicated.
        length = int(row['length'])
        num_rows = -(length // -MAX_LENGTH)  # ceiling division
        for i in range(num_rows):
            window = slice(i * MAX_LENGTH, (i + 1) * MAX_LENGTH)
            record = {column: row[column] for column in document_columns}
            record.update({column: row[column][window] for column in token_columns})
            records.append(record)

    return pd.DataFrame(records, columns=document_columns + token_columns)

FOR ROBERTA

In [23]:
# Build a RoBERTa-specific copy of the training split under model-agnostic
# column names: the provided word-level `labels` are stored as
# `provided_labels`, and the roberta_* tokenizer outputs are stored as
# input_ids / attention_mask / offset_mapping / labels / length.
# NOTE(review): this relies on the multi-column assignment pairing source and
# target columns by position — confirm against the pandas version in use.
roberta_train_train = pd.DataFrame(columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'])
roberta_train_train[['document', 'full_text', 'tokens', 'trailing_whitespace', 
                     'provided_labels', 'input_ids', 'attention_mask', 
                     'offset_mapping', 'labels', 'length']]= train[['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'roberta_input_ids', 'roberta_attention_mask', 'roberta_offset_mapping', 'roberta_token_labels', 'roberta_length']]

# Same renaming for the held-out split.
roberta_test_test = pd.DataFrame(columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'])
roberta_test_test[['document', 'full_text', 'tokens', 'trailing_whitespace', 
                     'provided_labels', 'input_ids', 'attention_mask', 
                     'offset_mapping', 'labels', 'length']]= test[['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'roberta_input_ids', 'roberta_attention_mask', 'roberta_offset_mapping', 'roberta_token_labels', 'roberta_length']]

In [24]:
# Chunk both RoBERTa splits into windows of at most 512 tokens
# (the RoBERTa maximum length configured earlier in the notebook).
roberta_train = split_files(df=roberta_train_train, MAX_LENGTH=512)
roberta_test = split_files(df=roberta_test_test, MAX_LENGTH=512)

In [25]:
# For each RoBERTa chunk, add one boolean column per listed label telling
# whether that label's id occurs among the chunk's token labels.
for flag in ('B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT'):
    roberta_train[flag] = roberta_train.apply(
        lambda row: label2id[flag] in set(row.labels), axis='columns', result_type='expand'
    )

for flag in ('B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT'):
    roberta_test[flag] = roberta_test.apply(
        lambda row: label2id[flag] in set(row.labels), axis='columns', result_type='expand'
    )

In [26]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# Total number of tokens carrying each label id across RoBERTa training chunks.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES: TRAINING {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(roberta_train, label2id[label])
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Number of RoBERTa training chunks containing each label id at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES: TRAINING {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(roberta_train, label2id[label])
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES: TRAINING ---------- 

B-EMAIL              : 257
B-ID_NUM             : 369
B-NAME_STUDENT       : 2380
B-PHONE_NUM          : 13
B-STREET_ADDRESS     : 5
B-URL_PERSONAL       : 1462
B-USERNAME           : 12
I-ID_NUM             : 0
I-NAME_STUDENT       : 1419
I-PHONE_NUM          : 46
I-STREET_ADDRESS     : 33
I-URL_PERSONAL       : 8
O                    : 4100929


---------- NUMBER OF LABEL OCCURENCES: TRAINING ---------- 

B-EMAIL              : 23
B-ID_NUM             : 30
B-NAME_STUDENT       : 780
B-PHONE_NUM          : 5
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 69
B-USERNAME           : 3
I-ID_NUM             : 0
I-NAME_STUDENT       : 709
I-PHONE_NUM          : 4
I-STREET_ADDRESS     : 2
I-URL_PERSONAL       : 1
O                    : 10603


In [27]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# Total number of tokens carrying each label id across RoBERTa held-out chunks.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES: TESTING {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(roberta_test, label2id[label])
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Number of RoBERTa held-out chunks containing each label id at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES: TESTING {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(roberta_test, label2id[label])
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES: TESTING ---------- 

B-EMAIL              : 66
B-ID_NUM             : 116
B-NAME_STUDENT       : 807
B-PHONE_NUM          : 0
B-STREET_ADDRESS     : 0
B-URL_PERSONAL       : 367
B-USERNAME           : 11
I-ID_NUM             : 4
I-NAME_STUDENT       : 501
I-PHONE_NUM          : 0
I-STREET_ADDRESS     : 0
I-URL_PERSONAL       : 0
O                    : 1370299


---------- NUMBER OF LABEL OCCURENCES: TESTING ---------- 

B-EMAIL              : 8
B-ID_NUM             : 10
B-NAME_STUDENT       : 266
B-PHONE_NUM          : 0
B-STREET_ADDRESS     : 0
B-URL_PERSONAL       : 18
B-USERNAME           : 3
I-ID_NUM             : 1
I-NAME_STUDENT       : 243
I-PHONE_NUM          : 0
I-STREET_ADDRESS     : 0
I-URL_PERSONAL       : 0
O                    : 3529


FOR DEBERTA

In [28]:
# Build a DeBERTa-specific copy of the training split under model-agnostic
# column names: the provided word-level `labels` are stored as
# `provided_labels`, and the deberta_* tokenizer outputs are stored as
# input_ids / attention_mask / offset_mapping / labels / length.
# NOTE(review): this relies on the multi-column assignment pairing source and
# target columns by position — confirm against the pandas version in use.
deberta_train_train = pd.DataFrame(columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'])
deberta_train_train[['document', 'full_text', 'tokens', 'trailing_whitespace', 
                     'provided_labels', 'input_ids', 'attention_mask', 
                     'offset_mapping', 'labels', 'length']]= train[['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'deberta_input_ids', 'deberta_attention_mask', 'deberta_offset_mapping', 'deberta_token_labels', 'deberta_length']]

# Same renaming for the held-out split.
deberta_test_test = pd.DataFrame(columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'])
deberta_test_test[['document', 'full_text', 'tokens', 'trailing_whitespace', 
                     'provided_labels', 'input_ids', 'attention_mask', 
                     'offset_mapping', 'labels', 'length']]= test[['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels', 'deberta_input_ids', 'deberta_attention_mask', 'deberta_offset_mapping', 'deberta_token_labels', 'deberta_length']]

In [29]:
# Chunk both DeBERTa splits into windows of at most 1024 tokens
# (the DeBERTa maximum length configured earlier in the notebook).
deberta_train = split_files(df=deberta_train_train, MAX_LENGTH=1024)
deberta_test = split_files(df=deberta_test_test, MAX_LENGTH=1024)

In [30]:
# For each DeBERTa chunk, add one boolean column per listed label telling
# whether that label's id occurs among the chunk's token labels.
for flag in ('B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT'):
    deberta_train[flag] = deberta_train.apply(
        lambda row: label2id[flag] in set(row.labels), axis='columns', result_type='expand'
    )

for flag in ('B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT'):
    deberta_test[flag] = deberta_test.apply(
        lambda row: label2id[flag] in set(row.labels), axis='columns', result_type='expand'
    )

In [31]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# Total number of tokens carrying each label id across DeBERTa training chunks.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES: TRAINING {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(deberta_train, label2id[label])
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Number of DeBERTa training chunks containing each label id at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES: TRAINING {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(deberta_train, label2id[label])
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES: TRAINING ---------- 

B-EMAIL              : 209
B-ID_NUM             : 287
B-NAME_STUDENT       : 1317
B-PHONE_NUM          : 11
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 1528
B-USERNAME           : 11
I-ID_NUM             : 0
I-NAME_STUDENT       : 1071
I-PHONE_NUM          : 39
I-STREET_ADDRESS     : 22
I-URL_PERSONAL       : 7
O                    : 3521120


---------- NUMBER OF LABEL OCCURENCES: TRAINING ---------- 

B-EMAIL              : 19
B-ID_NUM             : 25
B-NAME_STUDENT       : 682
B-PHONE_NUM          : 4
B-STREET_ADDRESS     : 2
B-URL_PERSONAL       : 58
B-USERNAME           : 3
I-ID_NUM             : 0
I-NAME_STUDENT       : 624
I-PHONE_NUM          : 3
I-STREET_ADDRESS     : 2
I-URL_PERSONAL       : 1
O                    : 5695


In [32]:
# Width to which label names are padded so the ':' separators line up.
tot_space = 20

# Total number of tokens carrying each label id across DeBERTa held-out chunks.
print(f"{'-' * 10} NUMBER OF TOTAL LABEL OCCURENCES: TESTING {'-' * 10} \n")
for label in LABELS:
    num_instances = find_label_instances(deberta_test, label2id[label])
    print(f"{label:<{tot_space}} : {num_instances}")

print('\n')
# Number of DeBERTa held-out chunks containing each label id at least once.
print(f"{'-' * 10} NUMBER OF LABEL OCCURENCES: TESTING {'-' * 10} \n")
for label in LABELS:
    num_documents = find_label_number(deberta_test, label2id[label])
    print(f"{label:<{tot_space}} : {num_documents}")

---------- NUMBER OF TOTAL LABEL OCCURENCES: TESTING ---------- 

B-EMAIL              : 54
B-ID_NUM             : 85
B-NAME_STUDENT       : 437
B-PHONE_NUM          : 0
B-STREET_ADDRESS     : 0
B-URL_PERSONAL       : 383
B-USERNAME           : 10
I-ID_NUM             : 3
I-NAME_STUDENT       : 393
I-PHONE_NUM          : 0
I-STREET_ADDRESS     : 0
I-URL_PERSONAL       : 0
O                    : 1183923


---------- NUMBER OF LABEL OCCURENCES: TESTING ---------- 

B-EMAIL              : 5
B-ID_NUM             : 8
B-NAME_STUDENT       : 229
B-PHONE_NUM          : 0
B-STREET_ADDRESS     : 0
B-URL_PERSONAL       : 19
B-USERNAME           : 2
I-ID_NUM             : 1
I-NAME_STUDENT       : 208
I-PHONE_NUM          : 0
I-STREET_ADDRESS     : 0
I-URL_PERSONAL       : 0
O                    : 1889


SAVE DATA

In [33]:
# Write the chunked, model-specific splits to CSV (row index omitted).
for frame, path in (
    (roberta_train, "roberta_train.csv"),
    (roberta_test, "roberta_test.csv"),
    (deberta_train, "deberta_train.csv"),
    (deberta_test, "deberta_test.csv"),
):
    frame.to_csv(path, index=False)