### Define the objectives:
Gain insight into the data.


In [1]:
#### import libraries
import pandas as pd
import numpy as np
import json
# import argparse library to handle command-line arguments
import argparse
# a Hugging Face Dataset used for natural language processing (NLP)
from datasets import Dataset
from pathlib import Path
# import the 'chain' function to combine multiple iterables into a single iterable
from itertools import chain
# tokenizer that can automatically find the model's required tokenization from the model name
from transformers import AutoTokenizer
# model class that can automatically find a token classification model from the model name
from transformers import AutoModelForTokenClassification
# class that provides an API for feature-complete training in PyTorch
from transformers import Trainer
# class to store hyperparameters for training
from transformers import TrainingArguments
# data collator that dynamically pads the inputs received, used for token classification tasks
from transformers import DataCollatorForTokenClassification

2024-02-02 06:34:18.953092: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 06:34:18.953222: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 06:34:19.103649: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Data Collection

In [2]:
# load the JSON file content
# The essays contain non-ASCII characters (accented letters, bullet symbols),
# so decode explicitly as UTF-8 instead of relying on the platform default.
with open("/kaggle/input/pii-detection-removal-from-educational-data/train.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

### Data Cleaning

In [3]:
#### gain insight from an example
# `data` is a list of dictionaries; take the first dictionary as an example
# and print each of its fields under its own header.
example = data[0]
print("### Data Size ###")
print(f"The length of the data is: {len(data)}\n")

print("### Sample Keys ###")
print(f'The keys of the data is: {example.keys()}\n')

print("### Sample Document Number ###")
print(f"The document of this sample is: {example['document']}\n")

print("### Sample Full Text ###")
print(f"The full_text of this sample is: {example['full_text']}\n")

print("### Sample Tokens ###")
print(f"The tokens of this sample is: {example['tokens']}\n")

print("### Sample Trailing Whitespace ###")
# fixed: the field name was misspelled in the message, and the blank separator
# line that every other section prints was missing here
print(f"The trailing_whitespace of this sample is: {example['trailing_whitespace']}\n")

print("### Sample Labels ###")
print(f"The labels of this sample is: {example['labels']}")

### Data Size ###
The length of the data is: 6807

### Sample Keys ###
The keys of the data is: dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

### Sample Document Number ###
The document of this sample is: 7

### Sample Full Text ###
The full_text of this sample is: Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla

Challenge & selection

The tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.

What exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1

This tool has many advantages:

•  It is accessible to all and does not require significant material investment and can be done  quickly

•  It is scalable

In [4]:
#### downsize the data size
# Split the documents into two pools so they can be sampled separately.
positive_samples = [] # samples that contain named entities
negative_samples = [] # samples that do not contain any named entity

for sentence in data:
    # A document is positive when at least one token carries a label other
    # than the outside label. The outside label is the letter 'O' (see the
    # label list built from the data), not the digit '0'.
    if any(label != 'O' for label in sentence['labels']):
        positive_samples.append(sentence)
    else:
        negative_samples.append(sentence)

In [5]:
#### creating label-id mappings for data labels
# merge the 'labels' lists of every item in 'data' into one set (dropping
# duplicates), then sort so the ordering is deterministic
all_labels = sorted(set().union(*(sentence['labels'] for sentence in data)))
print('######## All Labels ########')
print(all_labels)

# pair each label with its position in the sorted list to get a unique ID
label2id = dict(zip(all_labels, range(len(all_labels))))
print('######## Labels to ID ########')
print(label2id)

# the inverse lookup: position (ID) -> label
id2label = dict(enumerate(all_labels))
print('######## ID to Labels ########')
print(id2label)

######## All Labels ########
['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']
######## Labels to ID ########
{'B-EMAIL': 0, 'B-ID_NUM': 1, 'B-NAME_STUDENT': 2, 'B-PHONE_NUM': 3, 'B-STREET_ADDRESS': 4, 'B-URL_PERSONAL': 5, 'B-USERNAME': 6, 'I-ID_NUM': 7, 'I-NAME_STUDENT': 8, 'I-PHONE_NUM': 9, 'I-STREET_ADDRESS': 10, 'I-URL_PERSONAL': 11, 'O': 12}
######## ID to Labels ########
{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [6]:
def tokenize(example, tokenizer, label2id, max_length):
    '''Tokenize one example and align its token-level labels to subtokens.

    The text is rebuilt from ``example['tokens']`` and
    ``example['trailing_whitespace']`` while a parallel list holding one label
    per character is built from ``example['provided_labels']``. Each subtoken
    returned by ``tokenizer`` then receives the label of its first
    non-whitespace character.

    Args:
        example: mapping with the keys 'tokens', 'provided_labels' and
            'trailing_whitespace' (three parallel sequences).
        tokenizer: callable invoked as ``tokenizer(text,
            return_offsets_mapping=True, max_length=..., truncation=True)``.
            Its result must be a mapping that contains 'input_ids' and
            'offset_mapping' (a sequence of ``(start, end)`` character spans).
        label2id: mapping from label string to integer ID; must contain 'O'.
        max_length: maximum sequence length forwarded to the tokenizer.

    Returns:
        dict with every tokenizer output except 'offset_mapping', plus
        'labels' (one label ID per subtoken) and 'length' (number of
        input IDs).

    Raises:
        KeyError: if a label is missing from ``label2id``.
    '''
    outside_label = 'O'

    # rebuild the text and, in lockstep, one label per character so that a
    # character offset can be used directly as an index into the labels
    text_pieces = []
    char_labels = []
    for token, label, has_space in zip(
        example['tokens'], example['provided_labels'], example['trailing_whitespace']
    ):
        text_pieces.append(token)
        char_labels.extend([label] * len(token))
        if has_space:
            # the separating space never belongs to an entity
            text_pieces.append(' ')
            char_labels.append(outside_label)
    rebuilt_text = ''.join(text_pieces)

    # tokenize the text
    tokenized_input = tokenizer(rebuilt_text, return_offsets_mapping=True, max_length=max_length, truncation=True)
    # take the offsets out of the encoding: they are only needed for the
    # alignment below and must not be part of the returned features
    offsets = tokenized_input.pop('offset_mapping')

    # create labels for subtokens
    subtoken_labels = []
    last_char = len(char_labels) - 1
    for start_idx, end_idx in offsets:
        # a (0, 0) span marks a special token; label it as outside
        if start_idx == end_idx == 0:
            subtoken_labels.append(label2id[outside_label])
            continue
        # if the subtoken starts with a whitespace, use the next character
        if start_idx < len(rebuilt_text) and rebuilt_text[start_idx].isspace():
            start_idx += 1
        # clamp: a whitespace-only subtoken at the very end would otherwise
        # point one past the last character
        start_idx = min(start_idx, last_char)
        label = char_labels[start_idx] if char_labels else outside_label
        subtoken_labels.append(label2id[label])

    # dict literal: unpack the tokenizer outputs and add the two extra keys
    return {
        **tokenized_input,
        'labels': subtoken_labels,
        'length': len(tokenized_input['input_ids'])
    }
                                
    

In [7]:
# # a tokenizer compatible with the model to be trained is loaded from a given path
# tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# # prepare the dataset 
# processed_dataset = Dataset.from_dict({
#     'full_text': [x['full_text'] for x in data],
#     'document': [str(x['document']) for x in data],
#     'tokens': [x['tokens'] for x in data],
#     'trailing_whitespace': [x['trailing_whitespace'] for x in data],
#     # rename 'labels' to 'provided_labels'
#     'provided_labels': [x['labels'] for x in data] 
# })

# # apply the tokenize function
# # map() applies tokenize() to every example (row) of the dataset
# processed_dataset = processed_dataset.map(
#     tokenize, # the tokenize() is applied in the dataset
#     fn_kwargs={
#         "tokenizer": tokenizer, 
#         "label2id": label2id, 
#         "max_length": TRAINING_MAX_LENGTH
#     }, 
#     num_proc=3 # 3 parallel processes 
# )