In [1]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np



In [2]:
from transformers import AutoModelForSequenceClassification, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

Ahoy there, me hearties! Let me spin ye a yarn about this here code, known as 'tokenize':

- **Function Purpose**: This here 'tokenize' function be a vital part of our quest in the world of Natural Language Processing (NLP). Its mission be to take an 'example' and a 'tokenizer,' then turn that text into tokens while keepin' tabs on where they belong.

- **Initialization**: We start by creatin' two lists - 'text' and 'token_map' - to store the tokens and their positions.

- **Tracking Tokens**: The 'idx' variable sets sail at 0 and helps us navigate through the tokens as we process 'em.

- **Token Loop**: We voyage through the tokens and their trailin' white spaces (if any). For each token 't' and its mate, the whitespace 'ws':

    - 't' gets added to the 'text' list so we can gather all the tokens in one place.
    
    - 'token_map' expands by repeatin' 'idx' as many times as there be characters in token 't.' This step be our map to show where each character belongs to which token.
    
    - If there be trailin' whitespace ('ws'), we add a space to 'text' and put a '-1' in 'token_map' to mark where the whitespace lies.

    - 'idx' gets a boost to be ready for the next token on our journey.

- **Tokenization**: The 'text' we've gathered be tokenized using the 'tokenizer' we were given. We also ask for offsets mappings and set a maximum token length ('max_length') to make sure our tokens don't get too unruly.

- **Returning the Result**: The function hands back a chest of treasures in the form of a dictionary. Inside, ye'll find the tokenized data, includin' offset mappings, and the 'token_map.' This be essential for future NLP adventures where knowin' where the tokens sit be key.

This 'tokenize' function be our loyal mate on the high seas of NLP, helpin' us find the buried treasure of information within the text. Aye, it be a treasure map for the crew! 🏴‍☠️💰


In [3]:
INFERENCE_MAX_LENGTH = 2500

def tokenize(example, tokenizer):
    # We be creatin' two empty lists, 'text' and 'token_map', to store our tokens and their respective maps.
    text = []
    token_map = []
    
    # We start the 'idx' at 0, it be used to keep track of the tokens.
    idx = 0
    
    # Now, we be loopin' through the tokens and their trailin' white spaces.
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        # We add the token 't' to the 'text' list.
        text.append(t)
        
        # We be extendin' the 'token_map' list by repeatin' the 'idx' as many times as the length of token 't'.
        token_map.extend([idx]*len(t))
        
        # If there be trailin' whitespace (ws), we add a space to 'text' and mark it with a '-1' in 'token_map'.
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        # We increment 'idx' to keep track of the next token.
        idx += 1
        
    # Now, we tokenize the concatenated 'text' and return offsets mappings along with 'token_map'.
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=INFERENCE_MAX_LENGTH)
    
    # We return a dictionary containin' the tokenized data and the 'token_map'.
    return {
        **tokenized,
        "token_map": token_map,
    }


Ahoy there, matey! Allow me to decipher this code for ye:

- **Loading Test Data**: We begin by loading test data from a JSON file, setting sail on our data adventure.

- **Creating a Dataset**: Next, we construct a dataset using the data we've loaded. This dataset be our treasure chest, containing details like 'full_text,' 'document,' 'tokens,' and 'trailing_whitespace.'

- **Tokenizer and Model**: We call upon the services of a trusty 'tokenizer' and a fearsome 'model' to aid us in our task. These tools be crucial for handling text data.

- **Parallel Tokenization**: We employ parallel processing to tokenize our dataset using the 'tokenize' function, ensuring speedy execution.

- **Data Collator for Classification**: A 'data collator' be created to assist in token classification, ensuring everything lines up nicely.

- **Training Arguments**: We set the stage with training arguments, specifyin' the output directory, evaluation batch size, and a vow of silence – we won't be reportin' results to any server.

- **Trainer for Evaluation**: Finally, we muster the 'trainer' – a commander for our model, equipped with the model itself, training arguments, data collator, and tokenizer.

With this code, we be ready to evaluate our model's performance on the test data, settin' sail for an exciting NLP adventure! 🏴‍☠️🌊


In [4]:
# JUST 1 MODEL INSTEAD OF 3

# Load the test data from a JSON file
data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))
#X_train, X_val, y_train, y_val = train_test_split(train_df['token_str'], train_df['label'], test_size=0.2, random_state=42)

# Create a dataset from the loaded data
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

# Initialize a tokenizer and model from the pretrained model path
model_paths = {'/kaggle/input/pii-deberta-models/cola-de-piiranha' : 1/3,
              '/kaggle/input/pii-deberta-models/cuerpo-de-piiranha' : 1/3,
              '/kaggle/input/pii-deberta-models/cabeza-de-piiranha' : 1/3}

model = '/kaggle/input/pii-deberta-models/cola-de-piiranha'

# first_model_path = list(model_paths.keys())[0]

tokenizer = AutoTokenizer.from_pretrained(model)

# Tokenize the dataset using the 'tokenize' function in parallel
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 4)


import gc
import torch
import numpy as np

from scipy.special import softmax


# all_preds = []

# Calculate the total weight
# total_weight = sum(model_paths.values())


# print("begin tokenization/training on {} model".format(model_path))
# tokenizer = AutoTokenizer.from_pretrained(model)

model = AutoModelForTokenClassification.from_pretrained(model)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 16)
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=1, 
    report_to="none",
)
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=collator, 
    tokenizer=tokenizer,
)
print("corresponding labels b:", model.config.id2label)
# print("begin predictions on {} model".format(model_path))
predictions = trainer.predict(ds).predictions
    # This idea from this notebook: https://www.kaggle.com/code/olyatsimboy/912-blending-0-903-0-854-deberta3base
#     weighted_predictions = softmax(predictions, axis = -1) * weight # softmax helps convert the logits (the raw, unnormalized scores outputted by the last layer of the model) to probabilities
#     all_preds.append(weighted_predictions)
print("corresponding labels:", model.config.id2label)
del model, trainer
torch.cuda.empty_cache()
gc.collect()

# Calculate the weighted average of predictions
# weighted_average_predictions = np.sum(all_preds, axis=0) / total_weight
weighted_average_predictions = predictions

      

#0:   0%|          | 0/3 [00:00<?, ?ex/s]

#1:   0%|          | 0/3 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/2 [00:00<?, ?ex/s]

#3:   0%|          | 0/2 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


corresponding labels b: {0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


corresponding labels: {0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [5]:
# WEIGHTED AVERAGE OF THE 3 MODELS
'''
# Load the test data from a JSON file
data = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))
#X_train, X_val, y_train, y_val = train_test_split(train_df['token_str'], train_df['label'], test_size=0.2, random_state=42)

# Create a dataset from the loaded data
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [x["document"] for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
})

# Initialize a tokenizer and model from the pretrained model path
model_paths = {'/kaggle/input/pii-deberta-models/cola-de-piiranha' : 1/3,
              '/kaggle/input/pii-deberta-models/cuerpo-de-piiranha' : 1/3,
              '/kaggle/input/pii-deberta-models/cabeza-de-piiranha' : 1/3}

first_model_path = list(model_paths.keys())[0]

tokenizer = AutoTokenizer.from_pretrained(first_model_path)

# Tokenize the dataset using the 'tokenize' function in parallel
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 4)


import gc
import torch
import numpy as np

from scipy.special import softmax


all_preds = []

# Calculate the total weight
total_weight = sum(model_paths.values())

for model_path, weight in model_paths.items():
    print("begin tokenization/training on {} model".format(model_path))
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForTokenClassification.from_pretrained(model_path)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 16)
    args = TrainingArguments(
        ".", 
        per_device_eval_batch_size=1, 
        report_to="none",
    )
    trainer = Trainer(
        model=model, 
        args=args, 
        data_collator=collator, 
        tokenizer=tokenizer,
    )
    print("corresponding labels b:", model.config.id2label)
    print("begin predictions on {} model".format(model_path))
    predictions = trainer.predict(ds).predictions
    # This idea from this notebook: https://www.kaggle.com/code/olyatsimboy/912-blending-0-903-0-854-deberta3base
    weighted_predictions = softmax(predictions, axis = -1) * weight # softmax helps convert the logits (the raw, unnormalized scores outputted by the last layer of the model) to probabilities
    all_preds.append(weighted_predictions)
    print("corresponding labels:", model.config.id2label)
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

# Calculate the weighted average of predictions
weighted_average_predictions = np.sum(all_preds, axis=0) / total_weight
'''

'\n# Load the test data from a JSON file\ndata = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/test.json"))\n#X_train, X_val, y_train, y_val = train_test_split(train_df[\'token_str\'], train_df[\'label\'], test_size=0.2, random_state=42)\n\n# Create a dataset from the loaded data\nds = Dataset.from_dict({\n    "full_text": [x["full_text"] for x in data],\n    "document": [x["document"] for x in data],\n    "tokens": [x["tokens"] for x in data],\n    "trailing_whitespace": [x["trailing_whitespace"] for x in data],\n})\n\n# Initialize a tokenizer and model from the pretrained model path\nmodel_paths = {\'/kaggle/input/pii-deberta-models/cola-de-piiranha\' : 1/3,\n              \'/kaggle/input/pii-deberta-models/cuerpo-de-piiranha\' : 1/3,\n              \'/kaggle/input/pii-deberta-models/cabeza-de-piiranha\' : 1/3}\n\nfirst_model_path = list(model_paths.keys())[0]\n\ntokenizer = AutoTokenizer.from_pretrained(first_model_path)\n\n# Tokenize the dataset using th

In [6]:
predictions.shape

(10, 1936, 13)

In [7]:
weighted_average_predictions.shape

(10, 1936, 13)

In [8]:
predictions[0][0].sum()

-6.2648153

In [9]:
softmax_predict = softmax(predictions, axis = -1)

In [10]:
softmax_predict[0][0].sum()

1.0000001

In [11]:
ds

Dataset({
    features: ['full_text', 'document', 'tokens', 'trailing_whitespace', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'token_map'],
    num_rows: 10
})

In [12]:
###
#training_data = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')
#print(training_data.head(5))

In [13]:
'''
This post-processing step allows for adjusting the predictions based on the model's confidence in predicting the 'O' class. 
By using a threshold, you can fine-tune the trade-off between precision and recall for the 'O' class prediction.
'''
model_path = '/kaggle/input/pii-deberta-models/cola-de-piiranha'

config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]
preds = weighted_average_predictions.argmax(-1) # argmax returns the indices of the maximum value in each row
preds_without_O = weighted_average_predictions[:,:,:12].argmax(-1)
O_preds = weighted_average_predictions[:,:,12]
# Change this threshold to "manually" adjust for the FBeta metric
threshold = 0.9875
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

In [14]:
model_path

'/kaggle/input/pii-deberta-models/cola-de-piiranha'

In [15]:
# config files are the same for all the 3 models
config1 = json.load(open(Path('/kaggle/input/pii-deberta-models/cabeza-de-piiranha') / "config.json")) 
config2 = json.load(open(Path('/kaggle/input/pii-deberta-models/cuerpo-de-piiranha') / "config.json"))
config1 == config2

True

🏴‍☠️ Ahoy, me shipmates! Let's unravel the secrets of this code:

- **Triplet Gathering**: Our quest begins with the formation of empty lists to store triplets – valuable loot from the dataset. These triplets consist of a 'label,' 'token_id,' and 'token_str.'

- **Data Exploration Voyage**: We embark on a voyage through the dataset, exploring each prediction ('p'), token mapping ('token_map'), offsets ('offsets'), tokens ('tokens'), and documents ('doc').

- **Treasure in Tokens**: For each prediction, we inspect the tokens and their corresponding offsets. We decipher the 'label_pred' – the predicted label from the token.

- **Navigating the Indices**: We navigate the tricky indices, checking for special cases where the start and end indices sum to zero, or where token mapping is -1.

- **Ignore the Whitespace**: We're savvy enough to ignore leading whitespace tokens ("\n\n") as we seek the real treasure.

- **Forming Triplets**: With the pieces in place, we form a triplet consisting of the 'label_pred,' 'token_id,' and 'token_str.' But, we only add it to our treasure chest ('triplets') if it hasn't been plundered before.

- **Successful Voyage**: Our data exploration voyage yields a bounty of valuable triplets, ready to be analyzed and deciphered!

May these code treasures guide ye to the heart of the dataset! 🏴‍☠️💰


In [16]:
# Prepare to plunder the data for valuable triplets!
triplets = []
document, token, label, token_str = [], [], [], []

# For each prediction, token mapping, offsets, tokens, and document in the dataset
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    # Iterate through each token prediction and its corresponding offsets
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]  # Predicted label from token

        # If start and end indices sum to zero, continue to the next iteration
        if start_idx + end_idx == 0:
            continue

        # If the token mapping at the start index is -1, increment start index
        if token_map[start_idx] == -1:
            start_idx += 1

        # Ignore leading whitespace tokens ("\n\n")
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # If start index exceeds the length of token mapping, break the loop
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]  # Token ID at start index

        # Ignore "O" predictions and whitespace tokens
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])  # Form a triplet

            # If the triplet is not in the list of triplets, add it
            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)

# We've gathered the valuable triplets from the dataset, ready for analysis!


In [17]:
# Haul in the data and prepare for your quest!
df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})

# Assign each row a unique 'row_id'
df["row_id"] = list(range(len(df)))

# Display a glimpse of the first 100 rows of your data
display(df.head(100))

# Cast your findings into a CSV file for further exploration
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

# May the winds of fortune guide ye to untold discoveries!


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [18]:
'''
from sklearn.metrics import f1_score
# Convert predictions to actual labels (not probabilities)
predicted_labels = np.argmax(weighted_average_predictions, axis=-1)
# Calculate the accuracy
accuracy = np.mean(predicted_labels == labels)
print(f"Model accuracy: {accuracy * 100:.2f}%")
bert_f1_score = f1_score(labels, predicted_labels, average='micro')
print(f"Model F1 score: {bert_f1_score}")
'''

'\nfrom sklearn.metrics import f1_score\n# Convert predictions to actual labels (not probabilities)\npredicted_labels = np.argmax(weighted_average_predictions, axis=-1)\n# Calculate the accuracy\naccuracy = np.mean(predicted_labels == labels)\nprint(f"Model accuracy: {accuracy * 100:.2f}%")\nbert_f1_score = f1_score(labels, predicted_labels, average=\'micro\')\nprint(f"Model F1 score: {bert_f1_score}")\n'

In [None]:
# predicted_labels.shape

# log models:

In [1]:
# WITH AUGMENTED TRAINING DATA

import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

# Load training data
with open("/kaggle/input/pii-data-207-aug/pii_train_format2.json", 'r') as file:
    train_data = json.load(file)

# Extract features and labels
train_documents, train_tokens, train_labels, train_token_strs = [], [], [], []
for entry in train_data:
    doc_id = entry['document']
    for token, label, trailing_space in zip(entry['tokens'], entry['labels'], entry['trailing_whitespace']):
        train_documents.append(doc_id)
        train_tokens.append(token)
        train_labels.append(label)
        train_token_strs.append(token + " " if trailing_space else token)

train_df = pd.DataFrame({
    "document": train_documents,
    "token": train_tokens,
    "label": train_labels,
    "token_str": train_token_strs
})

# Duplicate rows where the label is only 1 I-ID_NUM (for stratify to work in train_test_split)
dup_1 = train_df[train_df['label'] == 'I-URL_PERSONAL']
dup_2 = train_df[train_df['label'] == 'I-ID_NUM']
train_df = pd.concat([train_df, dup_1, dup_2], ignore_index=True)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df['token_str'], train_df['label'], test_size=0.2, random_state=42, stratify=train_df['label'])

# Create a pipeline with TF-IDF Vectorizer and Logistic Regression
pipeline = make_pipeline(TfidfVectorizer(max_features=1000), LogisticRegression(max_iter=1000))

print("Start Training:")
# Train the model on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the training and validation data
y_train_pred = pipeline.predict(X_train)
y_val_pred = pipeline.predict(X_val)

# Calculate and print accuracy for training and validation sets
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {val_accuracy}")
train_f1 = f1_score(y_train, y_train_pred, average='micro')
val_f1 = f1_score(y_val, y_val_pred, average='micro')
print(f"Training F1: {train_f1}")
print(f"Validation F1: {val_f1}")




# Prepare the submission using test data
# Load test data
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", 'r') as file:
    test_data = json.load(file)

# Extract features for test data
test_documents, test_tokens, test_token_strs = [], [], []
for entry in test_data:
    doc_id = entry['document']
    for token, trailing_space in zip(entry['tokens'], entry['trailing_whitespace']):
        test_documents.append(doc_id)
        test_tokens.append(token)
        test_token_strs.append(token + " " if trailing_space else token)

test_df = pd.DataFrame({
    "document": test_documents,
    "token_str": test_token_strs
})

# Use the trained pipeline to predict labels for the test data
test_df['label'] = pipeline.predict(test_df['token_str'])

# Filter predictions to include only positive PII labels and prepare for submission
submission_df = test_df[test_df['label'] != 'O'].copy()
# submission_df = test_df.copy()
submission_df.reset_index(drop=True, inplace=True)
submission_df['row_id'] = submission_df.index
formatted_submission = submission_df[['row_id', 'document', 'token_str', 'label']]

# Save the submission file
formatted_submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' prepared.")




Start Training:


KeyboardInterrupt: 

dcdc

In [13]:
# WITH CROSS VALIDATION
'''
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterGrid

# Load training data
with open("/kaggle/input/pii-detection-removal-from-educational-data/train.json", 'r') as file:
    train_data = json.load(file)

# Extract features and labels
train_documents, train_tokens, train_labels, train_token_strs = [], [], [], []
for entry in train_data:
    doc_id = entry['document']
    for token, label, trailing_space in zip(entry['tokens'], entry['labels'], entry['trailing_whitespace']):
        train_documents.append(doc_id)
        train_tokens.append(token)
        train_labels.append(label)
        train_token_strs.append(token + " " if trailing_space else token)

train_df = pd.DataFrame({
    "document": train_documents,
    "token": train_tokens,
    "label": train_labels,
    "token_str": train_token_strs
})

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df['token_str'], train_df['label'], test_size=0.2, random_state=42)

# Define the hyperparameters grid
params = {
    "max_features": [1000, 5000, 10000]
}

best_model = None
best_score = 0
best_param = None

#Perform grid search
for param in ParameterGrid(params):
    # Create a pipeline with TF-IDF Vectorizer and Logistic Regression
    pipeline = make_pipeline(TfidfVectorizer(max_features=param["max_features"]), LogisticRegression(max_iter=1000))

    print("Start Training with param {}:".format(param["max_features"]))
    # Train the model on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the training and validation data
    y_train_pred = pipeline.predict(X_train)
    y_val_pred = pipeline.predict(X_val)

    # Calculate and print accuracy for training and validation sets
    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Validation Accuracy: {val_accuracy}")
    train_f1 = f1_score(y_train, y_train_pred, average='micro')
    val_f1 = f1_score(y_val, y_val_pred, average='micro')
    print(f"Training F1: {train_f1}")
    print(f"Validation F1: {val_f1}\n")
    if val_f1 > best_score:
        best_score = val_f1
        best_model = pipeline
        best_param = param["max_features"]

print(f"Best Validation F1: {best_score}")
print(f"Best max_features param: {best_param}")


# Prepare the submission using test data
# Load test data
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", 'r') as file:
    test_data = json.load(file)

# Extract features for test data
test_documents, test_tokens, test_token_strs = [], [], []
for entry in test_data:
    doc_id = entry['document']
    for token, trailing_space in zip(entry['tokens'], entry['trailing_whitespace']):
        test_documents.append(doc_id)
        test_tokens.append(token)
        test_token_strs.append(token + " " if trailing_space else token)

test_df = pd.DataFrame({
    "document": test_documents,
    "token_str": test_token_strs
})

# Use the best trained model to predict labels for the test data
test_df['label'] = best_model.predict(test_df['token_str'])

# Filter predictions to include only positive PII labels and prepare for submission
submission_df = test_df[test_df['label'] != 'O'].copy()
submission_df.reset_index(drop=True, inplace=True)
submission_df['row_id'] = submission_df.index
formatted_submission = submission_df[['row_id', 'document', 'token_str', 'label']]

# Save the submission file
formatted_submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' prepared.")
'''

Start Training with param 1000:
Training Accuracy: 0.9994484262245664
Validation Accuracy: 0.9994631985554433
Training F1: 0.9994484262245664
Validation F1: 0.9994631985554433

Start Training with param 5000:
Training Accuracy: 0.999459943425506
Validation Accuracy: 0.99947020902207
Training F1: 0.999459943425506
Validation F1: 0.99947020902207

Start Training with param 10000:
Training Accuracy: 0.9994964980197926
Validation Accuracy: 0.9995002538790414
Training F1: 0.9994964980197926
Validation F1: 0.9995002538790414

Best Validation F1: 0.9995002538790414
Best max_features param: 10000
Submission file 'submission.csv' prepared.


In [36]:
'''
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score

# Load training data
with open("/kaggle/input/pii-detection-removal-from-educational-data/train.json", 'r') as file:
    train_data = json.load(file)

# Extract features and labels
train_documents, train_tokens, train_labels, train_token_strs = [], [], [], []
for entry in train_data:
    doc_id = entry['document']
    for token, label, trailing_space in zip(entry['tokens'], entry['labels'], entry['trailing_whitespace']):
        train_documents.append(doc_id)
        train_tokens.append(token)
        train_labels.append(label)
        train_token_strs.append(token + " " if trailing_space else token)

train_df = pd.DataFrame({
    "document": train_documents,
    "token": train_tokens,
    "label": train_labels,
    "token_str": train_token_strs
})

# Duplicate rows where the label is only 1 I-ID_NUM (for stratify to work in train_test_split)
dup_1 = train_df[train_df['label'] == 'I-URL_PERSONAL']
dup_2 = train_df[train_df['label'] == 'I-ID_NUM']
train_df = pd.concat([train_df, dup_1, dup_2], ignore_index=True)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df['token_str'], train_df['label'], test_size=0.2, random_state=42, stratify=train_df['label'])

# Create a pipeline with TF-IDF Vectorizer and Logistic Regression
pipeline = make_pipeline(TfidfVectorizer(max_features=1000), LogisticRegression(max_iter=1000))

print("Start Training:")
# Train the model on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the training and validation data
y_train_pred = pipeline.predict(X_train)
y_val_pred = pipeline.predict(X_val)

# Calculate and print accuracy for training and validation sets
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {val_accuracy}")
train_f1 = f1_score(y_train, y_train_pred, average='micro')
val_f1 = f1_score(y_val, y_val_pred, average='micro')
print(f"Training F1: {train_f1}")
print(f"Validation F1: {val_f1}")




# Prepare the submission using test data
# Load test data
with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", 'r') as file:
    test_data = json.load(file)

# Extract features for test data
test_documents, test_tokens, test_token_strs = [], [], []
for entry in test_data:
    doc_id = entry['document']
    for token, trailing_space in zip(entry['tokens'], entry['trailing_whitespace']):
        test_documents.append(doc_id)
        test_tokens.append(token)
        test_token_strs.append(token + " " if trailing_space else token)

test_df = pd.DataFrame({
    "document": test_documents,
    "token_str": test_token_strs
})

# Use the trained pipeline to predict labels for the test data
test_df['label'] = pipeline.predict(test_df['token_str'])

# Filter predictions to include only positive PII labels and prepare for submission
submission_df = test_df[test_df['label'] != 'O'].copy()
# submission_df = test_df.copy()
submission_df.reset_index(drop=True, inplace=True)
submission_df['row_id'] = submission_df.index
formatted_submission = submission_df[['row_id', 'document', 'token_str', 'label']]

# Save the submission file
formatted_submission.to_csv("submission.csv", index=False)
print("Submission file 'submission.csv' prepared.")
'''

Start Training:
Training Accuracy: 0.9994509302388466
Validation Accuracy: 0.9994511806126547
Training F1: 0.9994509302388466
Validation F1: 0.9994511806126547
Submission file 'submission.csv' prepared.


In [37]:
# test_df

Unnamed: 0,document,token_str,label
0,7,Design,O
1,7,Thinking,O
2,7,for,O
3,7,innovation,O
4,7,reflexion,O
...,...,...,...
8500,123,(,O
8501,123,https://www.melessa.uni-,O
8502,123,muenchen.de/team/vorstandssprecher/schmidt/pub...,O
8503,123,),O


In [38]:
# test_df['label'].value_counts()

label
O    8505
Name: count, dtype: int64

In [27]:
# train_df['label'].head()

0    O
1    O
2    O
3    O
4    O
Name: label, dtype: object

In [34]:
# train_df['label'].value_counts()

label
O                   4989794
B-NAME_STUDENT         1365
I-NAME_STUDENT         1096
B-URL_PERSONAL          110
B-ID_NUM                 78
B-EMAIL                  39
I-STREET_ADDRESS         20
I-PHONE_NUM              15
B-USERNAME                6
B-PHONE_NUM               6
B-STREET_ADDRESS          2
I-URL_PERSONAL            1
I-ID_NUM                  1
Name: count, dtype: int64

In [35]:
# train_df[train_df['label'] == 'I-URL_PERSONAL']

Unnamed: 0,document,token,label,token_str
83770,3202,nYZqnhEXw,I-URL_PERSONAL,nYZqnhEXw
