Config

In [30]:
class Config:
	exp = "002"
	ver = "001"
	
	seed = 42

	num_proc = 4

	threshold = 0.99
	max_length = 1024

	model_name = "deberta3base-truncation-false"
	freeze_layers = 0

	save_path = f"/kaggle/input/{model_name}-{exp}-{ver}"

	output_dir = "/kaggle/output"

	model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-small"
	
	train_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
	test_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
	moredata_path = "/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json"
	pii_dataset_fixed_path = "/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json"

	all_labels = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']
	num_pii_labels = len(all_labels)-1
	label2id = {label: index for index, label in enumerate(all_labels)}
	id2label = {index: label for index, label in enumerate(all_labels)}

TabError: inconsistent use of tabs and spaces in indentation (597210843.py, line 13)

In [None]:
import json, argparse, torch, sys, random, gc, os
import numpy as np
import pandas as pd
import functools
from itertools import chain
from functools import partial
from pathlib import Path
import ctypes

import torch.nn as nn
import pytorch_lightning as pl

# Transformer
from transformers import (
	AutoTokenizer,
	Trainer,
	TrainingArguments,
	AutoConfig,
	AutoModel,
	AutoModelForTokenClassification,
	DataCollatorForTokenClassification,
	PreTrainedTokenizer,
	PreTrainedModel,
	PretrainedConfig,
	DebertaV2Config,
)
from datasets import Dataset, features
from typing import Iterable, Any, Callable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from seqeval.metrics import recall_score, precision_score

In [None]:
# Seed the same seed to all 
def seed_everything(seed=42):
	random.seed(seed)
	np.random.seed(seed)
	torch.manual_seed(seed)

seed_everything(Config.seed)


libc = ctypes.CDLL("libc.so.6")
def clear_memory():
	libc.malloc_trim(0)
	torch.cuda.empty_cache()
	gc.collect()

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {DEVICE}")

Device: cuda


Pre Processeing

In [None]:
def load_data():
	# Load training data
	train_data = pd.read_json(Path(Config.train_path))
	print(f"kaggle train data = {len(train_data)}")

	more_data = pd.read_json(Path(Config.moredata_path))
	print(f"more data = {len(more_data)}")

	pii_dataset_fixed = pd.read_json(Path(Config.pii_dataset_fixed_path))
	print(f"pii_dataset_fixed = {len(pii_dataset_fixed)}")

	# Combine to a single df
	df = pd.concat([train_data, more_data, pii_dataset_fixed])
	df["document"] = [i for i in range(len(df))]  # Update the document id
	df.reset_index(drop=True, inplace=True)

	return df


# Eencode labels to columns
def encode_labels(df: pd.DataFrame):
	total = len(df)
	df["unique_labels"] = df["labels"].apply(
		lambda labels: list(
			set([label.split("-")[1] for label in labels if label != "O"])
		)
	)
	mlb = MultiLabelBinarizer()
	one_hot_encoded = mlb.fit_transform(df["unique_labels"])
	one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_)
	df = pd.concat([df, one_hot_df], axis=1)
	# add 'POS' column that don't have
	df["others"] = df["unique_labels"].apply(lambda x: 1 if len(x) == 0 else 0)
	label_classes = list(mlb.classes_) + ["others"]
	for col in label_classes:
		subtotal = df[col].sum()
		percent = subtotal / total * 100
		print(f"{col}: {subtotal}  ({percent:.1f}%)")
	return df, label_classes

In [None]:
def split_df_by_sampling(df: pd.DataFrame, n_samples: int):
	# Get the sample df
	samples_df = df.sample(n=n_samples, random_state=Config.seed)
	# The remaining df
	cond = df["document"].isin(samples_df["document"])
	others_df = df.drop(df[cond].index, inplace=False)
	return samples_df, others_df


def downsample_df(df: pd.DataFrame):
	"""Split the df into training and valid dataset"""
	df["is_labels"] = df["labels"].apply(
		lambda labels: any(label != "O" for label in labels)
	)

	# One or more labels are not 'O'
	true_labels = df[df["is_labels"]]
	# all labels are 'O'
	false_labels = df[~df["is_labels"]]

	# Reset index to two df
	true_labels = true_labels.reset_index(drop=True, inplace=False)
	false_labels = false_labels.reset_index(drop=True, inplace=False)
	print(f"Number of true_labels = {len(true_labels)}")
	print(f"Number of false_labels = {len(false_labels)}")

	# Get 300 as valid dataset
	n_true_samples = len(true_labels) - int(300 * len(true_labels) / len(df))

	# Sample true labels
	true_samples, true_others = split_df_by_sampling(true_labels, n_true_samples)
	print(f"true_samples = {len(true_samples)} true_others = {len(true_others)}")
	n_samples = len(false_labels) - (300 - int(300 * len(true_labels) / len(df)))
	# Sample false labels
	false_samples, false_others = split_df_by_sampling(false_labels, n_samples)
	print(f"false_samples = {len(false_samples)} false_others = {len(false_others)}")
	# Training ds = P * true_labels + P * false_labels
	train_df = pd.concat([true_samples, false_samples])
	# Valid ds = (1-P) * true_labels + (1-P) * false_labels
	valid_df = pd.concat([true_others, false_others])
	return train_df, valid_df

In [None]:
# Tokenize function
def tokenize(example: pd.DataFrame, tokenizer: PreTrainedTokenizer, label2id: dict[str, int]):
	# Preprocess the tokens and labels by adding trailing whitespace and labels
	tokens = []
	labels = []
	for token, label, t_ws in zip(
		example["tokens"], example["provided_labels"], example["trailing_whitespace"]
	):
		tokens.append(token)
		labels.extend([label] * len(token))
		# Added trailing whitespace and label if true and
		if t_ws:
			tokens.append(" ")
			labels.append("O")

	text = "".join(tokens)
	# print(f"len(text)={len(text)}, len(tokens)={len(tokens)}")
	# tokenization without truncation
	tokenized = tokenizer(text, return_offsets_mapping=True, truncation=False)
	labels = np.array(labels)
	# Labels
	token_labels = []
	for start_idx, end_idx in tokenized.offset_mapping:
		# Added 'O'
		if start_idx == 0 and end_idx == 0:
			token_labels.append(label2id["O"])
		else:
			# case when the text starts with whitespace
			if text[start_idx].isspace():
				start_idx += 1
			# Convert label to id (int)
			label_id = label2id[labels[start_idx]]
			token_labels.append(label_id)

	return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

# Convert df to tokenized dataset
def create_dataset(df: pd.DataFrame, tokenizer: PreTrainedTokenizer, label2id: dict[str, int]):
	ds = Dataset.from_dict(
		{
			"full_text": df["full_text"].tolist(),
			"document": df["document"].astype("string"),
			"tokens": df["tokens"].tolist(),
			"trailing_whitespace": df["trailing_whitespace"].tolist(),
			"provided_labels": df["labels"].tolist(),
		}
	)
	# Tokenize the dataset
	tokenized_ds = ds.map(
		tokenize,
		fn_kwargs={"tokenizer": tokenizer, "label2id": label2id},
		num_proc=Config.num_proc,
	)
	return tokenized_ds

In [None]:
df = load_data()

# Split 'df' into training and valid dataset (300) based on whether the row is all 'O' or not. 
train_df, valid_df = downsample_df(df.copy())
train_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)
print(f"Number of train_df = {len(train_df)}")
print(f"Number of valid_df = {len(valid_df)}")
clear_memory()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
train_ds = create_dataset(train_df, tokenizer, Config.label2id)
valid_ds = create_dataset(valid_df, tokenizer, Config.label2id)

Model

In [None]:
def post_processing_preds(preds: torch.Tensor, is_train: bool = True):
	preds_final = []
	if is_train:
		preds_softmax: torch.Tensor = np.exp(preds) / np.sum(
			np.exp(preds), axis=2
		).reshape(preds.shape[0], preds.shape[1], 1)
	else:
		preds_softmax: torch.Tensor = np.exp(preds) / np.sum(
			np.exp(preds), axis=2
		).reshape(preds.shape[0], preds.shape[1], -1)
	# Get the maximal value as the final preds
	preds = preds.argmax(-1)
	preds_without_O = preds_softmax[:, :, : Config.num_pii_labels].argmax(
		-1
	)  # Prob of entity labels (like 'NAME_STUDENT')
	O_preds = preds_softmax[:, :, Config.num_pii_labels]  # Prob for 'O'

	preds_final = np.where(O_preds < Config.threshold, preds_without_O, preds)
	return preds_final

In [None]:
# Compute the model performance metrics using `seqeval`
def compute_metrics(preds, all_labels):    
	try:
		#print("Compute metrics")
		predictions, labels = preds
		# predictions = np.argmax(predictions, axis=2)
		predictions = post_processing_preds(predictions)
		# Include prediction Remove ignored index (special tokens)
		true_preds = []
		true_labels = []
		for pred, label in zip(predictions, labels):
			true_preds.append([all_labels[p] for p, l in zip(pred, label) if l != -100])
			true_labels.append([all_labels[l] for p, l in zip(pred, label) if l != -100])
	
		# Compute recall, precision and f1 score
		recall = recall_score(true_labels, true_preds)
		precision = precision_score(true_labels, true_preds)
		# f5 score to measure the performance
		f5_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
		result = {'f5': f5_score,  
				  'recall': recall,
				  'precision': precision}
		print(f"result = {result}")
		return result
	except Exception as e: 
		print(e)

In [29]:
class LSTMHead(nn.Module):
	def __init__(self, in_features, hidden_dim, n_layers):
		super().__init__()
		self.lstm = nn.LSTM(
			in_features,
			hidden_dim,
			n_layers,
			batch_first=True,
			bidirectional=True,
			dropout=0.1,
		)
		self.out_features = hidden_dim

	def forward(self, x):
		self.lstm.flatten_parameters()
		hidden, (_, _) = self.lstm(x)
		out = hidden
		return out


class DebertaLSTMModel(pl.LightningModule):
	def __init__(self):
		super(DebertaLSTMModel, self).__init__()
		self.model_config: DebertaV2Config = AutoConfig.from_pretrained(Config.model_path)
		
		hidden_dropout_prob: float = 0.1
		layer_norm_eps: float = 1e-7
		self.model_config.update(
			{
				"output_hidden_states": True,
				"hidden_dropout_prob": hidden_dropout_prob,
				"layer_norm_eps": layer_norm_eps,
				"add_pooling_layer": False,
			}
		)

		self.transformers_model: PreTrainedModel = AutoModel.from_pretrained(
			Config.model_path
		)
		self.head = LSTMHead(in_features=self.model_config.hidden_size, hidden_dim=self.model_config.hidden_size//2, n_layers=1)
		
		self.output = nn.Linear(self.model_config.hidden_size, len(self.cfg.target_cols))
		
		if Config.freeze_layers>0:
			print(f'Freezing {Config.freeze_layers} layers.')
			for layer in self.transformers_model.longformer.encoder.layer[:config.freeze_layers]:
				for param in layer.parameters():
					param.requires_grad = False


		self.loss_function = nn.CrossEntropyLoss(reduction='mean',ignore_index=-100) 
		self.validation_step_outputs = []

	def forward(self, input_ids, attention_mask,train):        
		transformer_out = self.transformers_model(input_ids,attention_mask = attention_mask)
		sequence_output = transformer_out.last_hidden_state
		sequence_output = self.head(sequence_output)
		logits = self.output(sequence_output)

		return (logits, _)
	

	def training_step(self,batch,batch_idx):
		input_ids = batch['input_ids']
		attention_mask = batch['attention_mask']
		target = batch['labels'] 

		outputs = self(input_ids,attention_mask,train=True)
		output = outputs[0]
		loss = self.loss_function(output.view(-1,len(self.cfg.target_cols)), target.view(-1))
		
		self.log('train_loss', loss , prog_bar=True)
		return {'loss': loss}

	def train_epoch_end(self,outputs):
		avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
		print(f'epoch {trainer.current_epoch} training loss {avg_loss}')
		return {'train_loss': avg_loss} 
	
	def validation_step(self,batch,batch_idx):
		input_ids = batch['input_ids']
		attention_mask = batch['attention_mask']
		target = batch['labels'] 

		outputs = self(input_ids,attention_mask,train=False)
		output = outputs[0]

		loss = self.loss_function(output.view(-1,len(self.cfg.target_cols)), target.view(-1))
		
		self.log('val_loss', loss , prog_bar=True)
		self.validation_step_outputs.append({"val_loss": loss, "logits": output, "targets": target})
		return {'val_loss': loss, 'logits': output,'targets':target}

	def on_validation_epoch_end(self):
		outputs = self.validation_step_outputs
		avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

		flattened_preds = [logit for batch in outputs for logit in batch['logits']]

		flattened_preds = process_predictions(flattened_preds)
		# print(flattened_preds.shape)
		pred_df = predictions_to_df(flattened_preds, self.val_ds)
		
		print(pred_df.shape)
		print(pred_df)
		
		self.validation_step_outputs = []

		# print(output_val.shape)
		avg_score = compute_metrics(pred_df,self.true_val_df)
		f5_score = avg_score['ents_f5']
		print(f'epoch {trainer.current_epoch} validation loss {avg_loss}')
		print(f'epoch {trainer.current_epoch} validation scores {avg_score}')
		
		return {'val_loss': avg_loss,'val_f5':f5_score}
	
	def train_dataloader(self):
		return self._train_dataloader 
	
	def validation_dataloader(self):
		return self._validation_dataloader

	def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
		param_optimizer = list(model.named_parameters())
		no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
		optimizer_parameters = [
			{'params': [p for n, p in self.transformers_model.named_parameters() if not any(nd in n for nd in no_decay)],
			 'lr': encoder_lr, 'weight_decay': weight_decay},
			{'params': [p for n, p in self.transformers_model.named_parameters() if any(nd in n for nd in no_decay)],
			 'lr': encoder_lr, 'weight_decay': 0.0},
			{'params': [p for n, p in self.named_parameters() if "transformers_model" not in n],
			 'lr': decoder_lr, 'weight_decay': 0.0}
		]
		return optimizer_parameters

	def configure_optimizers(self):
		optimizer = AdamW(self.parameters(), lr = config.learning_rate)

		epoch_steps = self.cfg.data_length
		batch_size = self.cfg.batch_size

		warmup_steps = 0.05 * epoch_steps // batch_size
		training_steps = self.cfg.epochs * epoch_steps // batch_size
		# scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,training_steps,-1)
		# scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, warmup_steps, training_steps, lr_end=1e-6, power=3.0)
		scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps, num_cycles=0.5)
		
		lr_scheduler_config = {
				'scheduler': scheduler,
				'interval': 'step',
				'frequency': 1,
			}

		return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_config}

TabError: inconsistent use of tabs and spaces in indentation (2993439985.py, line 40)

Train

In [None]:
# # Training requires the GPUs and internet
# TRAINING = True # True: Model Training, False: Model Inference
# if TRAINING: 
#     # Configuration class containing various model and training parameters
#     trainer = ModelTrainer()
#     trainer.train(train_df, valid_df)

Infer

In [None]:
# Model Inferer
class ModelInfer:
	def __init__(self):
		self.infer_dir = "/kaggle/working/infer" # Model infer output 
		self.load_model()

	def load_model(self):
		# Create the tokenizer
		self.tokenizer = AutoTokenizer.from_pretrained(Config.model_path) 
		# Create the model
		self.model = AutoModelForTokenClassification.from_pretrained(Config.model_path)        
		# # Load the fine-tuned adapter layer on top of base model
		# self.model = self.model.to(DEVICE)n
		print("Complete loading pretrained LLM model")     
	
	def infer_preds(self, ds: Dataset):
		# Tokenize the dataset using customized Tokenizer (the same as Training Tokenizer)
		tokenized_ds = ds.map(tokenize, fn_kwargs={"tokenizer": self.tokenizer}, num_proc=2)
		# Create data loader
		data_collator = DataCollatorForTokenClassification(self.tokenizer,
														   pad_to_multiple_of=16)
		# Arguments (infer only)
		args = TrainingArguments(output_dir=self.infer_dir,
								 per_device_eval_batch_size=1, 
								 report_to="none")
		# Create the trainer 
		trainer = Trainer(model=self.model, 
						  args=args, 
						  data_collator=data_collator, 
						  tokenizer=self.tokenizer)
		
		# predict for that split
		preds = trainer.predict(tokenized_ds).predictions
				
		# Clear the unused memory
		del self.model, data_collator, trainer, args 
		clear_memory()
		preds_final = post_processing_preds(preds)
		return preds_final, tokenized_ds

In [None]:
test_data = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")

test_ds = Dataset.from_dict({
	"full_text": test_data["full_text"].tolist(),
	"document": test_data["document"].tolist(),
	"tokens": test_data["tokens"].tolist(),
	"trailing_whitespace": test_data["trailing_whitespace"].tolist(),
})
print(f"Total number of test dataset {len(test_ds)}")
# config = json.load(open(Path(Config.model_path) / "config.json"))
# id2label = config["id2label"]
# Load the pretrained model and make the predictions
inferer = ModelInfer()
preds_final, tokenized_ds = inferer.infer_preds(test_ds) 

Post Processing

In [None]:
# Convert preds to a list of dictionaries
results = []
for p, token_map, offsets, tokens, doc in zip(preds_final,
											  tokenized_ds["token_map"], 
											  tokenized_ds["offset_mapping"],
											  tokenized_ds["tokens"],
											  tokenized_ds["document"]):
	for token_pred, (start_idx, end_idx) in zip(p, offsets):
		try:
			label_pred = Config.id2label[str(token_pred)]
			if start_idx + end_idx == 0: 
				continue

			if token_map[start_idx] == -1:
				start_idx += 1
			 # ignore "\n\n"
			while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
				start_idx += 1

			if start_idx >= len(token_map): 
				break

			token_id = token_map[start_idx]

			# ignore "O" predictions and whitespace preds
			if label_pred != "O" and token_id != -1:
				results.append({
						"document": doc,
						"token": token_id,
						"label": label_pred,
						"token_str": tokens[token_id]
					})
				
		except Exception as e:
			print(f"Error {e}")
			print(f"token_map {len(token_map)} and {token_pred}  {start_idx} {end_idx}")
			sys.exit(-1)

In [None]:
import re
from spacy.lang.en import English
nlp = English()

def find_span(target: list[str], document: list[str]) -> list[list[int]]:
	idx = 0
	spans = []
	span = []
	
	for i, token in enumerate(document):
		if token != target[idx]:
			idx = 0
			span = []
			continue
		span.append(i)
		
		idx += 1
		if idx == len(target):
			spans.append(span)
			span = []
			idx = 0
			continue
	
	return spans

email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
emails = []
phone_nums = []

for _data in test_ds:
	# email
	for token_idx, token in enumerate(_data["tokens"]):
		if re.fullmatch(email_regex, token) is not None:
			emails.append(
				{"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
			)
	# phone number
	matches = phone_num_regex.findall(_data["full_text"])
	if not matches:
		continue
		
	for match in matches:
		target = [t.text for t in nlp.tokenizer(match)]
		matched_spans = find_span(target, _data["tokens"])
		
	for matched_span in matched_spans:
		for intermediate, token_idx in enumerate(matched_span):
			prefix = "I" if intermediate else "B"
			phone_nums.append(
				{"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": _data["tokens"][token_idx]}
			)

results.extend(emails)
results.extend(phone_nums)

def remove_duplicates(df: pd.DataFrame):
	# Sort by the document and token
	df.sort_values(by=['document', 'token'])
	# Combine three columns 
	df['triplet'] = df[["document", "token", "label"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1) 
	# display(df)
	# Drop duplicated triplets and keep the first one as unique row
	df = df.drop_duplicates(subset=["triplet"], keep='first')
	# Regenerate 'row_id'
	df['row_id'] = list(range(len(df)))    
	df = df.reset_index(drop=True, inplace=False) 
	print("Remove duplicates")
#     display(df)
	return df

In [None]:
test_df = pd.DataFrame(results)
test_df = remove_duplicates(test_df)
test_df = test_df[["row_id", "document", "token", "label"]]
# Create submission df
test_df.to_csv("submission.csv", index=False)
display(test_df)