In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np
from torch.utils.data import Dataset
import json

In [2]:
class PiiDataset(Dataset):
    """Map-style dataset pairing encoded inputs with encoded label sequences.

    Args:
        text_input_ids: Sized, indexable collection; element ``i`` is served
            under the ``"input_ids"`` key.
        labels_input_ids: Sized, indexable collection of the same length;
            element ``i`` is served under the ``"labels"`` key.

    Raises:
        ValueError: If the two collections do not have the same length.
    """

    def __init__(self, text_input_ids, labels_input_ids):
        n_inputs = len(text_input_ids)
        n_labels = len(labels_input_ids)
        if n_inputs != n_labels:
            raise ValueError("Length of text_input_ids and labels_input_ids should be same")

        # Stored as given (no copy, no padding, no tensor conversion).
        self.text_input_ids = text_input_ids
        self.labels_input_ids = labels_input_ids

    def __len__(self):
        """Return the number of (input, label) pairs."""
        return len(self.text_input_ids)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as a dict with ``input_ids`` and ``labels``."""
        inputs = self.text_input_ids[idx]
        targets = self.labels_input_ids[idx]
        return {"input_ids": inputs, "labels": targets}

In [3]:
train_data_path = "pii-detection-data/train.json"
test_data_path = "pii-detection-data/test.json"

# Loading Dataset
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open(train_data_path, encoding="utf-8") as file:
    train_data_json = json.load(file)
# Report the record count once the file handle has been released.
print("Training Data: ", len(train_data_json))

with open(test_data_path, encoding="utf-8") as file:
    test_data_json = json.load(file)
print("Test Data: ", len(test_data_json))

Training Data:  6807
Test Data:  10


In [4]:
# Split the labelled training file by position: the first 80% of the records
# become the training split and the remaining tail becomes the held-out split.
# Note that `test_data` is carved out of `train_data_json`; this split does not
# read `test_data_json`.
train_data_size = int(0.8 * len(train_data_json))  # int() truncates toward zero
print("Train Data Size: ", train_data_size)

train_data, test_data = (
    train_data_json[:train_data_size],
    train_data_json[train_data_size:],
)

Train Data Size:  5445


In [5]:
# Sanity check: record counts of the two slices created by the positional split.
n_train_records = len(train_data)
n_held_out_records = len(test_data)
print("Train Data: ", n_train_records)
print("Test Data: ", n_held_out_records)

Train Data:  5445
Test Data:  1362


In [6]:
# Peek at the first training record: character length of the raw text, number
# of label entries, and the first label value.
first_record = train_data[0]
first_record_labels = first_record['labels']
print("Length of Text", len(first_record['full_text']))
print("Length of Labels", len(first_record_labels))
print("Example of Label", first_record_labels[0])

Length of Text 3709
Length of Labels 753
Example of Label O


In [7]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint.
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Probe how a few short digit strings are encoded; the result is the cell output.
digit_probe = ["0", "01", "20"]
tokenizer(digit_probe)

{'input_ids': [[3, 632, 1], [7088, 1], [460, 1]], 'attention_mask': [[1, 1, 1], [1, 1], [1, 1]]}

In [8]:
# Compare two ways of encoding text: one tokenizer call on a whole string
# versus one tokenizer call per pre-split dataset token.
text = "Design Thinking for innovation reflexion-Avril"
whole_string_encoding = tokenizer(text)
test_text = whole_string_encoding["input_ids"]

first_doc_tokens = train_data[0]["tokens"]
test_ids = [tokenizer(tok)["input_ids"] for tok in first_doc_tokens]

print(first_doc_tokens[0])
print(len(test_ids))
print(test_text)
print(test_ids)

Design
753
[1642, 20273, 21, 4337, 3, 60, 31898, 18, 188, 208, 52, 173, 1]
[[1642, 1], [20273, 1], [21, 1], [4337, 1], [3, 60, 31898, 1], [3, 18, 1], [71, 208, 52, 173, 1], [460, 2658, 1], [3, 18, 1], [9267, 1024, 1896, 1], [5224, 195, 9, 1], [1], [7729, 1], [3, 184, 1], [1801, 1], [1], [37, 1], [1464, 1], [27, 1], [169, 1], [12, 1], [199, 1], [66, 1], [10588, 1], [2342, 1], [70, 1], [194, 1], [190, 1], [8, 1], [11641, 1], [13, 1], [3, 9, 1], [516, 1], [19, 1], [8, 1], [1], [809, 1], [2828, 1], [3, 5, 1], [1], [363, 1], [1776, 1], [19, 1], [3, 9, 1], [809, 1], [2828, 1], [3, 58, 1], [2150, 1], [12, 1], [8, 1], [4903, 1], [13, 1], [4708, 10241, 1], [332, 5, 1], [11, 1], [4708, 10241, 1], [272, 5, 1], [41, 1], [5247, 1], [3, 6, 1], [2973, 7, 630, 1], [3, 18, 1], [5387, 1], [1], [3, 40, 31, 29563, 1], [3, 5, 1], [1919, 1], [3, 10, 1], [622, 1], [7983, 10569, 7, 1], [3, 26, 31, 7395, 2565, 2121, 1], [3, 5, 1], [3, 61, 1], [3, 6, 1], [8, 1], [809, 1], [2828, 1], [41, 1], [42, 1], [3, 88, 45

In [9]:
# Space-join the first document's label tags into one string and run that
# string through the tokenizer like ordinary text.
first_doc_labels = train_data[0]["labels"]
full_text_test = " ".join(first_doc_labels)
print(full_text_test)

label_encoding = tokenizer(full_text_test)
print(label_encoding)

Token indices sequence length is longer than the specified maximum sequence length for this model (796 > 512). Running this sequence through the model will result in indexing errors


O O O O O O O O O B-NAME_STUDENT I-NAME_STUDENT O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-NAME_STU

In [10]:
train_text_input_ids = []
train_labels_input_ids = []
# Window size, counted in dataset tokens (entries of data["tokens"]), not in
# tokenizer ids; the encoded window can therefore contain more ids than this.
max_length = 400

for data in train_data:
    tokens, labels = data["tokens"], data["labels"]
    # Walk the document in consecutive windows of `max_length` tokens; the
    # final window may be shorter. An empty document contributes no windows.
    for chunk_start in range(0, len(tokens), max_length):
        # Deliberately not called `batch_size`: that global name is used for the
        # trainer batch size in the training-arguments cell and must not be
        # silently overwritten with a slice index.
        chunk_end = min(chunk_start + max_length, len(tokens))

        # Text and labels are sliced with the same bounds so each encoded text
        # window is paired with the labels of exactly those tokens.
        train_text_input_ids.append(tokenizer(" ".join(tokens[chunk_start:chunk_end]))["input_ids"])
        train_labels_input_ids.append(tokenizer(" ".join(labels[chunk_start:chunk_end]))["input_ids"])
    

In [11]:
# Number of dataset tokens in the first training document.
first_doc_tokens = train_data[0]["tokens"]
print(len(first_doc_tokens))

753


In [12]:
# Dry run of the windowing loop on the first training document: prints the
# (start, end) bounds of every window without tokenizing anything.
first_doc_len = len(train_data[0]["tokens"])
for j in range(0, first_doc_len, max_length):
    # `window_end` instead of `batch_size`, so this scratch cell does not
    # overwrite the trainer batch size when cells are run out of order.
    window_end = min(j + max_length, first_doc_len)
    print(j, window_end)
# After the loop `j` still holds the start index of the last window.
print(j)
if j < first_doc_len:
    print("Last Batch", j, first_doc_len)

0 400
400 753
400
Last Batch 400 753


In [13]:
# Inspect the first encoded training window: id count and ids for the text,
# then the same for its label sequence.
first_text_ids = train_text_input_ids[0]
first_label_ids = train_labels_input_ids[0]
print(len(first_text_ids))
print(first_text_ids)
print(len(first_label_ids))
print(first_label_ids)

465
[1642, 20273, 21, 4337, 3, 60, 31898, 3, 18, 71, 208, 52, 173, 460, 2658, 3, 18, 9267, 1024, 1896, 5224, 195, 9, 7729, 3, 184, 1801, 37, 1464, 27, 169, 12, 199, 66, 10588, 2342, 70, 194, 190, 8, 11641, 13, 3, 9, 516, 19, 8, 809, 2828, 3, 5, 363, 1776, 19, 3, 9, 809, 2828, 3, 58, 2150, 12, 8, 4903, 13, 4708, 10241, 332, 5, 11, 4708, 10241, 272, 5, 41, 5247, 3, 6, 2973, 7, 630, 3, 18, 5387, 3, 40, 31, 29563, 3, 5, 1919, 3, 10, 622, 7983, 10569, 7, 3, 26, 31, 7395, 2565, 2121, 3, 5, 3, 61, 3, 6, 8, 809, 2828, 41, 42, 3, 88, 450, 3040, 6423, 3, 61, 19, 3, 9, 7693, 6497, 3317, 24, 6963, 8, 793, 11850, 13, 8, 809, 11, 1250, 8, 2241, 3, 31, 7, 1055, 12, 36, 1883, 3, 5, 205, 89, 8977, 226, 536, 100, 1464, 65, 186, 7648, 3, 10, 1697, 94, 19, 3551, 12, 66, 11, 405, 59, 1457, 1516, 1037, 1729, 11, 54, 36, 612, 1224, 1697, 94, 19, 3, 24079, 1697, 94, 1250, 9624, 122, 127, 1707, 11, 17988, 13, 251, 1697, 94, 54, 36, 2930, 12, 136, 686, 13, 1419, 3, 10, 2232, 14867, 3, 6, 682, 11795, 3, 6, 1693,

In [14]:
test_text_input_ids = []
test_labels_input_ids = []
# One example per held-out document: only its first `max_length` tokens (and
# the matching labels) are encoded; anything beyond that is left out of the
# evaluation set. `max_length` is the window size defined in the training
# windowing cell, reused here so the two sizes cannot drift apart.
for data in test_data:
    test_text_input_ids.append(tokenizer(" ".join(data["tokens"][:max_length]))["input_ids"])
    test_labels_input_ids.append(tokenizer(" ".join(data["labels"][:max_length]))["input_ids"])

In [17]:
# Inspect the first encoded held-out example: for the text ids and then the
# label ids, print the length followed by the ids themselves.
for encoded in (test_text_input_ids[0], test_labels_input_ids[0]):
    print(len(encoded))
    print(encoded)

238
[6630, 3289, 6008, 16, 13226, 3732, 3, 5, 209, 7, 17, 1726, 4769, 21, 1277, 24, 54, 36, 3934, 16, 8, 512, 3, 6, 45, 204, 12, 6180, 7393, 7, 3, 6, 48, 97, 56, 36, 2425, 12, 2342, 8, 1277, 24, 7864, 8, 831, 1124, 204, 727, 1726, 31713, 11, 2025, 12, 8, 1475, 16, 3, 9, 1126, 97, 3, 6, 45, 204, 12, 220, 767, 3, 6, 56, 36, 1316, 12, 2331, 91, 8, 1530, 53, 13, 8, 9701, 3, 6, 574, 8, 981, 12311, 3, 6, 143, 8, 1942, 190, 135, 3, 6, 11, 4797, 8, 3365, 2175, 338, 8, 796, 1308, 13, 16003, 667, 5946, 4211, 12, 356, 8, 2025, 1124, 220, 52, 26, 1726, 21582, 257, 54, 36, 787, 11609, 28, 8, 761, 3, 6, 45, 204, 12, 220, 3, 6, 767, 3, 6, 8, 2545, 13, 8, 1437, 56, 36, 1702, 3, 6, 11, 762, 24, 174, 12, 36, 8794, 3, 6, 4850, 579, 3, 6, 2986, 11, 2453, 3, 6, 166, 3830, 11, 889, 758, 3, 6, 859, 717, 3, 5, 314, 189, 1726, 3527, 53, 11, 5002, 3, 6, 728, 8, 220, 1767, 6518, 43, 118, 2012, 3, 6, 1099, 7, 578, 20079, 3600, 120, 12179, 66, 13, 8, 756, 3, 6, 8, 6566, 7, 56, 582, 294, 13, 8, 999, 433, 3, 6, 78, 

In [19]:
batch_size = 32
# Keep only the part after the last "/" so an "org/name" checkpoint id yields "name".
model_name = model_checkpoint.rsplit("/", 1)[-1]

# NOTE(review): the output directory keeps an "-xsum" suffix although the data
# loaded in this notebook is PII-detection JSON; presumably a leftover name.
# The zip cell further down hard-codes the same directory, so rename both together.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-xsum",
    # optimisation
    num_train_epochs=50,
    learning_rate=2e-3,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing / publishing
    save_total_limit=3,
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
# Wrap the encoded id lists in dataset objects; PiiDataset raises ValueError
# if a text list and its label list differ in length.
train_dataset = PiiDataset(
    text_input_ids=train_text_input_ids,
    labels_input_ids=train_labels_input_ids,
)
test_dataset = PiiDataset(
    text_input_ids=test_text_input_ids,
    labels_input_ids=test_labels_input_ids,
)

In [21]:
# Assemble the trainer from the objects built in the previous cells; the
# held-out split is passed as the evaluation dataset.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Kept as the last expression so the value returned by train() is the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
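# NOTE(review): stale scratch cell. `AdditionDataset` is not defined in the visible part of
# this notebook, and the saved output below (pairs of 3-digit numbers) appears to come from
# a different, arithmetic experiment rather than from the PII data used above.
# test = AdditionDataset(2,"test")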
# count = 0
# for index in range(len(test)):
#   device = 'cuda'
#   preds = model.generate(input_ids = torch.tensor(test[index]["input_ids"]).to(device).view(1,-1))
#   count+=1.0*(int(tokenizer.decode(np.array(preds.cpu()[0]))[5:-4]) == int(tokenizer.decode(np.array(test[index]["labels"]))[:-4]))
#   if index%10==0:
#     print(tokenizer.decode(np.array(preds.cpu()[0]))[5:-4],tokenizer.decode(np.array(test[index]["labels"]))[:-4])
# count/len(test)

 126 126
 104 104
 072 072
 104 104
 027 027
 062 062
 014 014
 167 168
 100 100
 062 062
 082 082
 057 057
 174 174
 123 123
 061 061
 123 123
 027 027
 072 072
 139 139
 069 069
 089 089
 036 036
 130 130
 142 142
 067 067
 122 122
 049 049
 089 089
 112 112
 067 067
 007 007
 121 121
 060 060
 138 138
 136 137
 109 109
 110 110
 096 096
 106 106
 084 084
 063 063
 011 010
 101 101
 032 032
 056 056
 157 157
 160 160
 125 125
 104 104
 132 132
 057 057
 135 135
 034 034
 099 099
 124 124
 021 021
 051 051
 146 146
 108 108
 086 086
 146 146
 155 155
 046 046
 121 121
 119 119
 084 084
 083 083
 107 107
 115 115
 046 046
 065 065
 071 071
 090 090
 113 113
 159 159
 044 044
 079 079
 113 113
 158 157
 070 070
 056 056
 150 150
 110 110
 142 142
 167 168
 062 062
 133 133
 056 056
 100 100
 142 142
 173 173
 098 098
 134 134
 108 108
 108 108
 101 101
 064 064
 115 115
 060 060
 146 146


0.958

In [None]:
# Display the concrete class of the object bound to `model`.
type(model)

transformers.models.t5.modeling_t5.T5ForConditionalGeneration

In [None]:
# Recursively zip the `t5-small-finetuned-xsum/` directory into an archive named `mathformer`.
# NOTE(review): the directory name is hard-coded here; it matches the output directory
# f"{model_name}-finetuned-xsum" only while the checkpoint is "t5-small" — keep the two in sync.
! zip -r mathformer t5-small-finetuned-xsum/

  adding: t5-small-finetuned-xsum/ (stored 0%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/ (stored 0%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/tokenizer_config.json (deflated 80%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/tokenizer.json (deflated 74%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/rng_state.pth (deflated 27%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/trainer_state.json (deflated 82%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/pytorch_model.bin (deflated 9%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/special_tokens_map.json (deflated 83%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/config.json (deflated 62%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/scheduler.pt (deflated 49%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/optimizer.pt (deflated 6%)
  adding: t5-small-finetuned-xsum/checkpoint-3500/training_args.bin (deflated 48%)
  adding: t5-small-finetuned-xsum/checkpoint-3000/ (stored 0%)
  adding: t