In [1]:
import pandas as pd
import numpy as np
import torch
import csv
from sentence_transformers import SentenceTransformer, util
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as func
from torch.nn import Linear as lin
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import string
from nltk import word_tokenize
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import spacy
from torch.distributions.multinomial import Multinomial
from transformers import BertForTokenClassification
from transformers import PreTrainedTokenizer
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
2022-11-03 15:35:51.813528: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-03 15:35:51.813844: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-03 15:35:51.813913: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
df = pd.read_csv("Train_Tagged_Titles.tsv.gz", sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)

In [3]:
def get_text(df):
    text = []
    labels = []
    for i in range(1,5001):
        listing = df[df["Record Number"] == str(i)]
        
        text.append(listing["Token"].values.tolist())
        labels.append(listing["Tag"].values.tolist())
    return text, labels

In [4]:
last = None
for i in range(len(df)):
    if type(df["Tag"][i]) == float:
        df["Tag"][i] = last
    last = df["Tag"].values[i]
    

In [5]:
text, labels = get_text(df)

In [6]:
train_text, val_text, train_labels, val_labels = train_test_split(text, labels, test_size=0.2)

In [7]:
tag_to_id = {}
cnt = 0
for i in set(df["Tag"].values.tolist()):
    
    tag_to_id[i] = cnt
    cnt += 1

In [8]:
id_to_tag = {}
cnt = 0
for i in set(df["Tag"].values.tolist()):
    id_to_tag[cnt] = i
    cnt += 1

In [41]:

for i in set(df["Tag"].values.tolist()):
    print(len(df[df["Tag"] == i]), i)

6974 Brand
1125 Accents
54 Trim Material
402 Closure
3243 Material
4275 Color
509 Style
706 Features
7 Lining Material
3577 Model
5 Strap Drop
1010 Department
1226 Size
240 Country/Region of Manufacture
491 Theme
1704 Pattern
263 Measurement, dimension
11229 No Tag
631 Occasion
687 Fabric Type
92 Pocket Type
81 Hardware Material
26 Season
8 Handle Drop
303 Handle/Strap Material
1454 Product Line
107 Character
12449 Type
711 Obscure
512 Handle Style
969 MPN
51 Character Family


In [127]:
tokenizer(["hell hell he [PAD]", "yes yes"], return_offsets_mapping=True, padding=True, truncation=True)

{'input_ids': [[101, 2630, 2630, 1119, 0, 102], [101, 4208, 4208, 102, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0]], 'offset_mapping': [[(0, 0), (0, 4), (5, 9), (10, 12), (13, 18), (0, 0)], [(0, 0), (0, 3), (4, 7), (0, 0), (0, 0), (0, 0)]]}

In [126]:
tokenizer.pad_token

'[PAD]'

In [53]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [54]:
def encode_tags(tags, encodings):
    labels = []
    for doc in tags:
        f = []
        for tag in doc:
            f.append(tag_to_id[tag])
        labels.append(f)
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        d = doc_offset
        for i in range(40 - len(d)):
            d.append((0,0))
        doc_enc_labels = np.ones(40,dtype=int) * -100
        arr_offset = np.array(d)
        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = np.array(doc_labels)
        """last = 0
        c = arr_offset[:,1].flatten()
        for v in range(1, len(c)):
            if c[v] == 0:
                for lab in range(1, v+1):
                    if doc_enc_labels[lab] != -100:
                        last = doc_enc_labels[lab]
                    if doc_enc_labels[lab] == -100:
                        doc_enc_labels[lab] = last
                break
            """

        encoded_labels.append(doc_enc_labels.tolist())
    

    return encoded_labels

train_tags = encode_tags(train_labels, train_encodings)
val_tags = encode_tags(val_labels, val_encodings)

In [25]:
train_tags

[[-100,
  3,
  -100,
  2,
  27,
  27,
  7,
  9,
  20,
  15,
  15,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 [-100,
  20,
  27,
  27,
  -100,
  -100,
  19,
  19,
  23,
  -100,
  15,
  -100,
  15,
  26,
  -100,
  -100,
  -100,
  20,
  20,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 [-100,
  27,
  -100,
  -100,
  2,
  7,
  9,
  7,
  7,
  21,
  26,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100],
 [-100,
  20,
  -100,
  20,
  27,
  27,
  -100,
  31,
  31,
  1,
  12,
  23,
  23,
  7,
  20,
  -100,


In [52]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        for encoding in self.encodings.keys():
            self.encodings[encoding] = torch.tensor(self.encodings[encoding]).to("cuda")
        self.labels = torch.tensor(labels).to("cuda")

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [64]:

for i in range(len(val_encodings["attention_mask"])):
    for j in range(len(val_encodings["attention_mask"][i])):
        if val_encodings["attention_mask"][i][j] == 0:
            val_encodings["attention_mask"][i][j - 1] = 0
        if j == 0:
            val_encodings["attention_mask"][i][j] = 0
for i in range(len(train_encodings["attention_mask"])):
    for j in range(len(train_encodings["attention_mask"][i])):
        if train_encodings["attention_mask"][i][j] == 0:
            train_encodings["attention_mask"][i][j - 1] = 0
        if j == 0:
            train_encodings["attention_mask"][i][j] = 0

In [397]:
len(train_tags[0])

40

In [13]:
len(val_dataset[0]["labels"])

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [384]:
len(val_tags[0])

31

In [16]:
val_dataset.labels

tensor([], device='cuda:0')

In [55]:

train_dataset = Data(train_encodings, train_tags)
val_dataset = Data(val_encodings, val_tags)
train_encodings.pop("offset_mapping")
val_encodings.pop("offset_mapping")

tensor([[[0, 0],
         [0, 4],
         [4, 6],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],

        [[0, 0],
         [0, 2],
         [2, 3],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],

        [[0, 0],
         [0, 6],
         [0, 2],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],

        ...,

        [[0, 0],
         [0, 2],
         [2, 4],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],

        [[0, 0],
         [0, 2],
         [0, 2],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]],

        [[0, 0],
         [0, 2],
         [2, 4],
         ...,
         [0, 0],
         [0, 0],
         [0, 0]]], device='cuda:0')

In [18]:

from transformers import DistilBertForTokenClassification


In [435]:
class CustomTrainer(Trainer):
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        print(labels)
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        logits = outputs.get("logits")
        l1 = 0
        for param in model.classifier.parameters():
            l1 += torch.abs(param).sum()

        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(reduction="mean")
        print(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) + l1*0.000001
        return (loss, outputs) if return_outputs else loss

In [353]:
len(unique_tags)

32

In [77]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.temp = None
        self.bert = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=32, dropout=0, id2label=id_to_tag).to("cuda")
        for p in self.bert.distilbert.parameters():
            p.requires_grad = False
        self.h1 = Parameter(torch.randn((256, 768), requires_grad=True, device="cuda"))
        self.h2 = Parameter(torch.randn((128,256), requires_grad=True, device="cuda"))
        self.h3 = Parameter(torch.randn((128,128), requires_grad=True, device="cuda"))
        self.h4 = Parameter(torch.randn((64,128), requires_grad=True, device="cuda"))
        self.h5 = Parameter(torch.randn((32,64), requires_grad=True, device="cuda"))
    def forward(self, input_ids, attention_mask):
        bert_out = self.bert.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        print(bert_out[0].shape())
        x = func.linear(bert_out[0], self.h1)
        x = func.silu(x)
        #x = dropout(x)
        x = func.linear(x, self.h2)
        x = func.silu(x)
        #x = dropout(x)   
        #skip = x
        x = func.linear(x, self.h3)
        x = func.silu(x)
        #x = x + skip
        x = func.linear(x, self.h4)
        x = func.silu(x)
        x = torch.abs(func.linear(x, self.h5))
        return func.softmax(x, dim=2)
    def parameters(self):
        return [self.h1,self.h2,self.h3, self.h4, self.h5]

In [408]:
check

[tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 

In [42]:
a = torch.arange(0,24,1.0).reshape(-1,3,4)
print(a)
print(func.softmax(a[1][0], dim=0))

tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.]],

        [[12., 13., 14., 15.],
         [16., 17., 18., 19.],
         [20., 21., 22., 23.]]])
tensor([0.0321, 0.0871, 0.2369, 0.6439])


In [72]:
def parse_for_loss(out, tags):

    pred = []
    true = []
    for i in range(len(tags)):
        for j in range(len(tags[t])):
            if tags[i][j] != -100:
                init = torch.zeros((32,))
                init[tags[i][j].item()] = 1
                true.append(init.tolist())
                pred.append(out[i][j].tolist())
    return torch.tensor(pred).cuda(), torch.tensor(true).cuda()

    
    
    

In [48]:
val_dataset[0]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'input_ids': tensor([  101, 10309,  2829, 21843,  5898,  2308,  1005,  1055,  3244,  4524,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0], device='cuda:0'),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0'),
 'labels': tensor([-100,   27,   15,   23,    9,    3,    3, -100,    7,    7, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100], device='cuda:0')}

In [78]:
model = Model()
loss_fn = torch.nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
current_min = 999999999
num_since = 0
best_params = model.parameters()
for t in range(200000):
    inputs = train_dataset[torch.randperm(len(train_encodings["input_ids"]))[:250]]
    y_pred = model(inputs["input_ids"], inputs["attention_mask"])
    preds, true = parse_for_loss(y_pred, inputs["labels"])
    print(preds)
    loss = loss_fn(preds, true)
    for i in model.parameters():
        loss += (torch.square(i).sum())*0.0002
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step(lambda: loss)
    if t%100 == 0:
        print(t, loss)
    if loss.item() < current_min:
        num_since = 0
        current_min = loss
        best_params = model.parameters()
    elif num_since >= 5000:
        print("max iter")
        break
    
    else:
        num_since += 1

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [439]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    m = predictions - labels[labels != -100]
    print(m)
    return np.count_nonzero(m)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", per_device_eval_batch_size=250, num_train_epochs=200,disable_tqdm=False, learning_rate=0.0003, dataloader_pin_memory=False)

trainer = CustomTrainer(model=model,args=training_args,train_dataset=train_dataset,eval_dataset=val_dataset, tokenizer=tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [440]:
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 200
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, 

  0%|          | 0/100000 [00:32<?, ?it/s]


In [128]:
len(val_encodings["input_ids"][0])

31

In [129]:
len(val_dataset.labels)

1000

In [280]:
predicts = trainer.predict(val_dataset)


***** Running Prediction *****
  Num examples = 1000
  Batch size = 250
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


In [194]:
len(predicts.predictions[0])

31

In [226]:
val_encodings["attention_mask"]

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

In [281]:
cp = val_encodings["attention_mask"].cpu().numpy()


In [207]:
for i in cp:
    print(i)

[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0

In [216]:
v = np.where(cp == 1)

In [282]:
preds = np.argmax(predicts.predictions, axis=-1)

In [218]:
a = preds[v]

In [283]:
f = predicts.label_ids

In [285]:
preds

array([[ 8,  8, 12, ...,  8,  3,  1],
       [ 9, 12, 12, ...,  8,  9,  9],
       [12,  8, 12, ..., 12, 12, 12],
       ...,
       [ 8,  0, 25, ...,  1,  3,  3],
       [ 8,  8,  8, ...,  3,  1,  3],
       [ 8,  8, 12, ...,  3, 24, 24]])

In [284]:
true = 0
total = 0
for i in range(len(preds)):
    for j in range(len(preds[i])):
        if cp[i][j] == 1:

            if preds[i][j] == f[i][j]:
                true += 1
            if f[i][j] != -100:
                total += 1
print(true/total)

0.8258024027254797


In [132]:
for i in np.argmax(predicts.predictions[1][0], axis=-1):
    print(len(i))

12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
1

In [135]:
for i in preds:
    print(i)

[ 8  8  8 31  3  3  3  3  3  3  8 12  8  8 12  3  8  8  8  8  8  8  8  8
  8  8  8  8  8  3  8]
[ 8 12 12 12 12 12  3  3 12 12  8  3  3  8  8  8  8  8  8  8  8  8  8  8
  8  8  8  8  8  8  8]
[ 8  8 12 12 12 12 12 12  3 31  3  3  8  3  3  3  8  3  0  8  8  8  8  8
  8  8  8  8  8 12  8]
[ 8  8  3  3  3  8  8  8  8 27  3  3  3  3  0  8  8  8  8  8  8  8  8  8
  8  8  8  8  8  8  8]
[ 8 12 12 12 12  8  8  8  8  8  8 12 12  3  3  3  8  3  3  3  3  3  8  3
  3  8  0  8  8  8  8]
[ 8 12 12  3  3  3 31 27  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
  8  8  8  8  8  8  8]
[ 8  8  8  8  8 27  3  8  3  3  3  3  8  8 12  8  8  8  8  8  8  8  8  8
  8  8  8  8  8  8  8]
[ 8 12  8  8 12 12 12 12  8  3  3  8  3  8  3  3  3  3  0  8  8  8  8  8
  8  8  8  8  8 12  8]
[ 8 12 12 12 12  8  3  8  8  8  8  3 31  3  3  8  8  3  8  8  8  8  8  8
  8  8 12 12  8  8  8]
[ 8 12 12 12 12  8  8 12  3  8  3  3  3  8 31 31  3  0  8  8  8  8  8  8
  8  8 12  8 12  8  8]
[ 8  8  8  8  8 12  3 12  3 12  8  3  8 

In [70]:
for i in predicts.label_ids:
    print(i)

[-100   12   12   27    3 -100    1    1    3    8    8    1    8   30
    5    5   26 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100]
[-100   12   12 -100 -100   26 -100    9 -100 -100    9    3    3    8
    7 -100 -100 -100   12 -100    8 -100    8 -100 -100 -100 -100 -100
 -100 -100 -100]
[-100    8   12 -100    9 -100 -100 -100    3   31   25   25    9    3
 -100    3    8    3 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100]
[-100    0   11    3    3    8 -100    8    8   27    3 -100    3    3
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100]
[-100   12   12 -100 -100    7 -100 -100 -100   27 -100    9 -100 -100
    9 -100 -100   13    3    3    3   27 -100 -100 -100   31 -100 -100
 -100 -100 -100]
[-100   12 -100    3 -100    3   31   27    8 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100]
[-100    8 -100   12    9   27    6 -100    3 

In [32]:
for i in preds:
    print(i)

[ 7 13 13 13 13 13 13 13  7 13 13 13 10  7 27 10 10  7  7 13  7  7  7  7
  7  7  7  7 10 10 10 13 13 13 13 13  7  7 13 13  7 13 13 13 10 10 10 10]
[15  7 13 13 13 13  7 13 30 13 13 13 13 13 13 13 13 13 13 10  7 13 13 13
 13 13 10  7  7 13  7 13  7 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]
[15 13 13 13  7  7 13 13 10 20  7 13  7 13 13 13 20 13 20 13 20 13 10  7
 27 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]
[ 7 13 13 13  7 13 13 13 13 13 13 13 13  7 13 13 13 13 13 13 13 13 13 13
 13 13 13 14 24 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]
[ 7 13 13 13 13 13 13 13 13 13 13  7  7 13  7 13  7  7 18  7 18  7  7  7
  7  7  7  7  7  7  7  7 13  7  7  7 18  7 13  7 13 13 13  7  7 13  7 13]
[15 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]
[15 13  7 20 10 10 13 13 14 10 13 13 13 13 13  7 13 20 19 13 27 13 13 10
 10 10 10 13 13 13 13 13 10 13 13 10 10 10 13

In [145]:
len(predicts.label_ids[0])

50

In [144]:
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predicts.label_ids)

In [6]:
def get_labels(df, idx, t):
    out = [-100]
    listing = df[df["Record Number"] == str(idx + 1)]

    for idx, row in listing.iterrows():
        out += (t(row["Token"], padding='max_length', max_length = 512, truncation=True, return_tensors="pt")["input_ids"].argmin().item() - 2)*[tags[row["Tag"]]]
    out += [-100]*(512 - len(out))
    return out

In [None]:

total_data = None
for i in range(1,5001):
    if i == 1:
        total_data = df[df["Record Number"] == str(i)]
    else:
        total_data.append(df[df["Record Number"] == str(i)])
    


In [None]:

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
a = tokenizer(get_text(df), padding='max_length', max_length=512, is_split_into_words=True, return_offsets_mapping=True, truncation=True, return_tensors="pt")
labels = []
for i in range(len(a)):
    labels.append(get_labels(df, i, tokenizer))
labels = torch.tensor(labels)
a.token_type_ids = labels
    


In [86]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=32)
model.to("cuda")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/henry/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [9]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [19]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, d):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        a = tokenizer(get_text(d), padding='max_length', max_length=512, is_split_into_words=True, truncation=True, return_tensors="pt")
        labels = []
        for i in range(len(a)):
            labels.append(get_labels(d, i, tokenizer))
        labels = torch.tensor(labels)
        for k in a.keys():
            a[k].to(torch.int32)
        self.encodings = a
        self.labels = labels.to(torch.int32)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(torch.int32)
        return item

    def __len__(self):
        return len(self.labels)

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [32]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", per_device_eval_batch_size=100, num_train_epochs=100,disable_tqdm=False)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
d1 = DataSequence(df)
d2 = DataSequence(df)
trainer = Trainer(model=model,args=training_args,train_dataset=d1,eval_dataset=d2)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/henry/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/reso

In [33]:
trainer = Trainer(model=model,args=training_args,train_dataset=d1,eval_dataset=d2)

In [85]:
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 25
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12500

  0%|          | 0/12500 [01:16<?, ?it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                             

{'loss': 0.8358, 'learning_rate': 0.0, 'epoch': 1.0}


Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 100
                                                   

[A[A                                             
 55%|█████▌    | 2763/5000 [32:56<02:10, 17.12it/s]
[A
[A
[A

{'eval_loss': 0.8854020237922668, 'eval_runtime': 1.1701, 'eval_samples_per_second': 854.629, 'eval_steps_per_second': 8.546, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

In [84]:
cnt = 0
for i in model.parameters():
    cnt += 1
    i.requires_grad = False
    if cnt > 100:
        i.requires_grad = True

In [61]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, d):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        a = tokenizer(get_text(d), padding='max_length', max_length=512, is_split_into_words=True, truncation=True, return_tensors="pt")
        labels = []
        for i in range(len(a)):
            labels.append(get_labels(d, i, tokenizer))
        labels = torch.tensor(labels)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.texts = a
        self.labels = labels

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

In [39]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(tags))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [40]:
from tqdm import tqdm

In [60]:
torch.cuda.memory_allocated()/1024/1024/1024

1.2204227447509766

In [46]:
def train_loop(model, df):
    df_train, df_val = train_test_split(df, train_size=0.8)
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, num_workers=1, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, num_workers=1, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = torch.optim.Adam([[v for v in model.parameters()][-1]], lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_label[i] != -100]
              label_clean = val_label[i][val_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()

        print(f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')
torch.cuda.empty_cache()
LEARNING_RATE = 0.005
EPOCHS = 5
BATCH_SIZE = 2
model = BertModel()
train_loop(model,total_data)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/henry/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
