In [1]:
import pandas as pd
import re
import torch

def read_dataset(file_path):
    dataset = pd.read_csv(file_path)
    
    token_docs = []
    tag_docs = []
    for i in range(len(dataset)):
        
        tokens_length = len(dataset.loc[i,'tokens'])
        
        tokens = dataset.loc[i,'tokens']
        tags = dataset.loc[i,'tags']
        
        tokens = tokens.replace('\n','')
        tags = tags.replace('\n','')
        
        tokens = tokens.split(' ')
        tags = tags.split(' ')
        
        remove_set = {''}
        
        tokens = [remove_data for remove_data in tokens if remove_data not in remove_set]
        tags = [remove_data for remove_data in tags if remove_data not in remove_set]
        
        
        
        token_docs.append(tokens)
        tag_docs.append(tags)
    
    return token_docs, tag_docs            
    

In [2]:
train_file_path = './keyword_train.csv'
valid_file_path = './keyword_valid.csv'
train_texts, train_tags = read_dataset(train_file_path)
val_texts, val_tags = read_dataset(valid_file_path)

In [3]:
# for i in range(len(train_texts)):
#     print(train_texts[i][0:-1], train_tags[i][0:-1], sep='\n')
#     print('\n')

print(train_texts[970][0:], train_tags[970][0:], sep='\n')

['find', 'digimon', 'next']
['O', 'B-object_name', 'I-object_name']


In [4]:
print(len(train_texts))

3760


In [5]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [6]:
unique_tags = set(tag for doc in train_tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
# for key,value in tag2id.items():
#     tag2id[key] = value + 1
id2tag = {id: tag for tag, id in tag2id.items()}

In [7]:
print(unique_tags)
print('\n')
print(tag2id)
print('\n')
print(id2tag)

{'B-playlist', 'B-sort', 'I-service', 'B-music_item', 'B-object_type', 'O', 'I-object_type', 'I-artist', 'I-genre', 'I-sort', 'I-object_name', 'I-music_item', 'I-track', 'B-object_name', 'I-album', 'B-album', 'I-playlist', 'B-year', 'B-track', 'B-genre', 'B-artist', 'B-service'}


{'B-playlist': 0, 'B-sort': 1, 'I-service': 2, 'B-music_item': 3, 'B-object_type': 4, 'O': 5, 'I-object_type': 6, 'I-artist': 7, 'I-genre': 8, 'I-sort': 9, 'I-object_name': 10, 'I-music_item': 11, 'I-track': 12, 'B-object_name': 13, 'I-album': 14, 'B-album': 15, 'I-playlist': 16, 'B-year': 17, 'B-track': 18, 'B-genre': 19, 'B-artist': 20, 'B-service': 21}


{0: 'B-playlist', 1: 'B-sort', 2: 'I-service', 3: 'B-music_item', 4: 'B-object_type', 5: 'O', 6: 'I-object_type', 7: 'I-artist', 8: 'I-genre', 9: 'I-sort', 10: 'I-object_name', 11: 'I-music_item', 12: 'I-track', 13: 'B-object_name', 14: 'I-album', 15: 'B-album', 16: 'I-playlist', 17: 'B-year', 18: 'B-track', 19: 'B-genre', 20: 'B-artist', 21: 'B-service'}


In [8]:
# for key,value in tag2id.items():
#     tag2id[key] = value + 1
    
# id2tag = {id: tag for tag, id in tag2id.items()}

In [9]:
import numpy as np

def encode_tags(tags, encodings):
    # print(tags)
    # num = 0
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # print(arr_offset)
        # print('\n')
        # print(arr_offset[:,0])
        # print('\n')
        # print(arr_offset[:,1])
        # print('\n')
        # print(doc_enc_labels)
        # print('\n')
        # print(doc_labels)
        # print('\n')
        # num += 1
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        # print(doc_enc_labels)
        encoded_labels.append(doc_enc_labels.tolist())
    
        # print(num)
    return encoded_labels

In [10]:
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [11]:
print(train_labels[0])

[-100, 5, 5, 20, -100, -100, 5, -100, -100, 15, -100, 5, 21, -100, -100, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]


In [12]:
from torch.utils.data import Dataset

class KeywordDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = KeywordDataset(train_encodings, train_labels)
val_dataset = KeywordDataset(val_encodings, val_labels)

In [13]:
print(train_dataset[0])
print(val_dataset[0])

{'input_ids': tensor([  101,  5113,  1106,  1745,  2822,  1306,  2393,  1818,  1830,  1155,
        26949,  1113,  1301,  8032,  1513,  1390,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor([-100,    5,    5,   20, -100, -100,    5, -100, -100,   15, -100,    5,
          21, -100, -100,    2, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100])}
{'input_ids': tensor([  101,  1169,  1128,  1508,  1113,  1176,   170, 10610,  1118,   185,
        18318,  1657,  1468,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]), 'labels': 

In [14]:
from transformers import DistilBertForTokenClassification
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

In [15]:
print(len(unique_tags))

22


In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./Keyword_model',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./keyword_logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

2022-05-24 16:06:42.132352: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
***** Running training *****
  Num examples = 3760
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 705


Step,Training Loss
10,3.0353
20,3.0094
30,2.9297
40,2.8112
50,2.5984
60,2.3368
70,2.1175
80,1.8024
90,1.6455
100,1.5036


Saving model checkpoint to ./Keyword_model/checkpoint-500
Configuration saved in ./Keyword_model/checkpoint-500/config.json
Model weights saved in ./Keyword_model/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=705, training_loss=0.5947291424511172, metrics={'train_runtime': 17.1843, 'train_samples_per_second': 656.411, 'train_steps_per_second': 41.026, 'total_flos': 97902728968320.0, 'train_loss': 0.5947291424511172, 'epoch': 3.0})

In [17]:
torch.save(model.state_dict(), './Keyword_model/keyword_model.pt')

In [16]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        print(batch)
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

# torch.save(model.state_dict(), './Keyword_model/keyword_model.pt')

2022-06-08 02:09:36.074643: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized fr

{'input_ids': tensor([[  101,  1437,  1143,  1103,  4719,  2044,  3504,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1525,   170,  5945,  1270,  1103,  1520,  1104,   175, 21383,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1505,  1199,  1207,  1425,  1390,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1505,  1103,  1461,  5837,  1231,  2728, 27377,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`

In [21]:
sequence = ("I want to know about news articles about the outcome of the Korean election.")

device = torch.device('cuda')

inputs = tokenizer(sequence, return_tensors = "pt").to(device)
tokens = inputs.tokens()

outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2)

In [None]:
print(type(tokens))
print(type(predictions[0].numpy()))
print(predictions[0].numpy())
print(predictions[0].numpy().tolist())

In [24]:
keyword = ''
keyword_start_pos = 0
keyword_end_pos = 0
id_tags = predictions[0].cpu().numpy().tolist()
for i in range(len(tokens)):
    token = tokens[i]
    tag = id2tag[id_tags[i]]
    if tag != 'O':
        keyword_start_pos = i
        break
    
for i in range(len(tokens)-1,0,-1):
    token = tokens[i]
    tag = id2tag[id_tags[i]]
    if token != '[SEP]':
        if tag != 'O':
            keyword_end_pos = i
            break

for i in range(keyword_start_pos,keyword_end_pos+1):
    if i == keyword_start_pos:
        keyword = tokens[i]
    else:
        if '##' in tokens[i]:
            token = tokens[i].replace('##','')
            keyword += token
        else:
            token = ' ' + tokens[i]
            keyword += token

print(keyword)

news articles about the outcome of the Korean election


In [None]:
for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, id2tag[prediction]))