## Semantic Role Labelling

In [None]:
!pip install transformers -q
!pip install evaluate -q
!pip install seqeval -q
!pip install datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
from sklearn.metrics import f1_score
from datasets import Dataset
import evaluate
from transformers import BertForSequenceClassification, BertTokenizer, BertModel, BertForTokenClassification, AdamW, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, pipeline


In [None]:
!wget https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/train.tsv
!wget https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/test_no_answers.tsv
!wget https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/dev_no_answers.tsv

--2023-01-06 07:42:52--  https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 599430 (585K) [text/plain]
Saving to: ‘train.tsv’


2023-01-06 07:42:53 (17.7 MB/s) - ‘train.tsv’ saved [599430/599430]

--2023-01-06 07:42:53--  https://raw.githubusercontent.com/s-nlp/semantic-role-labelling/main/test_no_answers.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58177 (57K) [text/plain]
Saving to: ‘test_no_answers.tsv’


2023-01-06 07:42:53 (7.08 MB/s) - ‘test_n

### Preprocessing 

In [None]:
def prepare_sequence(seq, to_ix):
    """Translate every item of ``seq`` into the index stored for it in ``to_ix``.

    Returns a new list with one index per item, in the original order. An item
    missing from ``to_ix`` raises whatever ``to_ix[item]`` raises (``KeyError``
    for a plain dict).
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return indices

In [None]:
def read_dataset(filename, splitter="\t"):
    r"""Read a one-token-per-line file into a list of ``(words, tags)`` pairs.

    Every non-blank line is split on ``splitter`` into exactly two fields: the
    word and its tag (the tag is stripped of surrounding whitespace). A blank
    line closes the current sentence.

    With ``splitter="\n"`` a line such as ``"word\n"`` splits into the word and
    an empty tag, which is how the notebook reads the files without answers.

    Args:
        filename: path of the file to read (decoded as UTF-8).
        splitter: separator between the word and the tag on each line.

    Returns:
        A list with one ``(words, tags)`` tuple per sentence, where ``words``
        and ``tags`` are parallel lists of strings.

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    """
    data = []
    sentence = []
    tags = []
    with open(filename, encoding="utf-8") as f:
        for line in f:
            if not line.isspace():
                word, tag = line.split(splitter)
                sentence.append(word)
                tags.append(tag.strip())
            else:
                data.append((sentence, tags))
                sentence = []
                tags = []
    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last sentence would be silently dropped.
    if sentence:
        data.append((sentence, tags))
    return data

In [None]:
# Load the tab-separated training file as a list of (words, tags) sentence pairs.
training_data = read_dataset("train.tsv")

In [None]:
# Note: despite its name, `test_data` holds the *dev* file.
# With splitter="\n" every line "word\n" splits into the word and an empty tag,
# so the tags of these sentences are empty strings.
# NOTE(review): presumably the file has one bare token per line (no tag column) — confirm.
test_data = read_dataset("dev_no_answers.tsv", splitter="\n")

In [None]:
# Vocabulary: each distinct word gets the next free index in order of first
# appearance, scanning the training sentences first and `test_data` second.
word_to_ix = {}
for dataset in (training_data, test_data):
    for words, _tags in dataset:
        for token in words:
            # setdefault leaves already-known words untouched.
            word_to_ix.setdefault(token, len(word_to_ix))

# BIO tag inventory; a tag's position in this list is its integer id.
tag_names = [
    "O",
    "B-Object",
    "I-Object",
    "B-Aspect",
    "I-Aspect",
    "B-Predicate",
    "I-Predicate",
]
tag_to_ix = {name: index for index, name in enumerate(tag_names)}

# Inverse mapping: integer id -> tag name.
idx_to_tag = {index: name for name, index in tag_to_ix.items()}

### Data processing

In [None]:
# Checkpoint identifier passed to both the tokenizer and the model loaders below.
model_path = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    ``examples["tokens"]`` holds the word lists and ``examples["ner_tags"]`` the
    matching per-word label ids. Only the first token of each word receives the
    word's label; special tokens and the remaining tokens of a word get -100.

    Returns the tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        # word_ids maps each token to the index of its source word (None for specials).
        for word_idx in encoded.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            token_labels.append(word_labels[word_idx] if starts_new_word else -100)
            last_word = word_idx
        aligned.append(token_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Word lists of the training sentences (first element of every pair).
X_train = [words for words, _ in training_data]

In [None]:
# Per-sentence lists of integer tag ids, looked up in `tag_to_ix`.
y_train = [
    [tag_to_ix[tag] for tag in sentence_tags]
    for _, sentence_tags in training_data
]

In [None]:
# Column lists for the dataset: words, tag ids and a string id per sentence.
# zip() keeps only positions present in both X_train and y_train.
sentence_pairs = list(zip(X_train, y_train))
train_tokens = [words for words, _ in sentence_pairs]
train_ner_tags = [tag_ids for _, tag_ids in sentence_pairs]
train_id = [str(position) for position in range(len(sentence_pairs))]

In [None]:
# Assemble the columns first, then wrap them in a `Dataset`.
train_columns = {
    'labels': y_train,
    'tokens': train_tokens,
    'ner_tags': train_ner_tags,
    'id': train_id,
}
ds = Dataset.from_dict(train_columns)

In [None]:
# Hold out 20% of the sentences for validation. The fixed seed makes the
# shuffle — and therefore the reported validation metrics — reproducible
# across "Restart & Run All".
ds_split = ds.train_test_split(test_size=0.2, seed=42)

In [None]:
# Run `tokenize_and_align_labels` over the split dataset, one batch of examples per call.
data = ds_split.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Tag names ordered by their integer id, so that label_list[i] is the name of
# tag i. Indexing by id (instead of relying on the dict's insertion order)
# keeps the list correct even if `idx_to_tag` is ever built in another order.
label_list = [idx_to_tag[index] for index in range(len(idx_to_tag))]

### Model

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cpu')

In [None]:
# Load the "seqeval" metric through `evaluate`; `compute_metrics` calls its `compute` method.
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Score a batch of predictions with seqeval.

    ``p`` unpacks into ``(predictions, labels)``. The predicted class of every
    token is the argmax over axis 2 of ``predictions``. Positions whose gold
    label is -100 are dropped, the remaining ids are mapped to tag names through
    ``label_list``, and the overall precision, recall, F1 and accuracy reported
    by seqeval are returned in a dict.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    metrics = {}
    metrics["precision"] = results["overall_precision"]
    metrics["recall"] = results["overall_recall"]
    metrics["f1"] = results["overall_f1"]
    metrics["accuracy"] = results["overall_accuracy"]
    return metrics

In [None]:
# Load the checkpoint for token classification with one output label per tag in
# `tag_to_ix`, and hand the id<->tag-name mappings to the model.
# The classifier weights are newly initialized (see the warning in the cell
# output), so the model has to be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    model_path, num_labels=len(tag_to_ix), id2label=idx_to_tag, label2id=tag_to_ix
)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training ends.
# NOTE(review): `evaluation_strategy` is the argument name of the transformers
# release this notebook was run with; newer releases may expect a different name.
training_args = TrainingArguments(
    output_dir="model_big",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Without these two settings the "best" checkpoint is the one with the
    # lowest validation loss, whereas the notebook scores the model with the
    # seqeval F1 returned by `compute_metrics` (key "f1").
    metric_for_best_model="f1",
    greater_is_better=True,
    # Limit how many of the per-epoch checkpoints are kept on disk.
    save_total_limit=2,
)
# Collator that batches the tokenized examples for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)



In [None]:
# Wire the model, the training arguments, the two parts of the split
# ("train" for training, "test" for evaluation), the tokenizer, the collator
# and the metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    eval_dataset=data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the last expression of the cell, the returned
# TrainOutput is displayed below it.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1867
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1170
  Number of trainable parameters = 66368263
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.221171,0.840356,0.797423,0.818327,0.929801
2,No log,0.179818,0.834121,0.862283,0.847969,0.939646
3,No log,0.178684,0.818144,0.885384,0.850437,0.940502
4,No log,0.175162,0.837385,0.887606,0.861764,0.945039
5,0.213300,0.184407,0.847104,0.883607,0.864971,0.946323
6,0.213300,0.197273,0.833613,0.883607,0.857882,0.943584
7,0.213300,0.209552,0.84985,0.880053,0.864688,0.946066
8,0.213300,0.218294,0.831729,0.882719,0.856466,0.943156
9,0.061600,0.221641,0.85479,0.876055,0.865292,0.946152
10,0.061600,0.222553,0.84985,0.880053,0.864688,0.945981


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 467
  Batch size = 16
Saving model checkpoint to model_big/checkpoint-117
Configuration saved in model_big/checkpoint-117/config.json
Model weights saved in model_big/checkpoint-117/pytorch_model.bin
tokenizer config file saved in model_big/checkpoint-117/tokenizer_config.json
Special tokens file saved in model_big/checkpoint-117/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
****

TrainOutput(global_step=1170, training_loss=0.12349045256264189, metrics={'train_runtime': 5873.7164, 'train_samples_per_second': 3.179, 'train_steps_per_second': 0.199, 'total_flos': 269067126084108.0, 'train_loss': 0.12349045256264189, 'epoch': 10.0})

### Inference

In [None]:
# Build a "ner" pipeline around the fine-tuned model and its tokenizer.
classifier = pipeline("ner", tokenizer=tokenizer, model=model)

In [None]:
# Load the test file as (words, tags) pairs. With splitter="\n" every line
# "word\n" splits into the word and an empty tag, so the tags are empty strings.
# NOTE(review): presumably the file has one bare token per line — confirm.
final_test = read_dataset("test_no_answers.tsv", splitter="\n")

In [None]:
# Tag every test sentence and write one "word<TAB>tag" line per word, followed
# by a blank line after each sentence.
#
# Each sentence is given to the model as a whole (`is_split_into_words=True`),
# the same way the training examples were encoded. Passing the list of words to
# the "ner" pipeline instead makes it treat every word as a separate text, so
# the model would predict each word without its sentence context.
model.eval()
with open("out_test.tsv", "w", encoding="utf-8") as out_file:
    for words, _unused_tags in tqdm(final_test):
        # Default tag for words that end up without a prediction (for example
        # words cut off by truncation).
        tags = ["O"] * len(words)

        if words:
            encoding = tokenizer(
                words, is_split_into_words=True, truncation=True, return_tensors="pt"
            )
            # Index of the source word for every token (None for special tokens).
            word_ids = encoding.word_ids(batch_index=0)
            with torch.no_grad():
                logits = model(**encoding.to(model.device)).logits[0]
            predicted_ids = logits.argmax(dim=-1).tolist()

            # As in training, a word takes the label predicted for its first token.
            previous_word_idx = None
            for position, word_idx in enumerate(word_ids):
                if word_idx is not None and word_idx != previous_word_idx:
                    tags[word_idx] = model.config.id2label[predicted_ids[position]]
                previous_word_idx = word_idx

        for word, tag in zip(words, tags):
            out_file.write(f"{word}\t{tag}\n")
        out_file.write("\n")


100%|██████████| 360/360 [06:01<00:00,  1.00s/it]


In [None]:
# Compress the predictions file into out_test.zip.
!zip out_test.zip out_test.tsv

updating: out_test.tsv (deflated 75%)
