# Description

### Imports

In [1]:
import os
import sys
from pathlib import Path
import re
from glob import glob
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertForTokenClassification, DistilBertForTokenClassification
from transformers import BertTokenizerFast, DistilBertTokenizerFast
from transformers import AdamW
from sklearn.metrics import confusion_matrix, classification_report

### Variables

In [2]:
# Paths
# NOTE(review): these are absolute, machine-specific locations; adjust them
# before running the notebook on another machine.
WHD_BERT_DATA = "C:/Users/jseal/Dev/dissertation/Data/WikipediaHomographData/data/bert_data/"
BERT_TRAIN = WHD_BERT_DATA + "train/"
BERT_VAL = WHD_BERT_DATA + "valid/"
BERT_TEST = WHD_BERT_DATA + "test/"
ALL_SPLITS = [BERT_TRAIN, BERT_VAL, BERT_TEST]

# Intermediate files written by the (currently disabled) preprocessing cells
TRAIN_TMP = WHD_BERT_DATA + "train.txt.tmp"
VAL_TMP = WHD_BERT_DATA + "val.txt.tmp"
TEST_TMP = WHD_BERT_DATA + "test.txt.tmp"
TMPS = [TRAIN_TMP, VAL_TMP, TEST_TMP]

# Final token<TAB>tag files consumed by read_set
TRAIN_TXT = WHD_BERT_DATA + "train.txt"
VAL_TXT = WHD_BERT_DATA + "val.txt"
TEST_TXT = WHD_BERT_DATA + "test.txt"
OUTS = [TRAIN_TXT, VAL_TXT, TEST_TXT]

# Materialised as a list: a bare zip() is a one-shot iterator and would be
# silently empty the second time a cell loops over it.
TMPS_OUTS = list(zip(TMPS, OUTS))

NER = "C:/Users/jseal/Dev/dissertation/transformers/examples/token-classification/"
RUN_NER = NER + "run_ner.py"

PREPROCESS = NER + "scripts/preprocess.py"

#Labels file
LABELS = WHD_BERT_DATA + "labels.txt"

# Model Variables
MAX_LENGTH = 128 #@param {type: "integer"}
OUTPUT_DIR = "C:/Users/jseal/Dev/dissertation/Models/Baselines/WHD_Bert/whd-model" #@param ["spanberta-ner", "bert-base-ml-ner"]
BATCH_SIZE = 16 #@param {type: "integer"}
NUM_EPOCHS = 3 #@param {type: "integer"}
SAVE_STEPS = 100 #@param {type: "integer"}
LOGGING_STEPS = 100 #@param {type: "integer"}
SEED = 42 #@param {type: "integer"}

# Actually apply the seed so that torch-driven randomness (weight
# initialisation, DataLoader shuffling) is repeatable across fresh runs.
torch.manual_seed(SEED)

MODEL_NAME = "distilbert-base-cased"
# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda:0


### Functions

In [3]:
def read_set(file_path):
    """Read a ``token<TAB>tag`` file and group its lines into documents.

    Documents are separated by a blank line (optionally holding a single
    tab). Every other line must be exactly ``token<TAB>tag``.

    Args:
        file_path: Path (str or ``Path``) of the utf8-encoded file to read.

    Returns:
        A ``(token_docs, tag_docs)`` tuple of parallel lists; element ``i`` of
        each is the list of tokens / tags of document ``i``. Both lists are
        empty when the file contains no data.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    file_path = Path(file_path)

    raw_text = file_path.read_text(encoding="utf8").strip()
    if not raw_text:
        # Empty / whitespace-only file: no documents rather than an unpack error
        return [], []

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if not line.strip():
                # Runs of 3+ newlines leave a stray blank line inside a chunk
                continue
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        if tokens:
            # Skip chunks that held only blank lines
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

In [4]:
def encode_tags(tags, encodings, tag_map=None):
    """Align word-level tags with the sub-token positions of ``encodings``.

    For every document an array filled with -100 (the default
    ``ignore_index`` of ``torch.nn.CrossEntropyLoss``) is created, one entry
    per offset pair. Positions whose offset starts at 0 and ends at a non-zero
    value receive the word labels in order (presumably the first sub-token of
    each word when tokenizing with ``is_split_into_words=True`` -- verify for
    other tokenizers).

    Args:
        tags: List of documents, each a list of tag strings.
        encodings: Object exposing ``offset_mapping``: per document, a
            sequence of ``(start, end)`` pairs.
        tag_map: Optional mapping from tag string to integer id. Defaults to
            the notebook-level ``tag2id``.

    Returns:
        A list (one entry per document) of lists of int labels.

    Raises:
        KeyError: If a tag is missing from the mapping.
        ValueError: If a document has more labelled positions than tags.
    """
    if tag_map is None:
        tag_map = tag2id

    encoded_labels = []
    for doc_tags, doc_offset in zip(tags, encodings.offset_mapping):
        doc_labels = [tag_map[tag] for tag in doc_tags]

        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # positions whose first offset is 0 and whose second is not 0
        word_starts = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_slots = len(doc_enc_labels[word_starts])

        if n_slots > len(doc_labels):
            raise ValueError(
                "document has %d labelled positions but only %d tags"
                % (n_slots, len(doc_labels))
            )

        # With truncation the tail words are cut off, leaving fewer slots than
        # tags; keep only the labels of the words that survived.
        doc_enc_labels[word_starts] = doc_labels[:n_slots]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [5]:
class WHDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-position label lists.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` holds one label sequence per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Every field stored in the encodings is converted to a tensor
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [6]:
def multi_acc(y_pred, y_test):
    """Return classification accuracy as a whole-number percentage.

    Args:
        y_pred: Tensor of class scores with the classes along dim 1.
        y_test: Tensor of target class indices, compared element-wise with the
            arg-max of ``y_pred`` (assumes a 1-D target matching the first
            dimension of ``y_pred`` -- TODO confirm at call sites).

    Returns:
        A 0-dim tensor holding the accuracy in percent, rounded to the
        nearest integer value.
    """
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)

    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)

    # Scale to percent BEFORE rounding; rounding the raw fraction first
    # collapses every result to either 0 or 100.
    acc = torch.round(acc * 100)

    return acc

# Script

In [7]:
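# #Preprocessing (cell disabled; kept for reference)
# # Write temporary train, val, and test txt files
# # NOTE(review): as written, the nested loops reopen each tmp file in 'w' mode
# # once per split, so every tmp file ends up holding only the last split.
# # If this cell is re-enabled, pair the lists instead, e.g.
# # `for tmp, split_path in zip(TMPS, ALL_SPLITS):`.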
# for tmp in TMPS:
#     for split_path in ALL_SPLITS:
#         with open(tmp, 'w', encoding="utf8") as f_out: 
#             for f in glob(split_path + "*"):
#                 with open(f, encoding="utf8") as example:
#                     lines = example.readlines()
#                     for line in lines: 
#                         line_list = line.split('\t')
#                         f_out.write(line_list[1] + '\t' + line_list[2])
#                 f_out.write('\n')
                

In [8]:
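# NOTE(review): cell disabled; kept for reference. If re-enabled, be aware that
# `MAX_LENGTH -= ...` below is not idempotent: each execution of the cell
# shrinks MAX_LENGTH again.
# subword_len_counter = 0
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# MAX_LENGTH -= tokenizer.num_special_tokens_to_add()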

# for tmp, outfile in TMPS_OUTS:
#     with open(tmp, "r", encoding="utf8") as f_p:
#         with open(outfile, "w", encoding="utf8") as out_f: 
#             for line in f_p:
#                 line = line.rstrip()

#                 if not line:
#                     out_f.write(line +"\n")
#                     subword_len_counter = 0
#                     continue

#                 token = line.split()[0]

#                 current_subwords_len = len(tokenizer.tokenize(token))

#                 # Token contains strange control characters like \x96 or \x95
#                 # Just filter out the complete line
#                 if current_subwords_len == 0:
#                     continue

#                 if (subword_len_counter + current_subwords_len) > MAX_LENGTH:
#                     out_f.write("\n")
#                     out_f.write(line +"\n")
#                     subword_len_counter = current_subwords_len
#                     continue

#                 subword_len_counter += current_subwords_len

#                 out_f.write(line + "\n")


In [9]:
# Parse the train / validation / test files into (tokens, tags) document lists
(train_texts, train_tags), (val_texts, val_tags), (test_texts, test_tags) = (
    read_set(split_file) for split_file in (TRAIN_TXT, VAL_TXT, TEST_TXT)
)

In [10]:
# One label per line; the line order determines each tag's integer id.
# The file is opened in a context manager so the handle is closed, and read
# as utf8 like the other data files in this notebook.
with open(LABELS, encoding="utf8") as labels_file:
    unique_tags = [label.strip("\n") for label in labels_file]

# Forward (tag -> id) and reverse (id -> tag) lookups
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [11]:
# Load the tokenizer from MODEL_NAME so it always matches the checkpoint the
# model cell loads, instead of repeating the checkpoint name as a literal.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Inputs are already split into words; the offset mappings are requested
# because encode_tags reads them to place the labels.
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [12]:
# Turn the word-level tags of each split into per-position label ids
train_labels, val_labels = (
    encode_tags(split_tags, split_encodings)
    for split_tags, split_encodings in (
        (train_tags, train_encodings),
        (val_tags, val_encodings),
    )
)

In [13]:
# The offset_mapping entries are not popped here (the original pop calls were
# left disabled), so they stay in the encodings and every WHDataset item
# carries them; the training loop only reads input_ids, attention_mask and labels.
train_dataset, val_dataset = (
    WHDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

In [None]:
# Model: DistilBERT with a token-classification head sized to the label set
model = DistilBertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(unique_tags))
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

print_every = 10  # number of mini-batches averaged into each printed loss

# The epoch count comes from the config cell instead of a repeated literal
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for i, batch in enumerate(train_loader, 0):
        optim.zero_grad()
        # Only these three fields are forwarded to the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # first output is the loss when labels are supplied

        # print statistics
        running_loss += loss.item()
        if i % print_every == print_every - 1:    # print every 10 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / print_every))
            running_loss = 0.0

        loss.backward()
        optim.step()

# Switch to inference mode once training has finished
model.eval()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

[1,    10] loss: 3.315
[1,    20] loss: 0.660
[1,    30] loss: 0.637
[1,    40] loss: 0.644
[1,    50] loss: 0.635
[1,    60] loss: 0.616
[1,    70] loss: 0.625
[1,    80] loss: 0.618
[1,    90] loss: 0.592
[1,   100] loss: 0.617
[2,    10] loss: 0.574
[2,    20] loss: 0.571
[2,    30] loss: 0.538
[2,    40] loss: 0.517
