In [1]:
import re
def load_the_data(data_set):
    """Read one CoNLL-style split and return its sentences as (token, tag) pairs.

    The file ``train_dev/en-<data_set>.conll`` is cut into chunks on blank
    lines. The first two lines of every chunk are discarded; the code assumes
    they are the empty string left over from the separator plus a sentence
    header line (TODO confirm against the data files). Every remaining
    non-blank line must look like ``<token> _ _ <tag>``.

    Args:
        data_set: Split name inserted into the file name, e.g. ``'dev'`` or
            ``'train'``.

    Returns:
        A list with one entry per chunk. Each entry is a list of
        ``(token, tag)`` tuples. A chunk that contains no token lines (for
        example the one produced by a trailing blank line at the end of the
        file) yields an empty list, so callers may see empty sentences.

    Raises:
        ValueError: If a non-blank token line does not match
            ``<token> _ _ <tag>``.
    """
    path = "train_dev/en-" + data_set + ".conll"

    # Explicit encoding so parsing does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        chunks = f.read().split('\n\n')

    snt = []
    for chunk in chunks:
        ow = []
        # [2:] drops the two leading lines of the chunk (see docstring).
        for word in chunk.split('\n')[2:]:
            if not word.strip():
                # A stray blank line (e.g. a file that ends with a single
                # newline) carries no token; skip it instead of crashing on
                # `None.groups()`.
                continue
            mtch = re.match(r"(.*) _ _ (.*)", word)
            if mtch is None:
                raise ValueError(
                    "Malformed line in " + path + ": " + repr(word)
                    + " (expected '<token> _ _ <tag>')"
                )
            ow.append(mtch.groups())
        snt.append(ow)

    return snt

    
    


In [2]:
# Parse both annotated splits into lists of (token, tag) sentences.
dev_set = load_the_data("dev")
training_set = load_the_data("train")

In [3]:
class NERDataMaker:
    """Turn (token, entity) sentences into integer-tagged NER examples.

    Attributes:
        unique_entities: Every entity label found in the input, sorted
            alphabetically except that ``"O"`` sorts first (so it gets id 0
            whenever it is present).
        processed_texts: One list per input sentence holding
            ``(token, label_id)`` tuples, where ``label_id`` is the position
            of the label in ``unique_entities``.
    """

    def __init__(self, texts):
        """Collect the label set of ``texts`` and encode every label as an id.

        Args:
            texts: Iterable of sentences; each sentence is a re-iterable
                sequence of ``(token, entity_label)`` pairs. Sentences may be
                empty.
        """
        # Materialise once so a one-shot iterable can be walked twice below.
        texts = list(texts)

        # Dict keys keep first-seen order and give O(1) membership checks.
        seen = {}
        for text in texts:
            for _, ent in text:
                seen.setdefault(ent, None)

        # "O" is mapped to the empty string so it sorts ahead of all labels.
        self.unique_entities = sorted(seen, key=lambda ent: ent if ent != "O" else "")

        entity_to_id = {ent: i for i, ent in enumerate(self.unique_entities)}
        self.processed_texts = [
            [(t, entity_to_id[ent]) for t, ent in text] for text in texts
        ]

    # @property lets a method be read like an attribute.
    @property
    def id2label(self):
        """Mapping from label id to label string."""
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        """Mapping from label string to label id."""
        return {v: k for k, v in self.id2label.items()}

    def __len__(self):
        """Number of sentences, including empty ones."""
        return len(self.processed_texts)

    @staticmethod
    def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
        """Split one encoded sentence into parallel token / tag lists."""
        ner_tags = []
        tokens = []
        for t, ent in tokens_with_encoded_entities:
            ner_tags.append(ent)
            tokens.append(t)
        return {
            'id': id,
            'ner_tags': ner_tags,
            'tokens': tokens
        }

    def __getitem__(self, idx):
        """Return one example dict, or a list of them for a slice.

        Args:
            idx: An integer position or a ``slice``.

        Returns:
            For an integer: ``{'id': idx, 'ner_tags': [...], 'tokens': [...]}``.
            For a slice: a list of such dicts whose ``'id'`` is the position
            of each sentence in ``processed_texts``.

        Raises:
            IndexError: If an integer ``idx`` is out of range.
        """
        if isinstance(idx, slice):
            # slice.indices resolves omitted/negative bounds and the step, so
            # open-ended slices such as dm[:3] work and ids stay correct.
            positions = range(*idx.indices(len(self.processed_texts)))
            return [
                self._process_tokens_for_one_text(i, self.processed_texts[i])
                for i in positions
            ]

        return self._process_tokens_for_one_text(idx, self.processed_texts[idx])

    def as_hf_dataset(self, tokenizer, max_length=30):
        """Build a tokenized ``datasets.Dataset`` with word-aligned labels.

        Empty sentences are skipped. Each remaining sentence keeps its
        position in ``processed_texts`` as its ``id``.

        Args:
            tokenizer: Tokenizer called with ``is_split_into_words=True``; its
                output must provide ``word_ids(batch_index=...)``.
            max_length: Length every example is padded/truncated to.
                Defaults to 30.

        Returns:
            The dataset produced by ``Dataset.map`` with the tokenizer output
            and a ``labels`` column added to ``id``, ``ner_tags`` and
            ``tokens``.
        """
        from datasets import Dataset, Features, Value, ClassLabel, Sequence

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            if not pt:
                # Nothing to tokenize (e.g. the empty trailing sentence that a
                # file ending in a blank line produces).
                continue
            ids.append(i)
            tokens.append([t for t, _ in pt])
            ner_tags.append([tag for _, tag in pt])

        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }

        features = Features({
            "tokens": Sequence(Value("string")),
            # Use this instance's own labels, not those of a global object.
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })

        ds = Dataset.from_dict(data, features=features)

        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, padding='max_length',
                                         max_length=max_length, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    if word_idx is None:
                        # Special/padding tokens get -100.
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        # Only the first sub-token of a word carries its label.
                        label_ids.append(label[word_idx])
                    else:
                        # Later sub-tokens of the same word get -100 as well.
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

    
    

In [7]:
from transformers import BertTokenizerFast, BertModel


tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Training split: build the label set and the tokenized dataset.
dm=NERDataMaker(training_set)
hf_dm_train=dm.as_hf_dataset(tokenizer=tokenizer)

# Validation split. NOTE(review): dm_val derives its label ids from dev_set
# alone, so they agree with dm's ids only if both splits contain the same tags.
dm_val=NERDataMaker(dev_set)
hf_dm_val=dm_val.as_hf_dataset(tokenizer=tokenizer)
hf_dm_val

  0%|          | 0/17 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'ner_tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 871
})

In [9]:
from datasets import DatasetDict
# Display the summary (columns and row count) of the tokenized training split.
hf_dm_train

Dataset({
    features: ['id', 'ner_tags', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 16778
})

In [6]:
from datasets import load_dataset
# Load the 'PAN-X.de' configuration of the 'xtreme' dataset (downloaded on first use, cached afterwards).
# NOTE(review): `ds` is not referenced by any later cell shown in this notebook.
ds=load_dataset('xtreme', name='PAN-X.de')

Downloading builder script:   0%|          | 0.00/37.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/593k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/103k [00:00<?, ?B/s]

Downloading and preparing dataset xtreme/PAN-X.de to /Users/joesh/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /Users/joesh/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [7]:
#from transformers import DataCollatorForTokenClassification, BertModel, TrainingArguments, Trainer
#data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
#model_config = BertModel.from_pretrained("bert-base-cased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)

In [8]:
import torch.nn as nn
from transformers import BertConfig
from transformers.models.bert import BertPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput

class BertModelForTokenClassifaction(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear layer that scores each token.

    The encoder is built without its pooling layer; the head maps every
    position's hidden state to ``config.num_labels`` logits.
    """

    # Must be a configuration class, not a model class: the base class uses it
    # to load/validate the configuration (e.g. when `from_pretrained` is called
    # without an explicit `config`).
    config_class = BertConfig

    def __init__(self, config):
        """Create the encoder body and the classification head from ``config``.

        Args:
            config: Configuration providing ``num_labels``,
                ``hidden_dropout_prob`` and ``hidden_size``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load model body
        self.bert = BertModel(config, add_pooling_layer=False)
        # Set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Load and initialize weights
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Score every token and, when ``labels`` is given, compute the loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
            token_type_ids: Optional segment ids forwarded to the encoder.
            labels: Optional target label ids; flattened together with the
                logits for ``nn.CrossEntropyLoss``.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            ``TokenClassifierOutput`` carrying ``loss`` (``None`` without
            labels), ``logits`` and the encoder's ``hidden_states`` and
            ``attentions``.
        """
        # Use model body to get encoder representations
        outputs = self.bert(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, **kwargs)
        # Apply classifier to encoder representation
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        # Calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Flatten to (positions, num_labels) vs (positions,) for the loss.
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # Return model output object
        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=outputs.hidden_states,
                                     attentions=outputs.attentions)
    

In [9]:
import torch
from transformers import AutoTokenizer

from transformers import AutoConfig

# Name of the pretrained checkpoint the configuration is loaded from.
bert_model_name = "bert-base-cased"

# Use the MPS backend when PyTorch reports it as available, else the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Configuration carrying the label set collected from the training data.
bert_config = AutoConfig.from_pretrained(
    bert_model_name,
    num_labels=len(dm.unique_entities),
    id2label=dm.id2label,
    label2id=dm.label2id,
)



In [10]:
# Instantiate the token classifier from the pretrained checkpoint, then move it
# to the selected device. (The variable is named "xlmr" although
# `bert_model_name` is the checkpoint being loaded.)
xlmr_model = BertModelForTokenClassifaction.from_pretrained(
    bert_model_name, config=bert_config
)
xlmr_model = xlmr_model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModelForTokenClassifaction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModelForTokenClassifaction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelForTokenClassifaction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModelForTokenClassif

In [11]:
# Smoke test: run the first three tokenized training examples through the
# model and keep the per-token logits. Uses `hf_dm_train`, the dataset this
# notebook actually defines (`hf_dm` is not defined in any cell).
outputs=xlmr_model(torch.tensor(hf_dm_train['input_ids'][0:3]).to(device)).logits

In [12]:
# Highest-scoring index along the last axis of the logits, per token.
predictions = outputs.argmax(dim=-1)
print(f"Shape of outputs: {outputs.shape}")






Shape of outputs: torch.Size([3, 30, 67])


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Hyper-parameters for the run; evaluation happens once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=30,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    evaluation_strategy="epoch",
)

# Tokenized training split; it is passed below as both train and eval data.
train_ds = dm.as_hf_dataset(tokenizer=tokenizer)

trainer = Trainer(
    model=xlmr_model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=train_ds, # eval on training set! ONLY for DEMO!!
)

trainer.train()

  0%|          | 0/17 [00:00<?, ?ba/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The following columns in the training set don't have a corresponding argument in `BertModelForTokenClassifaction.forward` and have been ignored: tokens, id, ner_tags. If tokens, id, ner_tags are not expected by `BertModelForTokenClassifaction.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16778
  Num Epochs = 30
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 3960
  Number of trainable parameters = 107771203
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mjoesharratt1229[0m ([33mjoes_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
