In [None]:
# install huggingface and datasets
!pip install -q datasets transformers
!pip install torch
!pip install pandas
!pip install pyarrow



In [None]:
import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, ClassLabel
from datasets.features import ClassLabel
from sklearn import metrics
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import BertForTokenClassification, BertTokenizer, TrainingArguments, Trainer

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so that
# `device` is always a usable target (the availability check used to be discarded).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Mount Google Drive into the Colab filesystem so the data files under
# /content/drive are readable from this notebook.
from google.colab import drive
drive.mount('/content/drive')
# Sanity check: list the dataset folders inside the v2.1 data directory.
!ls '/content/drive/MyDrive/e_ML4NLP2/v2.1/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ajmc  hipe2020	letemps  newseye  sonar  topres19th


In [None]:
# Locations of the train / dev / test TSV files (NewsEye, Swedish) on the mounted Drive.
train_path, dev_path, test_path = (
    '/content/drive/MyDrive/e_ML4NLP2/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-train-sv.tsv',
    '/content/drive/MyDrive/e_ML4NLP2/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-dev-sv.tsv',
    '/content/drive/MyDrive/e_ML4NLP2/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-test_allmasked-sv.tsv',
)

# Checkpoint to fine-tune. Alternatives that were tried are kept for reference:
#   "KB/bert-base-swedish-cased"
#   'dbmdz/bert-base-swedish-europeana-cased'
#   'dbmdz/bert-base-historic-multilingual-cased'
#   'setu4993/LaBSE'
model = 'jonfd/electra-small-nordic'

In [None]:
# import dataset from cloned git repo
def load_dataset(path):
    """Read a tab-separated annotation file into a pandas DataFrame.

    Note: this definition shadows the `load_dataset` imported from `datasets`
    in the import cell.

    Args:
        path: Location of the TSV file.

    Returns:
        pandas.DataFrame with one row per input line. Blank lines are kept
        (as all-NaN rows) and quote characters are treated as ordinary text.
    """
    return pd.read_csv(
        path,
        engine='python',
        sep='\t',
        quoting=3,  # 3 == csv.QUOTE_NONE: never interpret '"' as a quoting character
        skip_blank_lines=False,
    )


# Read the three splits into raw DataFrames in one pass.
tsv_train, tsv_dev, tsv_test = (
    load_dataset(split_path) for split_path in (train_path, dev_path, test_path)
)

In [None]:
def simple_preprocess(dataframe):
    """Remove non-token rows and normalise the MISC column.

    A row is dropped when its TOKEN is missing, or when the string form of its
    TOKEN starts with '#' (metadata lines) or with a tab character. Missing
    MISC values are replaced by the empty string so that later string searches
    on MISC never see a float NaN.

    Args:
        dataframe: DataFrame with at least the columns 'TOKEN' and 'MISC'.

    Returns:
        A new DataFrame containing only the kept rows, with their original
        index labels. The input frame is left unmodified.
    """
    token_text = dataframe['TOKEN'].astype(str)
    keep = (
        dataframe['TOKEN'].notna()
        & ~token_text.str.startswith('#')
        & ~token_text.str.startswith('\t')
    )
    # `.assign` builds a fresh frame, so no column is ever set on a slice of the
    # caller's data (which used to trigger pandas' SettingWithCopyWarning).
    return dataframe.loc[keep].assign(MISC=lambda kept: kept['MISC'].fillna(''))

In [None]:
# Clean every split with the same preprocessing step.
tsv_train, tsv_dev, tsv_test = (
    simple_preprocess(split_frame) for split_frame in (tsv_train, tsv_dev, tsv_test)
)
#tsv_train = tsv_train.reset_index()
# Preview the first rows of the cleaned dev split.
tsv_dev.head(100)

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
12,Fondekorationen,O,_,O,_,_,O,_,_,NoSpaceAfter
13,",",O,_,O,_,_,O,_,_,_
14,föreställande,O,_,O,_,_,O,_,_,NoSpaceAfter
15,Kuddnäs,B-LOC,_,O,_,_,O,NIL,_,NoSpaceAfter
16,",",I-LOC,_,O,_,_,O,NIL,_,_
...,...,...,...,...,...,...,...,...,...,...
107,kort,O,_,O,_,_,O,_,_,NoSpaceAfter
108,före,O,_,O,_,_,O,_,_,_
109,senaste,O,_,O,_,_,O,_,_,_
110,julhälg,O,_,O,_,_,O,_,_,_


In [None]:
# Label inventory: every coarse literal tag that occurs in the training split,
# in order of first appearance.
label_set = tsv_train['NE-COARSE-LIT'].unique()
label_list = label_set.tolist()
# '_' is added as an extra class (presumably the placeholder tag of the masked
# test file - TODO confirm). Only add it when it is not already present:
# ClassLabel rejects duplicated names.
if '_' not in label_list:
    label_list.append('_')
label_num = len(label_list)

# Bidirectional mapping between tag strings and integer class ids.
labels = ClassLabel(num_classes=label_num, names=label_list)

labels

ClassLabel(num_classes=10, names=['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-HumanProd', 'I-HumanProd', '_'], id=None)

In [None]:
def create_huggingface_file(dataframe):
    """Group token rows into sentences and return them as a Hugging Face Dataset.

    Rows are consumed in order. Every row contributes its TOKEN and the integer
    id of its NE-COARSE-LIT tag (looked up through the module-level `labels`
    ClassLabel); a row whose MISC field contains 'EndOfSentence' closes the
    current sentence. Tokens left over after the last sentence boundary form a
    final sentence. Empty sentences are never emitted (previously one empty
    record was always appended after the last sentence boundary).

    Args:
        dataframe: DataFrame with the columns 'TOKEN', 'NE-COARSE-LIT' and 'MISC'.

    Returns:
        datasets.Dataset with the columns 'id' (running sentence number),
        'words' (list of tokens) and 'ner' (list of integer label ids).
    """
    from datasets import Dataset

    sentences = []
    words, tags = [], []

    def flush():
        # Store the sentence collected so far under the next free id.
        sentences.append({'id': len(sentences), 'words': words, 'ner': tags})

    rows = zip(dataframe['TOKEN'], dataframe['NE-COARSE-LIT'], dataframe['MISC'])
    for token, tag, misc in rows:
        words.append(token)
        tags.append(labels.str2int(tag))
        # str() keeps the check safe if MISC still holds a non-string (e.g. NaN).
        if 'EndOfSentence' in str(misc):
            flush()
            words, tags = [], []
    if words:
        flush()

    # Explicit column list keeps the schema stable even when no sentence was found.
    sentence_frame = pd.DataFrame(sentences, columns=['id', 'words', 'ner'])

    # NOTE: over-long sentences are not removed here; they are truncated later
    # by the tokenizer.
    return Dataset.from_pandas(sentence_frame, preserve_index=False)

In [None]:
# Convert each cleaned split into a sentence-level Hugging Face dataset.
train, val, test = (
    create_huggingface_file(split_frame) for split_frame in (tsv_train, tsv_dev, tsv_test)
)

#look at training data
for example_idx in range(10):
    print(train[example_idx])

# Number of sentences per split.
for split_dataset in (train, val, test):
    print(len(split_dataset))

{'id': 0, 'words': ['Lördagen', 'den', '22', 'Januari', 'HUFVUDSTADSBLADET', '1898', '—', 'N:o', '20', '—', '3', 'vid', 'laudtdagen', '1897', 'äskade', 'ansla', '-', 'gen', 'och', 'utvägarna', 'för', 'deras', 'bestri', '-', 'dande', '.'], 'ner': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'id': 1, 'words': ['—', 'Direkt', 'trafik', 'mellan', 'ty', '-', 'ska', 'och', 'finska', 'järnvägarna', '.'], 'ner': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'id': 2, 'words': ['I', 'skritvelse', 'at', 'den', '12', 'sistlidne', 'sep', '-', 'tember', 'meddelade', 'ryska', 'järnvägs', '-', 'departementet', 'järnvägsstyrelsen', ',', 'att', 'å', 'järnvägskonterensen', 'i', 'Amsterdam', 'den', '15', 'september', 'komme', 'att', 'behand', '-', 'las', 'en', 'af', 'Brombergska', 'järnvägs', '-', 'styrelsen', 'väkt', 'fråga', 'om', 'direkt', 'för', '-', 'bindelse', 'mellan', 'tyska', 'och', 'finska', 'järnvägarna', 'öfver', 'Werschbalowa', '—', 'Alexandrowo', '—', 'Mawu', '-', '

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing in it is
# executed; the cell only evaluates to (and echoes) the string below.
'''LOADERSCRIPT = train_path = '/content/drive/MyDrive/e_ML4NLP2/dataloader.py'
MODEL = "dbmdz/bert-base-german-europeana-cased"'''

'LOADERSCRIPT = train_path = \'/content/drive/MyDrive/e_ML4NLP2/dataloader.py\'\nMODEL = "dbmdz/bert-base-german-europeana-cased"'

In [None]:
# Disabled cell (string literal, not executed): loading data through a loader script.
'''de_total = load_dataset(LOADERSCRIPT, "de")'''

'de_total = load_dataset(LOADERSCRIPT, "de")'

In [None]:
# Disabled cell (string literal, not executed): taking the splits from `de_total`.
'''train = de_total["train"]
val = de_total["validation"]
test = de_total["test"]'''

'train = de_total["train"]\nval = de_total["validation"]\ntest = de_total["test"]'

In [None]:
# Disabled cell (string literal, not executed): building small shuffled subsets
# of the train and test data.
'''tiny_shuffle_train =  train.shuffle(seed=42).select(range(100))
tiny_train = tiny_shuffle_train.select(range(0,80))
tiny_shuffle_test = test.shuffle(seed=42).select(range(100))
tiny_test = tiny_shuffle_test.select(range(80,100))'''

'tiny_shuffle_train =  train.shuffle(seed=42).select(range(100))\ntiny_train = tiny_shuffle_train.select(range(0,80))\ntiny_shuffle_test = test.shuffle(seed=42).select(range(100))\ntiny_test = tiny_shuffle_test.select(range(80,100))'

In [None]:
# Disabled cell (string literal, not executed): inspecting one example of the tiny subset.
'''tiny_shuffle_train[10]'''

'tiny_shuffle_train[10]'

In [None]:
# Disabled cell (string literal, not executed): reading the label feature from
# the dataset info instead of building it from the training frame.
'''labels = train.info.features["NE_COARSE_LIT"].feature
labels'''

'labels = train.info.features["NE_COARSE_LIT"].feature\nlabels'

In [None]:
# `model` is the checkpoint name on a fresh run, but a later cell rebinds it to
# the loaded model object; resolve the name in both cases so this cell can be re-run.
tokenizer_checkpoint = model if isinstance(model, str) else model.name_or_path
# AutoTokenizer picks the tokenizer class that belongs to the checkpoint, so the
# cell works for the ELECTRA checkpoint as well as for the BERT alternatives.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

https://huggingface.co/jonfd/electra-small-nordic/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpi1y7pgpw


Downloading:   0%|          | 0.00/754k [00:00<?, ?B/s]

storing https://huggingface.co/jonfd/electra-small-nordic/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/9b128296428bff3da23912e1fb8af22ff435c5a6d5701776c5e37e205dc92b35.00119426595c97f9efaacc699e9267953a5b980d3aeeac10acb21bdc062d1a0c
creating metadata file for /root/.cache/huggingface/transformers/9b128296428bff3da23912e1fb8af22ff435c5a6d5701776c5e37e205dc92b35.00119426595c97f9efaacc699e9267953a5b980d3aeeac10acb21bdc062d1a0c
https://huggingface.co/jonfd/electra-small-nordic/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmphsocb0b9


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/jonfd/electra-small-nordic/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/f7c4f3649df8d9c91df3235d24e92f786993ec2a7b1da2689707a90d0ae84808.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/f7c4f3649df8d9c91df3235d24e92f786993ec2a7b1da2689707a90d0ae84808.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
https://huggingface.co/jonfd/electra-small-nordic/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp1qi2372l


Downloading:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

storing https://huggingface.co/jonfd/electra-small-nordic/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/5906c6786816deec422304fc68802e71403a1fb0a5fd3af4d74454129622ee6b.e6e0e5d9fb3b118f16c720925c6b5e4e74ec58f5e3b5e9b4f0c9d77e642d4669
creating metadata file for /root/.cache/huggingface/transformers/5906c6786816deec422304fc68802e71403a1fb0a5fd3af4d74454129622ee6b.e6e0e5d9fb3b118f16c720925c6b5e4e74ec58f5e3b5e9b4f0c9d77e642d4669
loading file https://huggingface.co/jonfd/electra-small-nordic/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/9b128296428bff3da23912e1fb8af22ff435c5a6d5701776c5e37e205dc92b35.00119426595c97f9efaacc699e9267953a5b980d3aeeac10acb21bdc062d1a0c
loading file https://huggingface.co/jonfd/electra-small-nordic/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/jonfd/electra-small-nordic/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

storing https://huggingface.co/jonfd/electra-small-nordic/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/fcd832221f4dec48dbbdbf74d95865fb5e435d34215d91d9ccfcadf83e75afad.f89d5cb4d821d04c96937cf6dcb4cbd3d303fe2f7418d720b55399b67402b0ce
creating metadata file for /root/.cache/huggingface/transformers/fcd832221f4dec48dbbdbf74d95865fb5e435d34215d91d9ccfcadf83e75afad.f89d5cb4d821d04c96937cf6dcb4cbd3d303fe2f7418d720b55399b67402b0ce
loading configuration file https://huggingface.co/jonfd/electra-small-nordic/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fcd832221f4dec48dbbdbf74d95865fb5e435d34215d91d9ccfcadf83e75afad.f89d5cb4d821d04c96937cf6dcb4cbd3d303fe2f7418d720b55399b67402b0ce
Model config ElectraConfig {
  "_name_or_path": "jonfd/electra-small-nordic",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidde

In [None]:
def tokenize_func(batch, max_length=100, special_label_id=None):
    """Tokenize a batch of pre-split sentences and align the NER labels to subwords.

    Relies on the module-level `tokenizer` and `labels` objects.

    Args:
        batch: Mapping with 'words' (list of token lists) and 'ner' (list of
            integer label lists, parallel to 'words').
        max_length: Fixed sequence length; sentences are padded or truncated to it.
        special_label_id: Label id assigned to the leading and trailing special
            tokens and to padding. Defaults to the id of "O". Passing -100 makes
            PyTorch's cross-entropy loss ignore those positions; any metric code
            then has to mask that value as well.

    Returns:
        The tokenizer output, extended with
        'labels'   - one label id per position, exactly `max_length` long, and
        'subwords' - for every word the list of its subword ids.
    """
    filler_id = labels.str2int("O") if special_label_id is None else special_label_id

    tokenized = tokenizer(
        batch["words"],
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
    # Subword ids per word, needed to know how many positions each word occupies.
    tokenized_words = [
        [tokenizer(word, add_special_tokens=False)["input_ids"] for word in sent]
        for sent in batch["words"]
    ]

    aligned_labels = []
    for sent_words, sent_nes in zip(tokenized_words, batch["ner"]):
        sent_labels = [filler_id]  # leading special token
        for subwords, ne in zip(sent_words, sent_nes):
            # Every subword inherits the label of the word it belongs to.
            sent_labels.extend([ne] * len(subwords))
        sent_labels.append(filler_id)  # trailing special token

        if len(sent_labels) > max_length:
            # The tokenizer truncated the sentence and put the closing special
            # token at the last position, so that position must not carry a
            # word label.
            sent_labels = sent_labels[:max_length]
            sent_labels[-1] = filler_id
        else:
            sent_labels.extend([filler_id] * (max_length - len(sent_labels)))
        aligned_labels.append(sent_labels)

    tokenized["labels"] = aligned_labels
    tokenized["subwords"] = tokenized_words
    return tokenized

def tokenize_nolabel_func(batch, max_length=100):
    """Tokenize a batch of pre-split sentences without producing labels.

    Relies on the module-level `tokenizer`.

    Args:
        batch: Mapping with 'words' (list of token lists).
        max_length: Fixed sequence length; sentences are padded or truncated to it.

    Returns:
        The tokenizer output, extended with 'subwords': for every word the
        list of its subword ids.
    """
    tokenized = tokenizer(
        batch["words"],
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )
    # Per-word subword ids let the prediction step map subword scores back to words.
    tokenized["subwords"] = [
        [tokenizer(word, add_special_tokens=False)["input_ids"] for word in sent]
        for sent in batch["words"]
    ]
    return tokenized

#tiny_train_tokenized = tiny_train.map(tokenize_func, batched=True, batch_size=50)
#tiny_test_tokenized = tiny_test.map(tokenize_nolabel_func, batched=True, batch_size=50)

# Train and dev carry gold labels; the test split is tokenized without labels.
train_tokenized, val_tokenized = (
    labelled_split.map(tokenize_func, batched=True, batch_size=50)
    for labelled_split in (train, val)
)
test_tokenized = test.map(tokenize_nolabel_func, batched=True, batch_size=50)


  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [None]:
# Disabled cell (string literal, not executed): an earlier training setup with
# per-epoch evaluation.
'''# train for 1 epochs, evaluate after each epoch
training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=3,
    evaluation_strategy="epoch"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
)
trainer.train()'''

'# train for 1 epochs, evaluate after each epoch\ntraining_args = TrainingArguments(\n    "test_trainer",\n    num_train_epochs=3,\n    evaluation_strategy="epoch"\n)\n\n\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=tokenized_train,\n    eval_dataset=tokenized_dev,\n)\ntrainer.train()'

In [None]:
# Default training configuration; checkpoints go to the project folder on the
# mounted Drive. The path must be absolute (leading '/'): without it the
# directory is resolved relative to the current working directory instead.
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/e_ML4NLP2/"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# `model` starts out as the checkpoint name and is rebound to the model object
# below; resolving the name first keeps this cell re-runnable.
model_checkpoint = model if isinstance(model, str) else model.name_or_path

# AutoModelForTokenClassification selects the architecture that matches the
# checkpoint. Forcing the BERT class onto the ELECTRA checkpoint meant the
# pretrained encoder weights were reported as unused and therefore not loaded.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=labels.num_classes,
    # Store the real tag names in the config instead of LABEL_0 ... LABEL_n.
    id2label=dict(enumerate(labels.names)),
    label2id={name: idx for idx, name in enumerate(labels.names)},
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_tokenized)
trainer.train()

loading configuration file https://huggingface.co/jonfd/electra-small-nordic/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fcd832221f4dec48dbbdbf74d95865fb5e435d34215d91d9ccfcadf83e75afad.f89d5cb4d821d04c96937cf6dcb4cbd3d303fe2f7418d720b55399b67402b0ce
You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Model config BertConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
  

Downloading:   0%|          | 0.00/83.8M [00:00<?, ?B/s]

storing https://huggingface.co/jonfd/electra-small-nordic/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/5450502f540a6c3576ed7c32d35938329201d31e730c494208e59fb379cbf560.a2a07d32d50ddcbad69c927d357668f85c5c6979bfd6dbd0d06d78cf330df447
creating metadata file for /root/.cache/huggingface/transformers/5450502f540a6c3576ed7c32d35938329201d31e730c494208e59fb379cbf560.a2a07d32d50ddcbad69c927d357668f85c5c6979bfd6dbd0d06d78cf330df447
loading weights file https://huggingface.co/jonfd/electra-small-nordic/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/5450502f540a6c3576ed7c32d35938329201d31e730c494208e59fb379cbf560.a2a07d32d50ddcbad69c927d357668f85c5c6979bfd6dbd0d06d78cf330df447
Some weights of the model checkpoint at jonfd/electra-small-nordic were not used when initializing BertForTokenClassification: ['electra.encoder.layer.10.output.dense.weight', 'electra.encoder.layer.3.attention.output.dense.weight', 'electra.encoder.la

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=399, training_loss=0.3488930532508028, metrics={'train_runtime': 43.3614, 'train_samples_per_second': 73.614, 'train_steps_per_second': 9.202, 'total_flos': 18156482870400.0, 'train_loss': 0.3488930532508028, 'epoch': 3.0})

In [None]:
# Run the fine-tuned model over the (unlabelled) test split.
predictions = trainer.predict(test_dataset=test_tokenized)

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, subwords, id, ner. If words, subwords, id, ner are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 343
  Batch size = 8


In [None]:
# Write the test predictions as a TSV file: a header line followed by one line
# per original word holding the token, its predicted coarse tag and '_' in the
# eight remaining columns.
# NOTE(review): the file name contains "de" although the data paths above point
# to the Swedish ("sv") files - confirm the intended name.
empty_columns = '\t'.join(['_'] * 8)

# The context manager closes the file even if writing fails; the explicit
# encoding keeps non-ASCII tokens intact regardless of the platform default.
with open("/content/drive/MyDrive/e_ML4NLP2/teamname_bundle4_de_0.tsv", "w", encoding="utf-8") as outfile:
    outfile.write("TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC")

    for pred, tinput in zip(predictions.predictions, test_tokenized):
        pred_word_labels = []
        # Start at 1 to skip pred for cls token
        i = 1

        for word in tinput['subwords']:
            end = i + len(word)
            if end > len(pred):
                # The word does not fit into the scored positions (the sentence
                # was truncated): fall back to "O" for it and for every later
                # word that has subwords.
                i = len(pred)
                pred_word_labels.append("O")
                continue
            # Sum the scores of all subwords of the word, then take the best class.
            word_score = pred[i:end].sum(axis=0)
            i = end
            pred_word_labels.append(labels.int2str(int(np.argmax(word_score))))

        for token, label in zip(tinput['words'], pred_word_labels):
            outfile.write("\n" + f"{token}\t{label}\t" + empty_columns)

In [None]:
# Metric objects used by `compute_metrics`.
accuracy, f1_metric = (
    datasets.load_metric(metric_name) for metric_name in ("accuracy", "f1")
)

def compute_metrics(eval_pred):
    """Compute position-level accuracy and micro / macro F1 for the Trainer.

    Uses the module-level `accuracy` and `f1_metric` metric objects.

    Args:
        eval_pred: Pair `(logits, label_ids)`; the class axis of `logits` is
            the last one.

    Returns:
        Dict with the keys 'accuracy', 'f1_micro' and 'f1_macro'.
    """
    # Named `label_ids` so the global `labels` ClassLabel is not shadowed.
    logits, label_ids = eval_pred
    refs = np.asarray(label_ids).flatten()
    pred = np.argmax(logits, axis=-1).flatten()

    # Positions labelled -100 (the value PyTorch's cross-entropy loss ignores)
    # carry no gold label and are excluded from scoring.
    scored = refs != -100
    refs, pred = refs[scored], pred[scored]

    return {
        "accuracy": accuracy.compute(predictions=pred, references=refs)["accuracy"],
        "f1_micro": f1_metric.compute(predictions=pred, references=refs, average="micro")["f1"],
        "f1_macro": f1_metric.compute(predictions=pred, references=refs, average="macro")["f1"],
    }

In [None]:
# Score the fine-tuned model on the dev split with a dedicated evaluation Trainer.
eval_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
)
eval_trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, subwords, id, ner. If words, subwords, id, ner are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 127
  Batch size = 8


(127, 100, 10)
(127, 100)


{'eval_accuracy': 0.9365354330708662,
 'eval_f1_macro': 0.13265950323935816,
 'eval_f1_micro': 0.9365354330708662,
 'eval_loss': 0.2978798449039459,
 'eval_runtime': 0.6827,
 'eval_samples_per_second': 186.013,
 'eval_steps_per_second': 23.435}