In [20]:
# install huggingface and datasets
!pip install -q datasets transformers
!pip install torch
!pip install pandas
!pip install pyarrow



In [21]:
from datasets import load_dataset, ClassLabel
from transformers import BertForTokenClassification, BertTokenizer, TrainingArguments, Trainer
import numpy as np
import torch
from sklearn import metrics


import pandas as pd
import datasets
from datasets.features import ClassLabel

In [22]:
torch.cuda.is_available()
device = torch.device("cuda")

In [23]:
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/MyDrive/e_ML4NLP2/v2.1/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
ajmc  hipe2020	letemps  newseye  sonar  topres19th


In [24]:
train_path = '/content/drive/MyDrive/e_ML4NLP2/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv'
dev_path = '/content/drive/MyDrive/e_ML4NLP2/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-dev-fr.tsv'
test_path = '/content/drive/MyDrive/e_ML4NLP2/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-test_allmasked-fr.tsv'

#model  = 'dbmdz/bert-base-historic-multilingual-cased'
#model = 'setu4993/LaBSE'
#model = 'dbmdz/bert-base-french-europeana-cased'
model = 'dbmdz/electra-base-french-europeana-cased-discriminator'

In [25]:
# import dataset from cloned git repo
def load_dataset(path):
    df = pd.read_csv(path, sep='\t', skip_blank_lines=False, engine='python', quoting=3)
    # error_bad_lines=False, 
    return df


tsv_train = load_dataset(train_path)
tsv_dev = load_dataset(dev_path)
tsv_test = load_dataset(test_path)

In [26]:
def simple_preprocess(dataframe):
    # Add end_of_document token in df
    dataframe = dataframe.dropna(subset=['TOKEN'])

    # Filter out metadata rows beginning with #
    dataframe = dataframe[~dataframe['TOKEN'].astype(str).str.startswith('#')]
    dataframe = dataframe[~dataframe['TOKEN'].astype(str).str.startswith('\t')]

    #transforming nan var from Float to string to use in (***)
    dataframe.MISC = dataframe.MISC.fillna('')

    return dataframe

In [27]:
tsv_train = simple_preprocess(tsv_train)
tsv_dev = simple_preprocess(tsv_dev)
tsv_test = simple_preprocess(tsv_test)
#tsv_train = tsv_train.reset_index()
tsv_dev.head(100)

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
12,On,O,_,O,_,_,O,_,_,_
13,pense,O,_,O,_,_,O,_,_,_
14,bien,O,_,O,_,_,O,_,_,_
15,que,O,_,O,_,_,O,_,_,_
16,l,O,_,O,_,_,O,_,_,NoSpaceAfter
...,...,...,...,...,...,...,...,...,...,...
107,ne,O,_,O,_,_,O,_,_,_
108,soit,O,_,O,_,_,O,_,_,_
109,celle,O,_,O,_,_,O,_,_,_
110,de,O,_,O,_,_,O,_,_,_


In [28]:
label_set = tsv_train['NE-COARSE-LIT'].unique()
# define the label mapping for NER
label_list = label_set.tolist()
label_list.append('_')
label_num = len(label_list)

labels = ClassLabel(num_classes=label_num, names=label_list)

labels

ClassLabel(num_classes=10, names=['O', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-HumanProd', 'I-HumanProd', '_'], id=None)

In [29]:
def create_huggingface_file(dataframe):
    import pyarrow as pa
    import re
    from datasets import Dataset

    #creating dataset in json
    hug_out = []
    idx = 0 
    items = {'id': idx,'words':[ ], 'ner': [ ]}
    hug_out.append(items)
    for index, row in dataframe.iterrows():
        if  not re.search(r'EndOfSentence', row['MISC']):
            items['words'].append(row['TOKEN'])
            items['ner'].append(labels.str2int(row['NE-COARSE-LIT']))
        else:
            items['words'].append(row['TOKEN'])
            items['ner'].append(labels.str2int(row['NE-COARSE-LIT']))
            idx += 1
            items = {'id': idx,'words':[ ], 'ner': [ ]}
            hug_out.append(items)
    #filter hug_out out, delete items which has len(words) > 380
    #hug_out = filter(lambda x: len(x['words']) < 380, hug_out)
    #json to df
    hug_out = pd.DataFrame(hug_out)

    # delete all sentences that are too long
    #hug_out = hug_out[hug_out['words'].map(len) < 512] #why does not work? QA

    ### convert to Huggingface dataset
    hug_out = Dataset(pa.Table.from_pandas(hug_out))

    return hug_out

In [30]:
train = create_huggingface_file(tsv_train)
val = create_huggingface_file(tsv_dev)
test = create_huggingface_file(tsv_test)

#look at training data
for i in range(10):
  print(train[i])

print(len(train))
print(len(val))
print(len(test))

{'id': 0, 'words': ['18', 'Centimes', 'ÉDITION', 'DE', 'PIHIS', 'N', '.', '2663', '.', '—', 'LUNDI', '15', 'JANVIEE', '1103', '9', ',', 'rue', 'Louis', '-', 'le', '-', 'Grand', '(', '2e', ')', 'Adr', '.', 'telég', '.', ':', 'ŒUVRE', '-', 'PARIS', 'Chèque', 'nostal', ':', 'Compte', '1046', 'GUSTAVE', 'TÉRY', 'Tééies', '(', 'Otonier', '59', '-', '96', ',', '59', '-', '57', ',', ',', '076', '-', '83', '.', 'Cent', '.', '03', '-', '15', 'Au', 'moment', 'où', 'nos', 'troupes', 'entrent', 'dans', 'la', 'Rubr', ',', 'on', 'nous', 'annonce', 'que', 'l', "'", 'on', 'veut', 'augmentes', 'nos', 'impôts', 'de', '20', '%', '.', 'Simple', 'coïncidence', '.'], 'ner': [0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 3, 4, 4, 4, 3, 4, 4, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'id': 1, 'words': ['A', 'LA', 'TRIBUNE', 'Glissement', 'des', 'allian

In [31]:
'''LOADERSCRIPT = train_path = '/content/drive/MyDrive/e_ML4NLP2/dataloader.py'
MODEL = "dbmdz/bert-base-german-europeana-cased"'''

'LOADERSCRIPT = train_path = \'/content/drive/MyDrive/e_ML4NLP2/dataloader.py\'\nMODEL = "dbmdz/bert-base-german-europeana-cased"'

In [32]:
'''de_total = load_dataset(LOADERSCRIPT, "de")'''

'de_total = load_dataset(LOADERSCRIPT, "de")'

In [33]:
'''train = de_total["train"]
val = de_total["validation"]
test = de_total["test"]'''

'train = de_total["train"]\nval = de_total["validation"]\ntest = de_total["test"]'

In [34]:
'''tiny_shuffle_train =  train.shuffle(seed=42).select(range(100))
tiny_train = tiny_shuffle_train.select(range(0,80))
tiny_shuffle_test = test.shuffle(seed=42).select(range(100))
tiny_test = tiny_shuffle_test.select(range(80,100))'''

'tiny_shuffle_train =  train.shuffle(seed=42).select(range(100))\ntiny_train = tiny_shuffle_train.select(range(0,80))\ntiny_shuffle_test = test.shuffle(seed=42).select(range(100))\ntiny_test = tiny_shuffle_test.select(range(80,100))'

In [35]:
'''tiny_shuffle_train[10]'''

'tiny_shuffle_train[10]'

In [36]:
'''labels = train.info.features["NE_COARSE_LIT"].feature
labels'''

'labels = train.info.features["NE_COARSE_LIT"].feature\nlabels'

In [37]:
tokenizer = BertTokenizer.from_pretrained(model)

Downloading:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [38]:
def tokenize_func(batch):
    tokenized = tokenizer(batch["words"], is_split_into_words=True, padding="max_length", max_length=100, truncation=True)
    tokenized_words = [
        [
            tokenizer(word, add_special_tokens=False)["input_ids"]
            for word in sent
        ]
        for sent in batch["words"]
    ]
    tokenized_ner = []

    for sent_words, sent_nes in zip(tokenized_words, batch["ner"]):
        tokenized_ner.append(
            [labels.str2int("O")]  # BOS symbol
            + [
                ne
                for subwords, ne in zip(sent_words, sent_nes)
                for _ in range(len(subwords))
            ]
            + [labels.str2int("O")]  # EOS symbol
        )

    # Padding with "O"
    tokenized["labels"] = [(ner + [labels.str2int("O")] * (100 - len(ner)))[:100] for ner in tokenized_ner]
    tokenized["subwords"] = tokenized_words
    return tokenized

def tokenize_nolabel_func(batch):
    tokenized = tokenizer(batch["words"], is_split_into_words=True, padding="max_length", max_length=100, truncation=True)
    tokenized_words = [
        [
            tokenizer(word, add_special_tokens=False)["input_ids"]
            for word in sent
        ]
        for sent in batch["words"]
    ]

    tokenized["subwords"] = tokenized_words
    return tokenized

#tiny_train_tokenized = tiny_train.map(tokenize_func, batched=True, batch_size=50)
#tiny_test_tokenized = tiny_test.map(tokenize_nolabel_func, batched=True, batch_size=50)

train_tokenized = train.map(tokenize_func, batched=True, batch_size=50)
val_tokenized = val.map(tokenize_func, batched=True, batch_size=50)
test_tokenized = test.map(tokenize_nolabel_func, batched=True, batch_size=50)


  0%|          | 0/143 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/51 [00:00<?, ?ba/s]

In [39]:
training_args = TrainingArguments(
    output_dir = "content/drive/MyDrive/e_ML4NLP2/"
)

In [40]:
model = BertForTokenClassification.from_pretrained(model, num_labels=labels.num_classes)
trainer = Trainer(model=model, args=training_args, train_dataset=train_tokenized)
trainer.train()

You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/electra-base-french-europeana-cased-discriminator were not used when initializing BertForTokenClassification: ['electra.encoder.layer.11.output.LayerNorm.bias', 'electra.encoder.layer.3.output.LayerNorm.bias', 'electra.encoder.layer.10.attention.output.dense.weight', 'electra.encoder.layer.3.attention.self.query.bias', 'electra.encoder.layer.8.output.LayerNorm.bias', 'electra.encoder.layer.6.intermediate.dense.weight', 'electra.encoder.layer.8.output.LayerNorm.weight', 'electra.encoder.layer.10.attention.self.key.weight', 'electra.encoder.layer.6.attention.self.key.bias', 'electra.encoder.layer.6.attention.output.dense.weight', 'electra.encoder.layer.3.attention.self.query.weight', 'electra.encoder.layer.6.attention.self.query.weight', 'electra.encoder.layer.4.attention.output.dense.weight', 'electra.encoder.layer.11.attention.output.dense.weight', 'electra.encoder.layer.11.attention.output.dense.bias', 'electra.encoder.layer.4.attention.ou

Step,Training Loss
500,0.195
1000,0.1464
1500,0.1275
2000,0.1074
2500,0.099


Saving model checkpoint to content/drive/MyDrive/e_ML4NLP2/checkpoint-500
Configuration saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-500/config.json
Model weights saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-500/pytorch_model.bin
Saving model checkpoint to content/drive/MyDrive/e_ML4NLP2/checkpoint-1000
Configuration saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-1000/config.json
Model weights saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to content/drive/MyDrive/e_ML4NLP2/checkpoint-1500
Configuration saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-1500/config.json
Model weights saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to content/drive/MyDrive/e_ML4NLP2/checkpoint-2000
Configuration saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-2000/config.json
Model weights saved in content/drive/MyDrive/e_ML4NLP2/checkpoint-2000/pytorch_model.bin
Saving model ch

TrainOutput(global_step=2667, training_loss=0.13197432504774795, metrics={'train_runtime': 1000.9769, 'train_samples_per_second': 21.3, 'train_steps_per_second': 2.664, 'total_flos': 1088185760694000.0, 'train_loss': 0.13197432504774795, 'epoch': 3.0})

In [41]:
predictions = trainer.predict(test_tokenized)

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner, subwords, words, id. If ner, subwords, words, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2535
  Batch size = 8


In [42]:
outfile = open("/content/drive/MyDrive/e_ML4NLP2/teamname_bundle4_de_0.tsv", "w")
outfile.write("TOKEN\tNE-COARSE-LIT\tNE-COARSE-METO\tNE-FINE-LIT\tNE-FINE-METO\tNE-FINE-COMP\tNE-NESTED\tNEL-LIT\tNEL-METO\tMISC")

for pred, tinput in zip(predictions.predictions, test_tokenized):
    pred_word_labels = []

    try:
        sent_len = tinput["input_ids"].index(0)
    except ValueError:
        sent_len = len(tinput["input_ids"])
    # Start at 1 to skip pred for cls token
    i = 1

    for word in tinput['subwords']:
        
        word_score = np.zeros_like(pred[0])
        eos_reached = False
        try:
            for subword in word:
                word_score += pred[i]
                i += 1
            label = labels.int2str(int(np.argmax(word_score)))
            pred_word_labels.append(label)
        except IndexError:
            pred_word_labels.append("O")

    for token, label in zip(tinput['words'], pred_word_labels):
        outfile.write("\n" + f"{token}\t{label}\t" + ('\t'.join(['_'] * 8)))

outfile.close()

In [43]:
accuracy = datasets.load_metric("accuracy")
f1_metric = datasets.load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    refs = labels.flatten()
    predictions = np.argmax(logits, axis=-1)
    print(logits.shape)
    print(predictions.shape)
    pred = predictions.flatten()
    return {
        "accuracy": accuracy.compute(predictions=pred, references=refs)["accuracy"],
        "f1_micro": f1_metric.compute(predictions=pred, references=refs, average="micro")["f1"],
        "f1_macro": f1_metric.compute(predictions=pred, references=refs, average="macro")["f1"],
    }

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [44]:
Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
).evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner, subwords, words, id. If ner, subwords, words, id are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 663
  Batch size = 8


(663, 100, 10)
(663, 100)


{'eval_accuracy': 0.9741327300150829,
 'eval_f1_macro': 0.3760626904177392,
 'eval_f1_micro': 0.9741327300150829,
 'eval_loss': 0.09773598611354828,
 'eval_runtime': 11.0937,
 'eval_samples_per_second': 59.764,
 'eval_steps_per_second': 7.482}