### Data Loading and Preprocessing
In this section, we load the datasets from both the source (English) and target (Wolof) languages. 
We are using datasets within the Universal Dependencies framework for the source language (English) to fine-tune the models.
For the target language (Wolof), we will be testing the zero-shot capabilities of the models.

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00

In [4]:
!pip install conllu

Collecting conllu
  Downloading conllu-5.0.2-py3-none-any.whl.metadata (21 kB)
Downloading conllu-5.0.2-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-5.0.2


In [10]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=1369e9a17d65ea2999d64579157f65ffe260bea1a445138bb0ed2efbc184a270
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [23]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import load_dataset
from seqeval.metrics import classification_report, accuracy_score
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

## Model Fine-tuning

We fine-tune two models: XLM-R and Glot500 on the source language (English). 
This is a supervised step where the models learn to perform POS tagging on the better-resourced language.
No further training is performed on the low-resource language (Wolof).

In [6]:
def fine_tune_model(model_name, language, output_dir):
    # Load the dataset
    dataset = load_dataset("universal_dependencies", language)

    # Investigate the labels
    all_labels = set()
    for split in dataset.keys():
        all_labels.update(label for example in dataset[split]["upos"] for label in example)
    num_labels = len(all_labels)
    print(f"Number of unique labels: {num_labels}")
    print(f"Labels: {sorted(all_labels)}")

    # Load pre-trained model and tokenizer
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create label mapping
    label_map = {label: i for i, label in enumerate(sorted(all_labels))}
    print("Label mapping:")
    print(label_map)

    # Tokenize and prepare the dataset
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)
        labels = []
        for i, label in enumerate(examples["upos"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label_map[label[word_idx]])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

    # Set up data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model
    trainer.save_model(output_dir)

In [7]:
fine_tune_model("xlm-roberta-base", "en_ewt", "./xlm-r-finetuned")

universal_dependencies.py:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/191k [00:00<?, ?B/s]

The repository for universal_dependencies contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/universal_dependencies.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2077 [00:00<?, ? examples/s]

Number of unique labels: 18
Labels: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Label mapping:
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17}




Map:   0%|          | 0/12543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Map:   0%|          | 0/2077 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.3932,0.116858
2,0.0743,0.102471
3,0.0578,0.103082


In [8]:
fine_tune_model("cis-lmu/glot500-base", "en_ewt", "./glot500-finetuned")

Number of unique labels: 18
Labels: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]


config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cis-lmu/glot500-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/7.66M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]



Label mapping:
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17}


Map:   0%|          | 0/12543 [00:00<?, ? examples/s]

Map:   0%|          | 0/2002 [00:00<?, ? examples/s]

Map:   0%|          | 0/2077 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.0643,0.21826
2,0.1283,0.137257
3,0.0982,0.122559


## Zero-Shot Transfer
Once the models are fine-tuned on English, we apply them directly to the low-resource language (Wolof) without additional training. 
We will evaluate the performance of the models using POS-annotated corpora in Wolof.

In [21]:
def zero_shot_evaluation(model_path, target_language):
    # Load the fine-tuned model and tokenizer
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Load the target language dataset
    dataset = load_dataset("universal_dependencies", target_language)

    # Get the label mapping (assuming the model provides id2label mapping)
    label_map = model.config.id2label
    label_map_inv = {v: k for k, v in label_map.items()}  # Create reverse map for string labels

    # Tokenize and prepare the dataset
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
        labels = []
        for i, label in enumerate(examples["upos"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # Map the string label to its corresponding integer if necessary
                    if isinstance(label[word_idx], str):
                        label_ids.append(int(label_map_inv[label[word_idx]]))
                    else:
                        label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

    # Perform prediction
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_predictions = []
    all_labels = []

    for batch in tokenized_datasets["test"]:
        input_ids = torch.tensor([batch["input_ids"]]).to(device)
        attention_mask = torch.tensor([batch["attention_mask"]]).to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        predictions = outputs.logits.argmax(dim=-1)
        predictions = predictions.cpu().numpy()[0]
        labels = batch["labels"]

        # Prepare lists for predictions and true labels
        true_predictions = []
        true_labels = []

        # Filter out ignored index (-100) and group by sentence
        current_prediction = []
        current_label = []

        for (p, l) in zip(predictions, labels):
            if l != -100:  # Only consider valid labels
                current_prediction.append(model.config.id2label[p])
                current_label.append(model.config.id2label[l])
            if l == -100 or (len(current_label) > 0 and l == -100):  # End of a sentence
                if current_label:  # Only add if there are true labels
                    all_predictions.append(current_prediction)
                    all_labels.append(current_label)
                    current_prediction = []
                    current_label = []

    # Compute and print the classification report
    report = classification_report(all_labels, all_predictions, zero_division=0)
    print(f"Classification Report for {target_language}:")
    print(report)

In [22]:
# target low-resource language: Wolof
zero_shot_evaluation("./xlm-r-finetuned", "wo_wtb")
zero_shot_evaluation("./glot500-finetuned", "wo_wtb")

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Classification Report for wo_wtb:
              precision    recall  f1-score   support

      ABEL_0       0.33      0.37      0.35      1727
      ABEL_1       0.99      1.00      0.99      1152
     ABEL_10       0.20      0.84      0.32       634
     ABEL_11       0.51      0.15      0.23      1283
     ABEL_12       0.00      0.00      0.00         0
     ABEL_13       0.03      0.02      0.02       309
     ABEL_14       0.02      0.04      0.03       296
     ABEL_15       0.01      0.67      0.01         3
     ABEL_16       0.37      0.14      0.20      1737
     ABEL_17       0.11      0.04      0.06       887
      ABEL_2       0.18      0.10      0.13       748
      ABEL_3       0.96      0.34      0.51       140
      ABEL_4       0.00      0.00      0.00         1
      ABEL_5       0.02      0.00      0.01       209
      ABEL_6       0.02      1.00      0.04         2
      ABEL_7       0.10      0.12      0.11       145
      ABEL_8       0.02      0.00      0.00    

Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/470 [00:00<?, ? examples/s]

Classification Report for wo_wtb:
              precision    recall  f1-score   support

      ABEL_0       0.61      0.80      0.69      1702
      ABEL_1       0.99      1.00      1.00      1146
     ABEL_10       0.79      0.92      0.85       622
     ABEL_11       0.66      0.52      0.58      1238
     ABEL_12       0.00      0.00      0.00         0
     ABEL_13       0.43      0.17      0.25       309
     ABEL_14       0.26      0.48      0.34       293
     ABEL_15       0.04      0.67      0.07         3
     ABEL_16       0.79      0.64      0.71      1725
     ABEL_17       0.63      0.48      0.54       871
      ABEL_2       0.70      0.78      0.74       748
      ABEL_3       0.82      0.66      0.73       138
      ABEL_4       0.00      0.00      0.00         1
      ABEL_5       0.40      0.63      0.49       208
      ABEL_6       0.00      0.00      0.00         2
      ABEL_7       0.39      0.76      0.51       145
      ABEL_8       0.65      0.12      0.20    

## Subword Tokenization
In this section, we analyze how differences in subword tokenization between the source (English) and target (Wolof) languages 
can affect the performance of zero-shot POS tagging. 

In [24]:
def analyze_tokenization(model_name, source_lang, target_lang):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load datasets
    source_dataset = load_dataset("universal_dependencies", source_lang)
    target_dataset = load_dataset("universal_dependencies", target_lang)

    def get_subword_stats(dataset):
        subword_lengths = []
        for example in dataset["train"]:
            for word in example["tokens"]:
                subwords = tokenizer.tokenize(word)
                subword_lengths.append(len(subwords))
        return subword_lengths

    source_subword_lengths = get_subword_stats(source_dataset)
    target_subword_lengths = get_subword_stats(target_dataset)

    # Plot histogram
    plt.figure(figsize=(10, 6))
    plt.hist(source_subword_lengths, alpha=0.5, label=source_lang)
    plt.hist(target_subword_lengths, alpha=0.5, label=target_lang)
    plt.legend(loc='upper right')
    plt.title(f"Subword Length Distribution: {source_lang} vs {target_lang}")
    plt.xlabel("Number of subwords per word")
    plt.ylabel("Frequency")
    plt.savefig(f"subword_distribution_{source_lang}_{target_lang}.png")
    plt.close()

    # Calculate statistics
    source_avg = sum(source_subword_lengths) / len(source_subword_lengths)
    target_avg = sum(target_subword_lengths) / len(target_subword_lengths)

    print(f"Average subwords per word in {source_lang}: {source_avg:.2f}")
    print(f"Average subwords per word in {target_lang}: {target_avg:.2f}")

    # Most common multi-subword tokens
    def get_multi_subword_tokens(dataset):
        multi_subword_tokens = []
        for example in dataset["train"]:
            for word in example["tokens"]:
                subwords = tokenizer.tokenize(word)
                if len(subwords) > 1:
                    multi_subword_tokens.append(word)
        return Counter(multi_subword_tokens).most_common(10)

    print(f"\nMost common multi-subword tokens in {source_lang}:")
    print(get_multi_subword_tokens(source_dataset))

    print(f"\nMost common multi-subword tokens in {target_lang}:")
    print(get_multi_subword_tokens(target_dataset))

In [26]:
analyze_tokenization("xlm-roberta-base", "en_ewt", "wo_wtb")
analyze_tokenization("cis-lmu/glot500-base", "en_ewt", "wo_wtb")

Average subwords per word in en_ewt: 1.31
Average subwords per word in wo_wtb: 1.81

Most common multi-subword tokens in en_ewt:
[('.', 8640), (',', 7021), ("'s", 906), ("n't", 645), ("don't", 233), ("'m", 197), ("I'm", 176), ("'ll", 148), ("'ve", 124), ("it's", 101)]

Most common multi-subword tokens in wo_wtb:
[(',', 1175), ('.', 1000), ('ñu', 355), ('yu', 159), ('leen', 131), ('moo', 117), ('moom', 104), ('ngir', 75), ('nekk', 72), ('dafa', 72)]
Average subwords per word in en_ewt: 1.19
Average subwords per word in wo_wtb: 1.39

Most common multi-subword tokens in en_ewt:
[("'s", 906), ("n't", 645), ("don't", 233), ("'m", 197), ("I'm", 176), ("'ll", 148), ("'ve", 124), ("it's", 101), ("'re", 101), ('Enron', 96)]

Most common multi-subword tokens in wo_wtb:
[('Almaañ', 42), ('nekkoon', 39), ('yooyu', 37), ('Loolu', 32), ('njëkk', 31), ('yépp', 30), ('sañ-sañ', 30), ('nataal', 30), ('Waalo', 28), ('yeneen', 28)]
