# Dropout Experiments
This notebook runs zero-shot cross-lingual transfer experiments (English → Wolof and French → Catalan) using:

- BPE dropout for XLM-R
- Word dropout for Glot500

## Setup

In [4]:
# Pinned to the versions resolved in the recorded run so a fresh runtime
# reproduces the same environment. %pip (not !pip) targets the running kernel.
# datasets is pinned as well: the universal_dependencies loader used by the
# experiments is a script-based dataset (it triggers the trust_remote_code
# prompt), which later major datasets releases no longer load.
%pip install -q evaluate==0.4.3 datasets==3.3.0

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
# Pinned to the version installed in the recorded run; %pip targets the running kernel.
%pip install -q conllu==6.0.0

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [6]:
# Pinned to the version installed in the recorded run; %pip targets the running kernel.
%pip install -q seqeval==1.2.2

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=71d450728a52e6ffdb36ead6fef448189d7995ff396d513d469f4896651752a6
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [1]:
# Colab only: mount Google Drive at /content/drive so that the project folder
# stored under MyDrive can be reached from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch to the project folder on Drive before importing from it
# (presumably this is what makes `local_library` importable - confirm).
# The explicit %cd magic does not depend on IPython's automagic setting.
%cd '/content/drive/MyDrive/Lorraine/-Zero-Shot-for-Under-Resourced-Language'

/content/drive/.shortcut-targets-by-id/1bl0mVqtZ4D0FRWLUe43BJc9qg8REnEwv/-Zero-Shot-for-Under-Resourced-Language


In [7]:
import local_library.automation_util as automation
from huggingface_hub import login

## 1. XLM-R Experiments (BPE Dropout)

In [4]:
# XLM-R experiment grid: every (language pair, BPE-dropout probability)
# combination shares the same settings. Generating the entries from a single
# template keeps `dropout_prob` and the Hub model name in sync, which
# hand-copied dictionaries do not guarantee.
xlmr_parameters = [
    {
        'tuning_codes': [train_code],          # treebank(s) used for fine-tuning
        'test_code': test_code,                # treebank used for evaluation
        'model_name': 'xlm-roberta-base',
        # Name encodes the dropout probability and the language pair,
        # e.g. 'xlmr-bpe-dropout-0.1-en-wo'.
        'tuned_model_name': f'xlmr-bpe-dropout-{dropout_prob}-{pair_tag}',
        # Injection-related options are left at their off/empty values.
        'character_level_injection': False,
        'injection_vocab': '',
        'injection_prob': 0.0,
        'sample_threshold': 10000,
        'use_dropout': True,
        'dropout_prob': dropout_prob,
    }
    for train_code, test_code, pair_tag in (
        ('en_ewt', 'wo_wtb', 'en-wo'),         # English -> Wolof
        ('fr_gsd', 'ca_ancora', 'fr-ca'),      # French -> Catalan
    )
    for dropout_prob in (0.1, 0.2)             # 10% and 20% dropout probability
]

In [5]:
# Run every configuration in `xlmr_parameters` through the project's batch
# helper. Per the recorded output, each configuration is fine-tuned, pushed to
# the Hub under its `tuned_model_name`, and followed by a per-label results
# report. The return value is kept in `xlmr_results` for the summary cell at
# the end of the notebook.
# NOTE(review): on a fresh runtime the first training run prompts
# interactively for a W&B API key, so this cell cannot run unattended.
print("Starting XLM-R experiments with BPE dropout...")
xlmr_results = automation.batch_tune_eval(xlmr_parameters)

Starting XLM-R experiments with BPE dropout...

Running experiment with BPE dropout on xlmr
Dropout probability: 0.1
Model: xlm-roberta-base
Training data: ['en_ewt']
Test data: wo_wtb


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training on Data: en_ewt


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mscientistamy2000[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.1001,1.589112,0.335515,0.19182,0.24409,0.43006
2,1.7321,1.432888,0.40065,0.241051,0.301003,0.480042


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


events.out.tfevents.1739715125.ab8d8f147799.1342.0:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Model Pushed to Hub with name: xlmr-bpe-dropout-0.1-en-wo


Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/717k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1188 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/449 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/470 [00:00<?, ? examples/s]



Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/470 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Results:
              precision    recall  f1-score   support

           0       0.17      0.08      0.11      1677
           1       0.10      0.79      0.18       876
           2       0.00      0.00      0.00       743
           3       0.14      0.08      0.10       139
           5       0.00      0.00      0.00       211
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00       139
           8       0.00      0.00      0.00       752
           9       0.00      0.00      0.00       258
          10       0.16      0.24      0.19       674
          11       0.22      0.08      0.12      1368
          13       0.00      0.00      0.00       304
          14       0.00      0.00      0.00       293
          15       0.00      0.00      0.00         3
          16       0.14      0.04      0.06      1681
          17       0.09      0.01      0.02       928

    accuracy                           0.12     10048
   macro avg    

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training on Data: en_ewt


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.1768,1.778712,0.242249,0.114276,0.155295,0.36773
2,1.9014,1.647555,0.309427,0.155727,0.207183,0.408785


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


events.out.tfevents.1739715963.ab8d8f147799.1342.1:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Model Pushed to Hub with name: xlmr-bpe-dropout-0.2-en-wo


Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/470 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Results:
              precision    recall  f1-score   support

           0       0.18      0.29      0.22      1677
           1       0.11      0.71      0.20       876
           2       0.09      0.01      0.02       743
           3       0.26      0.11      0.15       139
           5       0.50      0.00      0.01       211
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00       139
           8       0.00      0.00      0.00       752
           9       0.00      0.00      0.00       258
          10       0.27      0.35      0.30       674
          11       0.23      0.07      0.11      1368
          13       0.00      0.00      0.00       304
          14       0.06      0.01      0.02       293
          15       0.00      0.00      0.00         3
          16       0.13      0.02      0.04      1681
          17       0.00      0.00      0.00       928

    accuracy                           0.15     10048
   macro avg    

Downloading data:   0%|          | 0.00/22.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/639k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14449 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1476 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/416 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training on Data: fr_gsd


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.0587,1.737278,0.2601,0.116689,0.161102,0.354146
2,1.7215,1.589175,0.315481,0.153549,0.206561,0.400952


Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


events.out.tfevents.1739716736.ab8d8f147799.1342.2:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Model Pushed to Hub with name: xlmr-bpe-dropout-0.1-fr-ca


Downloading data:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.81M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13123 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1709 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1846 [00:00<?, ? examples/s]

Map:   0%|          | 0/13123 [00:00<?, ? examples/s]

Map:   0%|          | 0/1709 [00:00<?, ? examples/s]

Map:   0%|          | 0/1846 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Results:
              precision    recall  f1-score   support

           0       0.15      0.15      0.15     10467
           1       0.09      0.33      0.14      4894
           2       0.16      0.12      0.14      9506
           3       0.04      0.05      0.05       992
           4       0.00      0.00      0.00       555
           5       0.02      0.01      0.01       997
           6       0.05      0.03      0.03      3182
           7       0.00      0.00      0.00        21
           8       0.08      0.04      0.06      7896
           9       0.06      0.02      0.03      1583
          10       0.21      0.18      0.20      5423
          11       0.06      0.03      0.04      2466
          13       0.00      0.00      0.00       113
          14       0.07      0.03      0.05      1577
          15       0.00      0.00      0.00         3
          16       0.06      0.08      0.07      4315
          17       0.06      0.02      0.03      2322

    accuracy    

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training on Data: fr_gsd


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.1004,1.832656,0.206064,0.081662,0.116969,0.328219
2,1.8346,1.760119,0.243371,0.104674,0.146387,0.347446


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


events.out.tfevents.1739717551.ab8d8f147799.1342.3:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Model Pushed to Hub with name: xlmr-bpe-dropout-0.2-fr-ca


Map:   0%|          | 0/13123 [00:00<?, ? examples/s]

Map:   0%|          | 0/1709 [00:00<?, ? examples/s]

Map:   0%|          | 0/1846 [00:00<?, ? examples/s]


Results:
              precision    recall  f1-score   support

           0       0.18      0.21      0.19     10467
           1       0.13      0.37      0.19      4894
           2       0.19      0.17      0.18      9506
           3       0.08      0.09      0.08       992
           4       0.00      0.00      0.00       555
           5       0.05      0.02      0.03       997
           6       0.12      0.08      0.10      3182
           7       0.00      0.00      0.00        21
           8       0.07      0.03      0.04      7896
           9       0.14      0.04      0.06      1583
          10       0.24      0.25      0.24      5423
          11       0.11      0.03      0.05      2466
          13       0.00      0.00      0.00       113
          14       0.15      0.09      0.11      1577
          15       0.00      0.00      0.00         3
          16       0.08      0.10      0.09      4315
          17       0.07      0.03      0.04      2322

    accuracy    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 2. GLOT500 Experiments (Word Dropout)

In [10]:
# Glot500 experiment grid (word dropout). Entries are generated from a single
# template so that `dropout_prob` and the Hub model name cannot drift apart;
# extend the tuples below to add language pairs or dropout probabilities.
glot500_parameters = [
    {
        'tuning_codes': [train_code],          # treebank(s) used for fine-tuning
        'test_code': test_code,                # treebank used for evaluation
        'model_name': 'cis-lmu/glot500-base',
        # Name encodes the dropout probability and the language pair,
        # e.g. 'glot500-word-dropout-0.1-en-wo'.
        'tuned_model_name': f'glot500-word-dropout-{dropout_prob}-{pair_tag}',
        'use_dropout': True,
        'dropout_prob': dropout_prob,
        'sample_threshold': 10000,
    }
    for train_code, test_code, pair_tag in (
        ('en_ewt', 'wo_wtb', 'en-wo'),         # English -> Wolof
        ('fr_gsd', 'ca_ancora', 'fr-ca'),      # French -> Catalan
    )
    for dropout_prob in (0.1,)                 # 10% dropout probability
]

In [11]:
# Run every configuration in `glot500_parameters` through the same batch
# helper used for XLM-R. Per the recorded output, each configuration is
# fine-tuned, pushed to the Hub under its `tuned_model_name`, and followed by
# a per-label results report. The return value is kept in `glot500_results`
# for the summary cell at the end of the notebook.
# NOTE(review): in the recorded run this cell stopped for two interactive
# prompts (running the universal_dependencies custom code, and a W&B API key),
# so it cannot run unattended on a fresh runtime.
print("Starting GLOT500 experiments with word dropout...")
glot500_results = automation.batch_tune_eval(glot500_parameters)

Starting GLOT500 experiments with word dropout...

Running experiment with Word dropout on glot500
Dropout probability: 0.1
Model: cis-lmu/glot500-base
Training data: ['en_ewt']
Test data: wo_wtb


README.md:   0%|          | 0.00/191k [00:00<?, ?B/s]

universal_dependencies.py:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

The repository for universal_dependencies contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/universal_dependencies.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2077 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/7.66M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cis-lmu/glot500-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training on Data: en_ewt


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mscientistamy2000[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.3858,1.979957,0.220388,0.099906,0.137487,0.362224
2,2.0113,1.767691,0.301595,0.156747,0.206283,0.409926


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


events.out.tfevents.1739721912.fe604547457b.699.0:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

Model Pushed to Hub with name: glot500-word-dropout-0.1-en-wo


Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/717k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1188 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/449 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/470 [00:00<?, ? examples/s]



Map:   0%|          | 0/1188 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/470 [00:00<?, ? examples/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Results:
              precision    recall  f1-score   support

           0       0.22      0.39      0.28      1718
           1       0.15      0.61      0.24       963
           2       0.03      0.00      0.00       758
           3       0.32      0.24      0.27       142
           5       0.00      0.00      0.00       211
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00       143
           8       0.00      0.00      0.00       784
           9       0.00      0.00      0.00       264
          10       0.27      0.66      0.38       694
          11       0.25      0.02      0.04      1384
          13       0.00      0.00      0.00       304
          14       0.20      0.01      0.01       299
          15       0.00      0.00      0.00         3
          16       0.21      0.10      0.13      1719
          17       0.08      0.04      0.05       942

    accuracy                           0.19     10330
   macro avg    

Downloading data:   0%|          | 0.00/22.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/639k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14449 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1476 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/416 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at cis-lmu/glot500-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Started Training on Data: fr_gsd


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.2926,1.971703,0.20987,0.080542,0.116409,0.317926
2,1.9299,1.814128,0.251394,0.10559,0.148716,0.354826


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


events.out.tfevents.1739722871.fe604547457b.699.1:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

Model Pushed to Hub with name: glot500-word-dropout-0.1-fr-ca


Downloading data:   0%|          | 0.00/28.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.81M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13123 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1709 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1846 [00:00<?, ? examples/s]

Map:   0%|          | 0/13123 [00:00<?, ? examples/s]

Map:   0%|          | 0/1709 [00:00<?, ? examples/s]

Map:   0%|          | 0/1846 [00:00<?, ? examples/s]


Results:
              precision    recall  f1-score   support

           0       0.18      0.25      0.21     10644
           1       0.15      0.39      0.22      5654
           2       0.21      0.16      0.18      9600
           3       0.12      0.15      0.13      1006
           4       0.00      0.00      0.00       583
           5       0.00      0.00      0.00       999
           6       0.16      0.05      0.07      3245
           7       0.00      0.00      0.00        21
           8       0.05      0.02      0.02      7964
           9       0.00      0.00      0.00      1604
          10       0.27      0.32      0.29      5554
          11       0.10      0.05      0.07      2479
          13       0.00      0.00      0.00       115
          14       0.18      0.19      0.18      1595
          15       0.00      0.00      0.00         3
          16       0.09      0.13      0.11      4336
          17       0.05      0.01      0.02      2333

    accuracy    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 3. Results Analysis

In [None]:
def print_experiment_results(results, experiment_name):
    """
    Write a plain-text summary of one set of experiments to stdout.

    A titled header is printed first, then one section per entry in
    ``results``, each closed by a dashed rule.

    Args:
        results: Iterable of mappings. Each must provide 'model_name',
            'tuning_codes', 'test_code' and 'result'; 'dropout_type' and
            'dropout_prob' are optional and shown as 'N/A' when missing.
        experiment_name: Label placed in the header line.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    header_rule = "=" * 80
    entry_rule = "-" * 80

    print(f"\n{experiment_name} Results:")
    print(header_rule)

    for entry in results:
        # Fields are printed one at a time, in order, so that anything already
        # printed stays visible if a later lookup fails.
        print(f"\nModel: {entry['model_name']}")
        print(f"Training Data: {entry['tuning_codes']}")
        print(f"Test Data: {entry['test_code']}")
        print(f"Dropout Type: {entry.get('dropout_type', 'N/A')}")
        print(f"Dropout Probability: {entry.get('dropout_prob', 'N/A')}")
        print("\nResults:")
        print(entry['result'])
        print(entry_rule)

In [None]:
# Print results for both experiment sets.
# Depends on `xlmr_results` and `glot500_results` from the two experiment
# cells and on `print_experiment_results` from the previous cell, so those
# cells must have been run first in the same kernel session.
# NOTE(review): this cell has no recorded execution; confirm that
# `automation.batch_tune_eval` returns a list of dicts containing the keys
# that `print_experiment_results` reads.
print("\nFinal Results Summary")
print("="*80)
print_experiment_results(xlmr_results, "XLM-R BPE Dropout")
print_experiment_results(glot500_results, "GLOT500 Word Dropout")