In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

### Importing the model and tokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/serengeti-E250", token="XXX")

In [4]:
model = AutoModelForMaskedLM.from_pretrained("UBC-NLP/serengeti-E250", token="XXX")

Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at UBC-NLP/serengeti-E250 and are newly initialized: ['generator_lm_head.bias', 'generator_predictions.LayerNorm.bias', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.bias', 'generator_predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

ElectraForMaskedLM(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(250000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

### Setting up the pipeline

In [6]:
from transformers import pipeline
classifier = pipeline("fill-mask", model=model, tokenizer=tokenizer)
classifier("ẹ jọwọ , ẹ [MASK] mi")

[{'score': 0.00017801884678192437,
  'token': 202786,
  'token_str': 'wafana',
  'sequence': 'e jowo, e wafana mi'},
 {'score': 0.0001490649301558733,
  'token': 55487,
  'token_str': 'alimwambia',
  'sequence': 'e jowo, e alimwambia mi'},
 {'score': 0.00013251979544293135,
  'token': 229622,
  'token_str': 'geskrik',
  'sequence': 'e jowo, e geskrik mi'},
 {'score': 0.00012925111514050514,
  'token': 189535,
  'token_str': 'omele',
  'sequence': 'e jowo, e omele mi'},
 {'score': 0.00012695520126726478,
  'token': 76633,
  'token_str': 'papara',
  'sequence': 'e jowo, e papara mi'}]

### Read dataset

In [7]:
import pandas as pd

train_data = pd.read_csv("kr_train.tsv", sep="\t")

train_data

Unnamed: 0,ID,tweet,label
0,kr_train_00001,@user @user @user @user @user @user @user Hhhh...,negative
1,kr_train_00002,"@user Amahano?! Ni impanuka, inkangu, inzara.....",negative
2,kr_train_00003,Ese umuntu aguhaye miliyoni 7 zidorali ngo ary...,negative
3,kr_train_00004,Ugira amagambo😏 kandi Ubwo wasanga nawe byagut...,negative
4,kr_train_00005,Ukuntu inama zose zikomeye zirikubera Mu Rwand...,negative
...,...,...,...
3297,kr_train_03298,"Tugukunda kurusha mukobwa mwiza! Amahoro, ibyi...",positive
3298,kr_train_03299,*Sobanukirwa IBYIZA MASSAGE IFITEYE UMUBIRI* 👉...,positive
3299,kr_train_03300,Mushobora kugira uruhare muri iki kiganiro mut...,positive
3300,kr_train_03301,"2/2 Ntuduhane mu bitwoshya, Ahubwo udukize Umu...",positive


In [8]:
test_data = pd.read_csv("kr_test.tsv", sep="\t")
test_data

Unnamed: 0,ID,tweet,label
0,kr_test_00001,@user @user @user Kubeshya ntabwo Ari icyaha u...,neutral
1,kr_test_00002,@user Itegeko ry'umuryango rivuga ko n'umugore...,neutral
2,kr_test_00003,Abafollowers ba byimbisha intugu ba @user @use...,negative
3,kr_test_00004,@user @user Aha niho bita ku kirenge ra?,neutral
4,kr_test_00005,Hari abantu bahisemo brain kuyikoresha nk’amas...,negative
...,...,...,...
1021,kr_test_01028,@user Kuki wumva ko ibicuruzwa byo Mugihugu cy...,neutral
1022,kr_test_01030,@user @user @user @user @user @user @user Bina...,neutral
1023,kr_test_01032,"Amahoro mwese bantu ba hano,Uyu munsi Imana ib...",positive
1024,kr_test_01033,@user Gus buriya ninkikibazo jyanibaza burigih...,neutral


### Cleaning the dataset


In [9]:
import pandas as pd
import re

In [10]:
def clean_text(text):

    text = re.sub(r'http\S+|www.\S+|@\w+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.strip()
    
    return text

In [11]:
train_data['tweet'] = train_data['tweet'].apply(clean_text)

In [12]:
train_data['tweet']

0          Hhhhhh ntabyihogoza ubu x abo yishe bangana ik
1       Amahano Ni impanuka inkangu inzara Muyite izin...
2       Ese umuntu aguhaye miliyoni 7 zidorali ngo ary...
3       Ugira amagambo kandi Ubwo wasanga nawe byaguta...
4       Ukuntu inama zose zikomeye zirikubera Mu Rwand...
                              ...                        
3297    Tugukunda kurusha mukobwa mwiza Amahoro ibyish...
3298    Sobanukirwa IBYIZA MASSAGE IFITEYE UMUBIRI   h...
3299    Mushobora kugira uruhare muri iki kiganiro mut...
3300    22 Ntuduhane mu bitwoshya Ahubwo udukize Umubi...
3301    Ni umuyobozi wintangarugero aho ageze hose Ni ...
Name: tweet, Length: 3302, dtype: object

### Selecting sentences for augmentatiom

In [13]:
import random

num_sentences = int(len(train_data)*0.2)

random_indexes = random.sample(range(len(train_data)), num_sentences)

random_indexes

[1086,
 2055,
 447,
 739,
 2016,
 3121,
 2885,
 1407,
 130,
 2535,
 1732,
 840,
 3019,
 267,
 751,
 1865,
 2357,
 1054,
 529,
 125,
 77,
 1765,
 2845,
 1903,
 166,
 2860,
 296,
 870,
 1343,
 1409,
 1586,
 317,
 2172,
 1053,
 299,
 2292,
 895,
 731,
 1375,
 2011,
 421,
 1026,
 1601,
 1762,
 920,
 3298,
 1374,
 251,
 1307,
 2391,
 668,
 593,
 1241,
 175,
 947,
 3169,
 363,
 1172,
 3002,
 2352,
 380,
 3081,
 3127,
 2626,
 261,
 1921,
 2017,
 2798,
 1776,
 2662,
 85,
 1560,
 902,
 1291,
 2505,
 630,
 1720,
 1822,
 1481,
 2446,
 193,
 1029,
 3036,
 1737,
 2300,
 1167,
 2439,
 1529,
 3077,
 1469,
 927,
 2297,
 403,
 3040,
 1340,
 2339,
 1051,
 2346,
 2131,
 17,
 3105,
 475,
 370,
 2340,
 2125,
 905,
 114,
 2510,
 752,
 2990,
 2628,
 2184,
 1494,
 2645,
 1753,
 1579,
 1916,
 622,
 443,
 2290,
 3048,
 489,
 3216,
 2723,
 1998,
 2364,
 1666,
 468,
 864,
 1330,
 650,
 712,
 2877,
 2408,
 2507,
 1668,
 1346,
 2087,
 2328,
 1280,
 1035,
 1226,
 1121,
 944,
 786,
 169,
 1028,
 461,
 3197,
 2466,
 8

In [14]:
random_sentences = train_data.loc[random_indexes, "tweet"].tolist()
random_labels = train_data.loc[random_indexes, "label"].tolist()

print(random_sentences[0], random_labels[0]) # read random sentences and corresponding labels

Sosiyete yubucuruzi ikandika icyapa kinini cyane iti Guhamagara nubuntu  kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza negative


In [15]:
# iterative mask filling

sentence = random_sentences[0]

words = sentence.split()
length = len(words)

new_sentence = sentence

for i in range(0, length):
    words_new = new_sentence.split()
    words_new[i] = "[MASK]"
    masked_sentence = " ".join(words_new)
    print("masked : ", masked_sentence)
    predictions = classifier(masked_sentence)
    predicted_token = predictions[0]['token_str']
    new_sentence = masked_sentence.replace("[MASK]", predicted_token, 1)
    print("predicted : ", new_sentence)

masked :  [MASK] yubucuruzi ikandika icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
predicted :  gewag yubucuruzi ikandika icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
masked :  gewag [MASK] ikandika icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
predicted :  gewag fighter ikandika icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
masked :  gewag fighter [MASK] icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
predicted :  gewag fighter wafana icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
masked :  gewag fighter wafana [MASK] kinini cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza
predicted :  gewag fighter wafana fighter kinini cyane iti G

In [16]:
# split sentences into words
# mask one word
# pass the sentence to the model
# get the prediction
# replace sentence with predicted sentence
# repeat
# return new sentence


def iterative_mask_fill(sentence):
    try:
        words = sentence.split()
        length = len(words)

        new_sentence = sentence

        for i in range(0, length):

            words_new = new_sentence.split()
            words_new[i] = "[MASK]" # mask ith word

            masked_sentence = " ".join(words_new)

            prediction = classifier(masked_sentence)
            prediction = prediction[0]["token_str"]

            new_sentence = masked_sentence.replace("[MASK]", prediction, 1)
            

        return new_sentence
    
    except:
        print(f"error at iteration {i}")
        return random_mask_fill(sentence)

# select random word in sentence
# replace with mask
# pass to model
# get prediction
# replace sentence with prediction
# return new sentence

def random_mask_fill(sentence):

    words = sentence.split()
    length = len(words)

    new_sentence = [] # empty new sentence

    index = random.randint(0, length-1)

    words_new = sentence.split()

    words_new[index] = "[MASK]" # mask random word

    masked_sentence = " ".join(words_new)

    prediction = classifier(masked_sentence)

    prediction = prediction[0]['token_str']

    new_sentence = masked_sentence.replace("[MASK]", prediction)

    return new_sentence

In [17]:
print(random_sentences[0])

Sosiyete yubucuruzi ikandika icyapa kinini cyane iti Guhamagara nubuntu  kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza


In [18]:
print(iterative_mask_fill(random_sentences[0]))
print(random_mask_fill(random_sentences[0]))

gewag fighter wafana fighter fighter jiif limu wafana lafi fighter ulipangwa wafana gewag gewag broadcast limu cop
Sosiyete yubucuruzi ikandika icyapa gewag cyane iti Guhamagara nubuntu kandi bashaka kuvuga ngo Guhamagara ni ubuntu Birababaza


In [19]:
try:
    new_sentences_iter = [iterative_mask_fill(sentence) for sentence in random_sentences]
    new_sentences_rand = [random_mask_fill(sentence) for sentence in random_sentences]
except Exception:
    print("error")

In [20]:
print(new_sentences_iter[0])
print(new_sentences_rand[0])

gewag fighter wafana fighter fighter jiif limu wafana lafi fighter ulipangwa wafana gewag gewag broadcast limu cop
Sosiyete yubucuruzi ikandika icyapa kinini cyane iti Guhamagara nubuntu kandi bashaka gewag ngo Guhamagara ni ubuntu Birababaza


In [21]:
import pandas as pd

iter = pd.DataFrame({'ID': ['kr_train_0' + str(i+3303) for i in range(len(new_sentences_iter))],
                       'tweet': new_sentences_iter,
                       'label': random_labels})
rand = pd.DataFrame({'ID': ['kr_train_0' + str(i+3303) for i in range(len(new_sentences_rand))],
                       'tweet': new_sentences_rand,
                       'label': random_labels})

iter = iter[['ID', 'tweet', 'label']]
rand = rand[['ID', 'tweet', 'label']]

print(iter)
print(rand)

                 ID                                              tweet  \
0    kr_train_03303  gewag fighter wafana fighter fighter jiif limu...   
1    kr_train_03304         ulipangwa ommy ommy tumepanga kunyunyi jak   
2    kr_train_03305              lafi lafi wafana ommy wafana kumlisha   
3    kr_train_03306  ulipangwa jiif walid ommy limu wafana lafi laf...   
4    kr_train_03307  limu limu limu limu limu limu limu limu limu l...   
..              ...                                                ...   
655  kr_train_03958            ulipangwa ommy fighter fighter ommy cop   
656  kr_train_03959  ulipangwa ommy limu fighter wafana gewag lafi ...   
657  kr_train_03960  behavior wafana wafana wafana wafana wafana wa...   
658  kr_train_03961  ##nin grana viewers limu broadcast limu ulipan...   
659  kr_train_03962                ##washu wafana broadcast panel lafi   

        label  
0    negative  
1     neutral  
2    negative  
3    negative  
4     neutral  
..        ...  

### Add the augmented sentences to original dataset

In [22]:
import pandas as pd

augmented_iter = pd.concat([train_data, iter], axis=0)
augmented_rand = pd.concat([train_data, rand], axis=0)

In [23]:
augmented_iter.to_csv("kr_train_iter.tsv", sep="\t", index=False)
augmented_rand.to_csv("kr_train_rand.tsv", sep="\t", index=False)

### Reference

[SERENGETI: Massively Multilingual Language Models for Africa](https://arxiv.org/pdf/2212.10785)

[XLM-E: Cross-lingual Language Model Pre-training via ELECTRA](https://aclanthology.org/2022.acl-long.427.pdf)

[Unsupervised Cross-lingual Representation Learning at Scale](https://aclanthology.org/2020.acl-main.747.pdf)

[Iterative Mask Filling: An Effective Text Augmentation Method Using Masked Language Modeling](https://arxiv.org/abs/2401.01830)
