In [1]:
from modeling_distillemb import BertModel, BertForSequenceClassification, BertForEmbeddingLM
from distill_emb import DistillEmbSmall, DistillEmb
from config import DistillModelConfig, DistillEmbConfig
import torch
from transformers import AutoTokenizer, RwkvConfig, RwkvModel, AutoModel
from tokenizer import CharTokenizer
from knn_classifier import KNNTextClassifier
from data_loader import load_sentiment, load_ner_dataset, load_pos_dataset
from data_loader import load_news_dataset
import pandas as pd
from retrieval import build_json_pairs, top1_accuracy
import os
from transformers import GPT2LMHeadModel

In [2]:
# Load the sentiment dataset. The loader returns the DataFrame plus a second
# value bound to `classes` (presumably the label set — confirm in data_loader).
df, classes = load_sentiment()

Loaded 105862 rows from sentiment.parquet columns Index(['text', 'label', 'lang', 'split'], dtype='object')


In [3]:
# Preview the sentiment frame (columns: text, label, lang, split).
df

Unnamed: 0,text,label,lang,split
1,Tesfaye ·àà·ä´·àµ ·å≠·â•·àç ·àà·â•·à∞·àΩ ·ã®·çï·àÆ·çå·à∞·à≠·äï ·çé·â∂ ·àà·å•·çà·ä≠ ·ä•·àç·àù ·ã´·àç·ä≠ ·â£...,negative,am,train
2,·ã≠·àÑ·ãç ·äê·ãç ·ä†·ã≠·ã∞·àç ·ã®·ä•·ãç·âÄ·âµ·àΩ ·å•·åç....·â†·à∞·àö ·à∞·àö ·ä®·àù·âµ·äì·åà·à™ ·àà·àù·äï ·â≥·à™·ä≠...,negative,am,train
3,·ãò·åà·â† ·ã≠·â£·àã·àç? ·àå·àã ·ã®·àö·â£·àç ·äê·åà·à≠ ·ä´·àà ·ä†·äï·â∞·ãâ ·äï·åà·à®·äï!,negative,am,train
4,?? ·ãµ·àÆ ·â†·ãò·àò·äê ·äÆ·ã≥·ä≠ ·çé·â∂ ·â§·âµ ·çç·àã·àπ ·çè ·à≤·àç ·ä†·ã≠·äì·âΩ·äï ·â∞·å®·çç·äñ ·ä•·äï·ã≥·ã≠·ãà...,negative,am,train
5,·ã†·àç·å•?? ???? ·åà·åà·àõ,negative,am,train
...,...,...,...,...
105857,@user Taakkee Jabaadhu!!! olola gadi galoo hin...,positive,or,test
105858,@user Waraana Bilisummaa Oromiyaa. Unity of Or...,neutral,or,test
105859,#Jawwaar dhugumatti hogganaa walitti-hidhaa ga...,negative,or,test
105860,Yooyyaa Yooyyaa akkam jirtan sabni Oromo hundi...,negative,or,test


In [4]:
# Load the POS-tagging dataset.
# NOTE: this rebinds `df` and `classes`, replacing the sentiment data loaded
# earlier; the sentiment set is loaded again further down before it is used.
df, classes = load_pos_dataset()

Loaded 30494 rows from masakhapos.parquet columns Index(['id', 'tokens', 'labels', 'split'], dtype='object')


In [5]:
# Preview the POS frame (columns: id, tokens, labels, split).
df

Unnamed: 0,id,tokens,labels,split
0,0,"[Muso, ≈ãana, ,, Afiriki, tilebinyanfan, n', a,...","[0, 6, 1, 0, 0, 9, 11, 0, 14, 9, 0, 14, 17, 11...",train
1,1,"[Ni, m…îg…î, ka, d…îg…în, kojugu, ,, i, b…õ, m…îg…îw,...","[9, 0, 7, 6, 0, 1, 11, 17, 0, 0, 16, 0, 7, 11,...",train
2,2,"[A, k…îr…îtalen, a, ka, ta…≤…õ, f…õ, kow, …≤…õm…îg…îyab...","[11, 16, 11, 7, 0, 7, 0, 0, 7, 9, 0, 17, 11, 7...",train
3,3,"[Ale, y', a, (, basik…õti, ), to, a, ka, se, ka...","[11, 7, 11, 1, 0, 1, 16, 11, 7, 16, 7, 16, 11,...",train
4,4,"[Sannay…õl…õn, galabuk…õn…õya, a, sera, ka, min, f...","[0, 0, 11, 16, 7, 11, 16, 10, 0, 7, 9, 0, 1, 0...",train
...,...,...,...,...
30489,145,"[Uthe, leli, cala, lisaphenywa, .]","[16, 8, 0, 16, 1]",dev
30490,146,"[Uthe, kusolakala, ukuthi, kube, nezigilamkhub...","[16, 16, 5, 16, 0, 16, 0, 12, 10, 1]",dev
30491,147,"["""""""", Kusolakala, ukuthi, umndeni, waseMange,...","[1, 16, 5, 0, 10, 1, 10, 16, 0, 16, 1]",dev
30492,148,"[Babathumbile, beba, nezimoto, .]","[16, 16, 0, 1]",dev


In [6]:
# Number of characters kept per word; shared by the tokenizer and both model configs.
num_input_chars = 12

In [7]:
# Character-level tokenizer: each word is encoded as `num_input_chars` character ids.
tokenizer = CharTokenizer(
    charset_file_path='tokenizer/charset.json',
    max_word_length=num_input_chars,
)

In [8]:
# Configuration for a BERT-style encoder that consumes character ids per token.
# NOTE(review): the nested config is passed as `distil_config` (one "l") while a
# later cell reads `config.distill_config` — confirm DistillModelConfig maps the
# keyword onto that attribute.
config = DistillModelConfig(
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=9,
    num_attention_heads=8,
    intermediate_size=3072,
    max_position_embeddings=512,
    type_vocab_size=2,
    pad_token_id=0,
    position_embedding_type="absolute",
    use_cache=True,
    classifier_dropout=None,
    embedding_type="distill",  # 'distilemb', 'fasttext'
    encoder_type='bert', #'lstm'
    num_input_chars=num_input_chars,  # number of characters in each token
    char_vocab_size=tokenizer.char_vocab_size,
    distil_config=DistillEmbConfig(
        num_input_chars=tokenizer.max_word_length,  # number of characters in each token
        char_vocab_size=tokenizer.char_vocab_size,
        size="small",
        distill_dropout=0.1,
        embedding_size=512,  # size of the embedding vector for each character
    )
)
model = BertForSequenceClassification(config)
# NOTE(review): absolute, machine-specific checkpoint path — the notebook only
# runs where this file exists. It also names a different checkpoint (epoch=136)
# than the relative path used for the stand-alone DistillEmb later on (epoch=95).
model.bert.load_word_embeddings('/home/leo/project/distil-research/distilemb/logs/distill_emb_v0/distill_emb_v0-epoch=136-epoch_val_loss=0.27.ckpt')

# Smoke test with random character ids of shape (B, S, N).
# The ids are drawn from the full character vocabulary; the upper bound used to
# be the word length (`num_input_chars`), which only exercised the first few ids.
char_input = torch.randint(0, tokenizer.char_vocab_size, (1, 10, config.num_input_chars))
print("char_input shape:", char_input.shape)
inputs = {
    "input_ids": char_input,
    "attention_mask":torch.tensor([[1] * char_input.size(1)]),  # attention mask for each token
    "token_type_ids": torch.tensor([[0] * char_input.size(1)]),  # token type ids for each token
}
outputs = model(**inputs)

char_input shape: torch.Size([1, 10, 12])


In [9]:
# Shape of the first model output for the dummy batch (presumably the logits: (batch, num_labels)).
outputs[0].shape

torch.Size([1, 2])

In [None]:
# Stand-alone DistillEmb built from the nested embedding config.
distill_emb = DistillEmb(config.distill_config)
path = "logs/distill_emb_v0/distill_emb_v0-epoch=95-epoch_val_loss=0.06.ckpt"
if os.path.exists(path):
    # torch.load unpickles the file: only load checkpoints from a trusted source.
    state_dict = torch.load(path, map_location='cpu')['state_dict']
    # Strip only a *leading* 'model.' prefix from the keys. A blanket
    # str.replace would also rewrite keys that contain 'model.' further inside
    # (e.g. 'encoder.model.weight') and make them unloadable.
    state_dict = {
        (k[len('model.'):] if k.startswith('model.') else k): v
        for k, v in state_dict.items()
    }
    distill_emb.load_state_dict(state_dict)
else:
    # No checkpoint is loaded in this branch: `distill_emb` keeps whatever
    # weights its constructor produced, so downstream numbers are not from a
    # trained model.
    print(f"Model checkpoint {path} not found. Please check the path.")

Model checkpoint logs/distill_emb_v0/distill_emb_v0-epoch=95-epoch_val_loss=0.06.ckpt not found. Please check the path.


In [11]:
# Display the module tree of the stand-alone DistillEmb.
distill_emb

DistillEmb(
  (encoder): DistillEmbSmall(
    (embedding): Embedding(1518, 64)
    (conv1): Conv1d(12, 128, kernel_size=(5,), stride=(1,))
    (conv2): Conv1d(128, 256, kernel_size=(5,), stride=(1,))
    (conv3): Conv1d(256, 384, kernel_size=(5,), stride=(1,))
    (conv4): Conv1d(384, 512, kernel_size=(4,), stride=(1,))
    (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (output_layer): Linear(in_features=512, out_features=512, bias=True)
    (activation): ReLU()
    (tanh): Tanh()
    (norm0): LayerNorm((12, 64), eps=1e-05, elementwise_affine=True)
    (norm1): LayerNorm((128, 30), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((256, 13), eps=1e-05, elementwise_affine=True)
    (norm3): LayerNorm((384, 4), eps=1e-05, elementwise_affine=True)
    (norm4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (output_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [12]:
# Small batch of English sentences of differing lengths, used below to exercise
# the tokenizer (padding) and the embedding models.
texts = [
    "Paris is the capital of France.",
    "The Eiffel Tower is in Paris.",
    "The Louvre Museum is in Paris.",
    "The Seine River flows through Paris.",
    "Paris is known for its art, fashion, and culture."
]

In [13]:
# Tokenize the toy batch into padded PyTorch tensors (with an attention mask).
out = tokenizer(
    texts,
    add_special_tokens=True,
    return_tensors='pt',
    padding='longest',
    truncation=True,
    max_length=32,
    return_attention_mask=True,
)

In [14]:
# input_ids carries one extra trailing axis (character ids per token) compared with the attention mask.
out['input_ids'].shape, out['attention_mask'].shape

(torch.Size([5, 12, 12]), torch.Size([5, 12]))

In [15]:
# Embedding-LM variant built from the same config (no checkpoint is loaded in this cell).
emb_lm_model = BertForEmbeddingLM(config)

BertForEmbeddingLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [16]:
# Target embeddings for the batch come from the stand-alone DistillEmb.
# They are computed in eval mode and without autograd: the module contains
# Dropout, so train-mode targets would be randomly perturbed on every call, and
# the targets do not need a graph. The previous train/eval mode is restored.
was_training = distill_emb.training
distill_emb.eval()
try:
    with torch.no_grad():
        labels = distill_emb(out['input_ids'])
finally:
    distill_emb.train(was_training)
out['labels'] = labels

output = emb_lm_model(**out)

In [17]:
# One 512-dimensional embedding per token position of the batch.
output.embeddings.shape

torch.Size([5, 12, 512])

In [31]:
# Encode the word that stands for the [PAD] special token on its own, without
# CLS/SEP markers and without an attention mask.
# NOTE: rebinds `out`, which previously held the tokenized toy batch.
out = tokenizer.encode(
    tokenizer.special_token2word['[PAD]'],
    add_cls=False,
    add_sep=False,
    return_attention_mask=False,
)

In [35]:
# Shape check of the encoded PAD word once converted to a tensor.
torch.tensor(out['input_ids']).shape

torch.Size([1, 1, 12])

In [None]:
# Switch to inference mode on the GPU when one is available; fall back to CPU
# instead of crashing on machines without CUDA.
# NOTE(review): confirm KNNTextClassifier does not itself assume a CUDA device.
distill_emb = distill_emb.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

In [None]:
# KNN text classifier backed by the character tokenizer and the DistillEmb model.
classifier = KNNTextClassifier(tokenizer, model=distill_emb)

In [None]:
# Reload the sentiment data: `df` and `classes` were rebound to the POS dataset earlier.
df, classes = load_sentiment()

In [None]:
# Fixed-seed subsample: 1000 rows for the KNN index, then 100 evaluation rows
# taken from the rows that were not picked for training.
# NOTE(review): rows are drawn from the whole frame; the `split` column is not used.
train_df = df.sample(n=1000, random_state=42)
test_df = df[~df.index.isin(train_df.index)].sample(n=100, random_state=42)

In [None]:
# Run the KNN evaluation (k=5) on the sampled sentiment rows.
# `model=None, tokenizer=None` presumably falls back to the ones given at
# construction — confirm in knn_classifier.
# NOTE: `classifiy` (sic) is the method name as called here; fixing the spelling
# requires changing KNNTextClassifier as well.
classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=None, tokenizer=None)

In [None]:
# Baseline encoder: load the tokenizer and model for this Hugging Face model id
# through the generic Auto* classes.
model_name = "bonadossou/afrolm_active_learning"
tok = AutoTokenizer.from_pretrained(model_name)
xmodel = AutoModel.from_pretrained(model_name)
class Wrapper(torch.nn.Module):
    """Adapter that exposes a wrapped model's `last_hidden_state` as its output.

    The wrapped model is called with keyword arguments only and must return an
    object that has a `last_hidden_state` attribute.
    """

    def __init__(self, model):
        """Store the model to wrap; it is registered as the `model` submodule."""
        super().__init__()
        self.model = model

    def forward(self, **kwargs):
        """Forward all keyword arguments to the wrapped model and return its
        `last_hidden_state` unchanged."""
        return self.model(**kwargs).last_hidden_state

# Wrap the baseline so it returns per-token hidden states, put it in inference
# mode, and use the GPU when available (CPU otherwise, instead of crashing on
# machines without CUDA).
# NOTE(review): confirm KNNTextClassifier does not itself assume a CUDA device.
wrapper_model = Wrapper(xmodel).to('cuda' if torch.cuda.is_available() else 'cpu').eval()
# NOTE: rebinds `classifier`; later cells that call it use this baseline model.
classifier = KNNTextClassifier(tokenizer=tok, model=wrapper_model)
classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=wrapper_model, tokenizer=tok)

In [None]:
# Load the news dataset; `classes` is rebound to this dataset's second return value.
data, classes = load_news_dataset()

In [None]:
# Same fixed-seed protocol as for sentiment: 1000 index rows, then 100
# evaluation rows taken from the rows that were not picked for training.
train_df = data.sample(n=1000, random_state=42)
test_df = data[~data.index.isin(train_df.index)].sample(n=100, random_state=42)

In [None]:
# KNN evaluation on the news sample. `classifier` here is whichever instance was
# bound last — in notebook order, the one built around the AfroLM wrapper.
classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=None, tokenizer=None)

In [None]:
# Select up to 200 training rows per language with a fixed seed. A language
# with fewer than 200 training rows contributes all of its rows rather than
# making `sample` raise.
# NOTE: rebinds `train_df`, replacing the 1000-row news sample from above.
train_df = (
    data[data['split'] == 'train']
    .groupby('lang')
    .apply(lambda x: x.sample(min(len(x), 200), random_state=42))
    .reset_index(drop=True)
)

In [None]:
# langs = ['amh', 'hau', 'ibo', 'lug', 'pcm','yor']
# train_df = train_df[train_df['lang'].isin(langs)].reset_index(drop=True)

In [None]:
# Eyeball one random headline (no seed, so the row differs between runs).
train_df['headline'].sample(1).values[0]

In [None]:
# result = build_json_pairs(train_df, model_name="Davlan/afro-xlmr-large",
#                  n_samples=200, m_candidates=100, k_top=9, text_col="text", headline_col="headline")
# # save to json file
# import json
# with open('news_result.json', 'w', encoding='utf-8') as f:
#     json.dump(result, f, indent=4, ensure_ascii=False)  

In [None]:
# result = build_json_pairs(train_df, model_name="Davlan/afro-xlmr-large",
#                  n_samples=200, m_candidates=100, k_top=9, text_col="headline", headline_col="text")
# # save to json file
# import json
# with open('headline_result.json', 'w', encoding='utf-8') as f:
#     json.dump(result, f, indent=4, ensure_ascii=False)  

In [None]:
# Top-1 retrieval accuracy on pre-built pairs.
# NOTE: 'news_result.json' is written only by the commented-out build_json_pairs
# cell above, so the file must already exist on disk for this cell to run.
# NOTE: rebinds `df` (previously the sentiment frame).
df = pd.read_json('news_result.json')
d = df.to_dict(orient='records')
# NOTE(review): passes the raw `xmodel`, not `wrapper_model` — confirm that
# top1_accuracy expects the unwrapped Hugging Face model.
top1_accuracy(d, batch_size=32, model=xmodel, tokenizer=tok)
# top1_accuracy(d, batch_size=32, model=distill_emb, tokenizer=tokenizer)

In [None]:
# NOTE(review): late import — consider moving it to the import cell at the top
# so a fresh "Run All" surfaces a missing dependency immediately.
from fasttext_model import FastTextModel
# Build the FastText model from the AfriBERTa .vec file (path is relative to the working directory).
fasttext_model = FastTextModel(file_path='embeddings/afriberta/afriberta.vec')
# fasttext_model.embedding.weight.requires_grad = False  # freeze the weights