In [1]:
from bert import BertModel
from distill_emb import DistillEmbSmall
from config import BertConfig
import torch
from transformers import AutoTokenizer, RwkvConfig, RwkvModel, AutoModel
from tokenizer import CharTokenizer
from knn_classifier import KNNTextClassifier
from data_loader import load_sentiment
from data_loader import load_news_dataset
import pandas as pd
from retrieval import build_json_pairs, top1_accuracy
import os

In [2]:
# Number of characters per token; shared by the tokenizer (max_word_length)
# and the model config (num_input_chars).
num_input_chars = 12

In [3]:
# Character-level tokenizer; its max word length is tied to `num_input_chars`
# so it matches the value later passed to the model config.
tokenizer = CharTokenizer(
    max_word_length=num_input_chars,
    charset_file_path="tokenizer/charset.json",
)

In [4]:
# Smoke test: build a BERT-style model that uses the character-level
# "distill" embedding and run one random batch through it.
config = BertConfig(
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=9,
    num_attention_heads=8,
    intermediate_size=3072,
    max_position_embeddings=512,
    type_vocab_size=2,
    pad_token_id=0,
    position_embedding_type="absolute",
    use_cache=True,
    classifier_dropout=None,
    embedding_type="distill",  # 'distilemb', 'fasttext'
    encoder_type='lstm',
    num_input_chars=num_input_chars,  # number of characters in each token
    char_vocab_size=tokenizer.char_vocab_size
)
model = BertModel(config)

# Random character ids with shape (B, S, N): batch, tokens, chars per token.
# The ids are drawn from the character vocabulary. Previously the upper bound
# was `config.num_input_chars` (the word length), so only ids 0..11 were used.
char_input = torch.randint(
    0, tokenizer.char_vocab_size, (1, 10, config.num_input_chars)
)

inputs = {
    "input_ids": char_input,
    "attention_mask": torch.tensor([[1] * char_input.size(1)]),  # attention mask for each token
    "token_type_ids": torch.tensor([[0] * char_input.size(1)]),  # token type ids for each token
}
outputs = model(**inputs)

In [5]:
# Build the stand-alone DistillEmb encoder and, if the trained checkpoint is
# present, load its weights. When the file is missing only a message is
# printed, so `distill_emb` keeps its freshly initialised weights.
distill_emb = DistillEmbSmall(config)
path = "logs/distill_emb_v0/distill_emb_v0-epoch=95-epoch_val_loss=0.06.ckpt"
if os.path.exists(path):
    # NOTE(review): torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    state_dict = torch.load(path, map_location='cpu')['state_dict']
    # Strip only a *leading* 'model.' from each key. str.replace would also
    # rewrite later occurrences (e.g. 'model.submodel.w' -> 'subw').
    prefix = 'model.'
    state_dict = {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in state_dict.items()
    }
    distill_emb.load_state_dict(state_dict)
else:
    print(f"Model checkpoint {path} not found. Please check the path.")

In [6]:
# Display the module repr to inspect the encoder's layers.
distill_emb

DistillEmbSmall(
  (embedding): Embedding(1518, 64)
  (conv1): Conv1d(12, 128, kernel_size=(5,), stride=(1,))
  (conv2): Conv1d(128, 256, kernel_size=(5,), stride=(1,))
  (conv3): Conv1d(256, 384, kernel_size=(5,), stride=(1,))
  (conv4): Conv1d(384, 512, kernel_size=(4,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (output_layer): Linear(in_features=512, out_features=512, bias=True)
  (activation): ReLU()
  (tanh): Tanh()
  (norm0): LayerNorm((12, 64), eps=1e-05, elementwise_affine=True)
  (norm1): LayerNorm((128, 30), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((256, 13), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((384, 4), eps=1e-05, elementwise_affine=True)
  (norm4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (output_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [7]:
# Tokenize a short two-word sample, without special tokens, as PyTorch tensors.
out = tokenizer(
    "hello world",
    add_special_tokens=False,
    return_tensors="pt",
)

In [8]:
# Embed the first entry of `input_ids` and show the output shape.
# The recorded result is (2, 512) — presumably one 512-d vector per word of
# 'hello world'; TODO confirm against DistillEmbSmall.
distill_emb(out['input_ids'][0]).shape

torch.Size([2, 512])

In [9]:
# Move the encoder to the GPU when one is available and switch it to eval mode.
# Falling back to CPU keeps this cell from raising on machines without CUDA.
distill_emb = distill_emb.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

In [10]:
# k-NN text classifier that uses the DistillEmb encoder and the char tokenizer.
classifier = KNNTextClassifier(
    tokenizer,
    model=distill_emb,
)

In [11]:
# Load the sentiment dataset. The recorded output reports the columns
# 'text', 'label', 'lang' and 'split'.
# NOTE(review): `classes` is presumably the label set — confirm in data_loader.
df, classes = load_sentiment()

Loaded 105862 rows from sentiment.parquet columns Index(['text', 'label', 'lang', 'split'], dtype='object')


In [12]:
# Seeded 1000-row reference sample, plus 100 evaluation rows drawn from the
# rows that were not selected for the reference sample.
train_df = df.sample(n=1000, random_state=42)
test_df = df.drop(index=train_df.index).sample(n=100, random_state=42)

In [13]:
# Run the k-NN evaluation (k=5) on the sentiment split.
# NOTE(review): model=None / tokenizer=None presumably fall back to the ones
# given to the KNNTextClassifier constructor — confirm in knn_classifier.
classifier.classifiy(
    train_df=train_df,
    test_df=test_df,
    k=5,
    batch_size=32,
    model=None,
    tokenizer=None,
)

(0.4007096171802055,
 0.41000000000000003,
 {'ma': {'f1': 0.4608843537414966, 'acc': 0.5},
  'sw': {'f1': 0.4897385620915033, 'acc': 0.5},
  'dz': {'f1': 0.33730158730158727, 'acc': 0.3333333333333333},
  'kr': {'f1': 0.6439393939393939, 'acc': 0.6363636363636364},
  'pt': {'f1': 0.2040816326530612, 'acc': 0.14285714285714285},
  'twi': {'f1': 0.1, 'acc': 0.25},
  'yo': {'f1': 0.3956043956043956, 'acc': 0.38461538461538464},
  'pcm': {'f1': 0.0, 'acc': 0.0},
  'ha': {'f1': 0.4800000000000001, 'acc': 0.4},
  'am': {'f1': 0.6, 'acc': 0.6},
  'ts': {'f1': 0.0, 'acc': 0.0},
  'ig': {'f1': 0.3333333333333333, 'acc': 0.5},
  'tg': {'f1': 0.0, 'acc': 0.0}})

In [None]:
# Baseline for comparison: load the AfroLM tokenizer and model by their
# Hugging Face Hub id via the transformers Auto* classes.
model_name = "bonadossou/afrolm_active_learning"
tok = AutoTokenizer.from_pretrained(model_name)
xmodel = AutoModel.from_pretrained(model_name)
class Wrapper(torch.nn.Module):
    """Adapter that makes a wrapped model return only its last hidden state."""

    def __init__(self, model):
        """Keep a reference to the model whose output will be unwrapped."""
        super().__init__()
        self.model = model

    def forward(self, **model_inputs):
        """Call the wrapped model with the keyword inputs and return
        the ``last_hidden_state`` attribute of its output."""
        return self.model(**model_inputs).last_hidden_state

# Evaluate the AfroLM baseline on the current train_df / test_df.
# The classifier gets its own name so the DistillEmb-backed `classifier` is not
# overwritten: later cells that call `classifier.classifiy(..., model=None)`
# would otherwise depend on whether this cell had been executed.
# The device falls back to CPU when CUDA is unavailable.
wrapper_model = Wrapper(xmodel).to('cuda' if torch.cuda.is_available() else 'cpu').eval()
afrolm_classifier = KNNTextClassifier(tokenizer=tok, model=wrapper_model)
afrolm_classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=wrapper_model, tokenizer=tok)

In [14]:
# Load the news dataset. The recorded output reports the columns 'label',
# 'headline', 'text', 'headline_text', 'url', 'lang' and 'split'.
# Note: this rebinds `classes`, which the sentiment cell also assigns.
data, classes = load_news_dataset()

Loaded 30809 rows from masakhanews.parquet columns Index(['label', 'headline', 'text', 'headline_text', 'url', 'lang', 'split'], dtype='object')


In [15]:
# Same protocol as the sentiment run: a seeded 1000-row reference sample and
# 100 evaluation rows drawn from the rows left over.
train_df = data.sample(n=1000, random_state=42)
test_df = data.drop(index=train_df.index).sample(n=100, random_state=42)

In [18]:
# Run the k-NN evaluation (k=5) on the news split with whichever classifier
# `classifier` is currently bound to.
classifier.classifiy(
    train_df=train_df,
    test_df=test_df,
    k=5,
    batch_size=32,
    model=None,
    tokenizer=None,
)

(0.3351587301587302,
 0.3700000000000001,
 {'orm': {'f1': 0.16049382716049382, 'acc': 0.2222222222222222},
  'xho': {'f1': 0.4821428571428571, 'acc': 0.5},
  'eng': {'f1': 0.24603174603174605, 'acc': 0.26666666666666666},
  'tir': {'f1': 0.3333333333333333, 'acc': 0.4},
  'som': {'f1': 0.36666666666666664, 'acc': 0.4},
  'hau': {'f1': 0.11904761904761904, 'acc': 0.14285714285714285},
  'pcm': {'f1': 0.2833333333333334, 'acc': 0.3333333333333333},
  'lug': {'f1': 0.19047619047619047, 'acc': 0.3333333333333333},
  'lin': {'f1': 0.8333333333333334, 'acc': 0.8333333333333334},
  'ibo': {'f1': 0.6666666666666666, 'acc': 0.75},
  'fra': {'f1': 0.34285714285714286, 'acc': 0.4},
  'sna': {'f1': 0.0, 'acc': 0.0},
  'run': {'f1': 0.4444444444444444, 'acc': 0.3333333333333333},
  'swa': {'f1': 0.8, 'acc': 0.8},
  'yor': {'f1': 0.16, 'acc': 0.2},
  'amh': {'f1': 1.0, 'acc': 1.0}})

In [None]:
# Select up to 200 training rows per language (the old comment said 10).
# A language with fewer than 200 rows contributes all of its rows instead of
# making sample() raise ValueError. The groups are concatenated explicitly
# rather than through groupby().apply(), which emits a deprecation warning
# when the applied function operates on the grouping column; the row order
# (languages sorted, fresh 0..n-1 index) is the same as before.
train_df = pd.concat(
    [
        lang_rows.sample(min(len(lang_rows), 200), random_state=42)
        for _, lang_rows in data[data['split'] == 'train'].groupby('lang')
    ],
    ignore_index=True,
)

In [None]:
# langs = ['amh', 'hau', 'ibo', 'lug', 'pcm','yor']
# train_df = train_df[train_df['lang'].isin(langs)].reset_index(drop=True)

In [None]:
# Peek at one random headline. The sample is unseeded, so the value shown
# changes from run to run.
train_df['headline'].sample(1).values[0]

In [None]:
# result = build_json_pairs(train_df, model_name="Davlan/afro-xlmr-large",
#                  n_samples=200, m_candidates=100, k_top=9, text_col="text", headline_col="headline")
# # save to json file
# import json
# with open('news_result.json', 'w', encoding='utf-8') as f:
#     json.dump(result, f, indent=4, ensure_ascii=False)  

In [None]:
# result = build_json_pairs(train_df, model_name="Davlan/afro-xlmr-large",
#                  n_samples=200, m_candidates=100, k_top=9, text_col="headline", headline_col="text")
# # save to json file
# import json
# with open('headline_result.json', 'w', encoding='utf-8') as f:
#     json.dump(result, f, indent=4, ensure_ascii=False)  

In [None]:
# Score top-1 retrieval accuracy on the pre-built pairs file.
# The file is produced by the (commented-out) build_json_pairs cell, so fail
# with a clear message when it is missing. The frame gets its own name so the
# sentiment `df` loaded earlier is not overwritten.
news_pairs_path = 'news_result.json'
if not os.path.exists(news_pairs_path):
    raise FileNotFoundError(
        f"{news_pairs_path} not found; generate it with build_json_pairs first."
    )
news_pairs_df = pd.read_json(news_pairs_path)
d = news_pairs_df.to_dict(orient='records')  # one dict per row
top1_accuracy(d, batch_size=32, model=xmodel, tokenizer=tok)
# top1_accuracy(d, batch_size=32, model=distill_emb, tokenizer=tokenizer)

In [None]:
from fasttext_model import FastTextModel

# Build the FastText baseline from the AfriBERTa `.vec` file.
fasttext_model = FastTextModel(
    file_path="embeddings/afriberta/afriberta.vec",
)
# fasttext_model.embedding.weight.requires_grad = False  # freeze the weights