In [1]:
from bert import BertModel
from distill_emb import DistillEmbSmall
from config import BertConfig
import torch
from transformers import AutoTokenizer, RwkvConfig, RwkvModel, AutoModel
from tokenizer import CharTokenizer
from knn_classifier import KNNTextClassifier
from data_loader import load_sentiment
from data_loader import load_news_dataset
import pandas as pd
from retrieval import build_json_pairs, top1_accuracy

In [2]:
num_input_chars=12

In [3]:
tokenizer = CharTokenizer(charset_file_path='tokenizer/charset.json',
                          max_word_length=num_input_chars)

In [4]:
config = BertConfig(
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=9,
    num_attention_heads=8,
    intermediate_size=3072,
    max_position_embeddings=512,
    type_vocab_size=2,
    pad_token_id=0,
    position_embedding_type="absolute",
    use_cache=True,
    classifier_dropout=None,
    embedding_type="distill",  # 'distilemb', 'fasttext'
    encoder_type='lstm',
    num_input_chars=num_input_chars,  # number of characters in each token
    char_vocab_size=tokenizer.char_vocab_size
)
model = BertModel(config)
# input ids with (B, S, N)
char_input = torch.randint(0, config.num_input_chars, (1, 10, config.num_input_chars))
# input ids with (B, S, N)

inputs = {
    "input_ids": char_input,
    "attention_mask":torch.tensor([[1] * char_input.size(1)]),  # attention mask for each token
    "token_type_ids": torch.tensor([[0] * char_input.size(1)]),  # token type ids for each token
}
outputs = model(**inputs)

In [5]:
distill_emb = DistillEmbSmall(config)
# distilemb/logs/distill_emb_v0/distill_emb_v0-epoch=18-val_loss=0.00.ckpt
# state_dict = torch.load('logs/distill_emb_v0/distill_emb_v0-epoch=18-val_loss=0.00.ckpt')['state_dict']
# # remove 'model.' prefix from state_dict keys
# state_dict = {k.replace('model.', ''): v for k, v in state_dict.items()}
# distill_emb.load_state_dict(state_dict)

In [6]:
distill_emb

DistillEmbSmall(
  (embedding): Embedding(1518, 64)
  (conv1): Conv1d(12, 128, kernel_size=(5,), stride=(1,))
  (conv2): Conv1d(128, 256, kernel_size=(5,), stride=(1,))
  (conv3): Conv1d(256, 384, kernel_size=(5,), stride=(1,))
  (conv4): Conv1d(384, 512, kernel_size=(4,), stride=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (output_layer): Linear(in_features=512, out_features=512, bias=True)
  (activation): ReLU()
  (tanh): Tanh()
  (norm0): LayerNorm((12, 64), eps=1e-05, elementwise_affine=True)
  (norm1): LayerNorm((128, 30), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((256, 13), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((384, 4), eps=1e-05, elementwise_affine=True)
  (norm4): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (output_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [7]:
out = tokenizer('hello world', add_special_tokens=False, return_tensors='pt')

In [8]:
distill_emb(out['input_ids'][0]).shape

torch.Size([2, 512])

In [9]:
distill_emb = distill_emb.to('cuda').eval()

In [10]:
classifier = KNNTextClassifier(tokenizer, model=distill_emb)

In [11]:
df, classes = load_sentiment()

Loaded 105862 rows from sentiment.parquet columns Index(['text', 'label', 'lang', 'split'], dtype='object')


In [12]:
train_df = df.sample(1000, random_state=42)
test_df = df.drop(train_df.index).sample(100, random_state=42)

In [13]:
classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=None, tokenizer=None)

(0.3740010340536656,
 0.3700000000000001,
 {'ma': {'f1': 0.384786641929499, 'acc': 0.35714285714285715},
  'sw': {'f1': 0.21052631578947367, 'acc': 0.2},
  'dz': {'f1': 0.68994708994709, 'acc': 0.6666666666666666},
  'kr': {'f1': 0.3181818181818182, 'acc': 0.2727272727272727},
  'pt': {'f1': 0.4653061224489795, 'acc': 0.42857142857142855},
  'twi': {'f1': 0.1, 'acc': 0.25},
  'yo': {'f1': 0.3976331360946746, 'acc': 0.38461538461538464},
  'pcm': {'f1': 0.7333333333333334, 'acc': 0.75},
  'ha': {'f1': 0.4, 'acc': 0.4},
  'am': {'f1': 0.6, 'acc': 0.6},
  'ts': {'f1': 0.0, 'acc': 0.0},
  'ig': {'f1': 0.3333333333333333, 'acc': 0.5},
  'tg': {'f1': 0.0, 'acc': 0.0}})

In [14]:
model_name = "bonadossou/afrolm_active_learning"
tok = AutoTokenizer.from_pretrained(model_name)
xmodel = AutoModel.from_pretrained(model_name)
class Wrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, **kwargs):
        embs = self.model(**kwargs).last_hidden_state
        return embs

wrapper_model = Wrapper(xmodel).to('cuda').eval()
classifier = KNNTextClassifier(tokenizer=tok, model=wrapper_model)
classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=wrapper_model, tokenizer=tok)

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/6.01M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at bonadossou/afrolm_active_learning and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(0.4402293233082707,
 0.45000000000000007,
 {'ma': {'f1': 0.293040293040293, 'acc': 0.2857142857142857},
  'sw': {'f1': 0.5898279352226721, 'acc': 0.6},
  'dz': {'f1': 0.23703703703703705, 'acc': 0.3333333333333333},
  'kr': {'f1': 0.47792207792207786, 'acc': 0.45454545454545453},
  'pt': {'f1': 0.42857142857142855, 'acc': 0.42857142857142855},
  'twi': {'f1': 0.25, 'acc': 0.25},
  'yo': {'f1': 0.6205128205128205, 'acc': 0.6153846153846154},
  'pcm': {'f1': 0.3333333333333333, 'acc': 0.25},
  'ha': {'f1': 0.4, 'acc': 0.4},
  'am': {'f1': 0.6, 'acc': 0.6},
  'ts': {'f1': 0.3333333333333333, 'acc': 0.3333333333333333},
  'ig': {'f1': 0.3333333333333333, 'acc': 0.5},
  'tg': {'f1': 0.0, 'acc': 0.0}})

In [15]:
data, classes = load_news_dataset()

Loaded 30809 rows from masakhanews.parquet columns Index(['label', 'headline', 'text', 'headline_text', 'url', 'lang', 'split'], dtype='object')


In [16]:
train_df = data.sample(1000, random_state=42)
test_df = data.drop(train_df.index).sample(100, random_state=42)

In [24]:
classifier.classifiy(train_df=train_df, test_df=test_df, k=5, batch_size=32, model=xmodel, tokenizer=tok)

RuntimeError: The expanded size of the tensor (512) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [32, 512].  Tensor sizes: [1, 258]

In [20]:
# select 10 per language
train_df = data[data['split'] == 'train'].groupby('lang').apply(lambda x: x.sample(200, random_state=42)).reset_index(drop=True)

  train_df = data[data['split'] == 'train'].groupby('lang').apply(lambda x: x.sample(200, random_state=42)).reset_index(drop=True)


In [21]:
# langs = ['amh', 'hau', 'ibo', 'lug', 'pcm','yor']
# train_df = train_df[train_df['lang'].isin(langs)].reset_index(drop=True)

In [22]:
train_df['headline'].sample(1).values[0]

'Varimi vePurazi reWadzanayi kuLalapanzi Vonetsana neKambani yekuChina YeAsia Ferry Yakapinda muPurazi Ravo'

In [None]:
# result = build_json_pairs(train_df, model_name="Davlan/afro-xlmr-large",
#                  n_samples=200, m_candidates=100, k_top=9, text_col="text", headline_col="headline")
# # save to json file
# import json
# with open('news_result.json', 'w', encoding='utf-8') as f:
#     json.dump(result, f, indent=4, ensure_ascii=False)  

In [None]:
# result = build_json_pairs(train_df, model_name="Davlan/afro-xlmr-large",
#                  n_samples=200, m_candidates=100, k_top=9, text_col="headline", headline_col="text")
# # save to json file
# import json
# with open('headline_result.json', 'w', encoding='utf-8') as f:
#     json.dump(result, f, indent=4, ensure_ascii=False)  

In [23]:
df = pd.read_json('news_result.json')
d = df.to_dict(orient='records')
top1_accuracy(d, batch_size=32, model=xmodel, tokenizer=tok)
# top1_accuracy(d, batch_size=32, model=distill_emb, tokenizer=tokenizer)

Evaluating:   0%|          | 0/3200 [00:00<?, ?it/s]

RuntimeError: The expanded size of the tensor (512) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [1, 512].  Tensor sizes: [1, 258]

In [None]:
from fasttext_model import FastTextModel
fasttext_model = FastTextModel(file_path='embeddings/afriberta/afriberta.vec')
# fasttext_model.embedding.weight.requires_grad = False  # freeze the weights