In [1]:
import torch
import torch.nn as nn

import pandas as pd
from datasets import Dataset

from model import BertForTokenClassification
import utils.NERutils as nu

from transformers import AutoConfig, AutoTokenizer


In [2]:
# Define tokenizer
# The checkpoint name chosen here is used again further down when the model
# config and weights are loaded, so tokenizer and model come from the same checkpoint.
bert_model_name = "bert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

#### Load datasets

In [3]:
# Relative locations of the train / dev / test splits (parquet files).
train_path = "data/train.parquet"
dev_path = "data/dev.parquet"
test_path = "data/test.parquet"

In [4]:
# Only the training split is loaded; the dev/test loaders below are disabled.
# NOTE(review): a later cell reads `test_dataset`, and no other active line in
# the visible notebook defines it -- that cell fails on a fresh kernel while the
# last line here stays commented out. Confirm how `test_dataset` is meant to be built.
train_dataset = nu.NERdataset(dataset_path=train_path, tokenizer=bert_tokenizer)
#dev_dataset = nu.NERdataset(dataset_path=dev_path, tokenizer=bert_tokenizer)
#test_dataset = nu.NERdataset(dataset_path=test_path, tokenizer=bert_tokenizer)

#### Get pretrained model

In [5]:
# Everything runs on the CPU: the model and the encoded inputs are moved to this device.
device = torch.device('cpu')

In [6]:
# Config
# `bert_model_name` is reused from the tokenizer cell instead of being assigned
# a second time, so the tokenizer and the model cannot silently end up on
# different checkpoints when one of the two literals is edited.
bert_config = AutoConfig.from_pretrained(
    bert_model_name,
    num_labels=len(train_dataset.tags),
    id2label=train_dataset.index2tag,
    label2id=train_dataset.tag2index,
)

# The classification head is not part of the pretrained checkpoint and is
# freshly initialised (see the warning printed below this cell).
model = BertForTokenClassification.from_pretrained(
    bert_model_name,
    config=bert_config,
    tags=train_dataset.tags,
).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Load model
# Restore saved weights from the local "test_model" file, mapping tensors onto `device`.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load("test_model", map_location=device))

<All keys matched successfully>

In [47]:
def tag_text(text, tags, model, tokenizer):
    """Predict one tag per token of ``text`` and return them side by side.

    Args:
        text: Raw input string to tag.
        tags: Lookup from predicted class index to tag name (indexed as ``tags[i]``).
        model: ``torch.nn.Module`` for which ``model(input_ids)[0]`` is the
            logits tensor, indexed as (batch, token, class).
        tokenizer: Callable tokenizer whose result exposes ``tokens()`` and
            ``input_ids`` (presumably a fast Hugging Face tokenizer -- confirm).

    Returns:
        pandas.DataFrame with two rows, "Tokens" and "Tags", and one column per
        token produced by the tokenizer (special tokens included).

    Note:
        The model's train/eval mode is left untouched; call ``model.eval()``
        beforehand if dropout must be disabled.
    """
    # Encode once with the tokenizer that was passed in (not a notebook global),
    # so the token strings and the IDs fed to the model cannot disagree.
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()

    # Place the inputs wherever the model's weights live instead of relying on
    # a global `device`; a parameter-less model falls back to the CPU.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")
    input_ids = encoding.input_ids.to(target)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_ids)[0]

    # Most likely class per token, for the single sequence in the batch.
    predictions = torch.argmax(logits, dim=2)
    preds = [tags[p] for p in predictions[0].tolist()]

    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [48]:
# Select the test rows whose `dagw_domain` equals 'Legal'.
# NOTE(review): `test_dataset` is not defined by any active line in the visible
# notebook (its loader is commented out), so this cell raises NameError on a
# fresh kernel -- confirm where it is supposed to come from.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the session.
filter = test_dataset['dagw_domain']=='Legal'

# Encode the sequence into IDs
# One tensor of token IDs per selected text, each already moved to `device`.
input_ids = [bert_tokenizer(text, return_tensors="pt").input_ids.to(device) for text in test_dataset[filter].text]

In [49]:
# Switch to inference mode before predicting.
model.eval()
# Call the module itself rather than `.forward` so registered hooks run, and
# skip autograd bookkeeping because no backward pass follows.
with torch.no_grad():
    output = model(input_ids[0])

### Active learning

In [11]:
import numpy as np
import random

from torch.utils.data import DataLoader, SubsetRandomSampler

In [5]:
# Inspect the per-sample mask driving the active-learning loop: non-zero entries
# are treated as still unlabeled, and `query_the_oracle` sets queried entries to 0.
train_dataset.unlabeled_mask

Unnamed: 0,unlabeled
0,1
1,1
2,1
3,1
4,1
...,...
11757,1
11758,1
11759,1
11760,1


In [None]:
def query_the_oracle(model, device, dataset, quer_sSize=10, query_strategy='random',
                     interactive=True, pool_size=0, batch_size=128, num_workers=4):
    """Pick samples from the unlabeled pool and mark them as labeled.

    Args:
        model: Model intended for model-based strategies (unused by 'random').
        device: Device intended for model-based strategies (unused by 'random').
        dataset: Dataset exposing ``unlabeled_mask``; non-zero entries mark
            samples that are still unlabeled. Its batches must unpack into three
            items whose last item holds the sample indices (see ``random_query``).
        quer_sSize: Number of samples to move to the labeled set in this call.
            The misspelled name is kept so existing keyword callers keep working.
        query_strategy: 'margin' is not wired up yet (prints a message and
            returns); any other value selects random querying.
        interactive: Currently unused.
        pool_size: If > 0, restrict the candidates to a random subset of at
            most this many unlabeled samples; otherwise use all of them.
        batch_size: Batch size of the candidate loader.
        num_workers: Worker processes of the candidate loader.

    Returns:
        None. ``dataset.unlabeled_mask`` is updated in place.
    """
    # The body previously read a global `query_size`; bind it to the argument
    # so the function no longer depends on notebook state.
    query_size = quer_sSize

    unlabeled_idx = np.nonzero(dataset.unlabeled_mask)[0]

    if pool_size > 0:
        # Draw positions from the whole pool (position 0 included) and never
        # request more candidates than exist, which random.sample would reject.
        n_candidates = min(pool_size, len(unlabeled_idx))
        pool_positions = random.sample(range(len(unlabeled_idx)), n_candidates)
        candidates = unlabeled_idx[pool_positions]
    else:
        candidates = unlabeled_idx

    pool_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers,
                             sampler=SubsetRandomSampler(candidates))

    if query_strategy == 'margin':
        # TODO: dispatch to margin_query(model, device, pool_loader, query_size)
        # once that strategy is finished.
        print("Method not implemented yet")
        return

    sample_idx = random_query(pool_loader, query_size)

    # Move observation to the pool of labeled samples
    for sample in sample_idx:
        dataset.unlabeled_mask[sample] = 0

In [None]:
def random_query(data_loader, query_size=10):
    """Collect the first ``query_size`` sample indices yielded by ``data_loader``.

    The function does no shuffling itself; the selection is only as random as
    the order in which the loader yields its batches.

    Args:
        data_loader: Iterable of three-item batches; the third item must offer
            ``tolist()`` and holds the indices of the samples in that batch.
        query_size: Maximum number of indices to return.

    Returns:
        list with at most ``query_size`` sample indices, in the order seen.
    """
    collected = []

    for _first, _second, batch_indices in data_loader:
        collected += batch_indices.tolist()

        # Stop pulling batches as soon as enough indices have been gathered.
        if len(collected) >= query_size:
            break

    return collected[:query_size]

In [None]:
def margin_query(model, device, data_loader, query_size=10):
    """Return the indices of the samples with the smallest prediction margin.

    The margin of a prediction is the gap between its two highest class
    probabilities; a small gap means the model barely prefers one class.

    Args:
        model: ``torch.nn.Module`` returning either a logits tensor or a
            tuple / output object whose first element is the logits. Class
            scores must lie on the last axis. It is switched to eval mode.
        device: Device the input batches are moved to before the forward pass.
        data_loader: Iterable of ``(data, _, idx)`` batches, where ``idx`` holds
            the sample indices of the batch.
        query_size: Maximum number of indices to return.

    Returns:
        numpy array with at most ``query_size`` sample indices, ordered from
        the smallest margin upwards.
    """
    margins = []
    indices = []

    model.eval()

    with torch.no_grad():
        for batch in data_loader:

            data, _, idx = batch
            output = model(data.to(device))
            # Tuple-style outputs carry the logits first, as `tag_text` relies on.
            logits = output if isinstance(output, torch.Tensor) else output[0]
            probabilities = torch.softmax(logits, dim=-1)

            toptwo = torch.topk(probabilities, 2, dim=-1)[0]

            # topk returns values in descending order, so the gap is already >= 0.
            differences = toptwo[..., 0] - toptwo[..., 1]

            if differences.dim() > 1:
                # Per-token margins: score each sample by its least certain token.
                # NOTE(review): padding positions are not masked out -- confirm
                # whether the loader pads sequences before relying on this.
                differences = differences.reshape(differences.shape[0], -1).min(dim=1)[0]

            margins.extend(differences.cpu().tolist())
            indices.extend(idx.tolist())

    margin = np.asarray(margins)
    index = np.asarray(indices)
    sorted_pool = np.argsort(margin)

    return index[sorted_pool][0:query_size]

In [13]:
# Active-learning schedule: number of query rounds, batch size used for
# training on the labeled set, samples labeled per round, selection strategy,
# and size of the candidate pool drawn from the unlabeled samples.
num_queries = 10
batch_size = 16
query_size = 5
query_strategy='random'
pool_size=10

for query in range(num_queries):

    # pool_size must be passed by keyword: the sixth positional parameter of
    # query_the_oracle is `interactive`, so a positional value never reached
    # `pool_size` and the pool restriction was silently ignored.
    query_the_oracle(model, device, train_dataset, query_size, query_strategy, pool_size=pool_size)

    # Samples whose mask entry is 0 have been labeled so far.
    labeled_idx = np.where(train_dataset.unlabeled_mask == 0)[0]
    labeled_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0, sampler=SubsetRandomSampler(labeled_idx))

    # train model

# test model

1.0