In [1]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np
import random

from model_new import BertForTokenClassification
import utils.NERutils as nu
import utils.query_funcs as q

from transformers import AutoConfig, AutoTokenizer

from torch.utils.data import DataLoader, SubsetRandomSampler


  from .autonotebook import tqdm as notebook_tqdm


### Link for inspiration

https://www.scaleway.com/en/blog/active-learning-pytorch/

In [2]:
# Define tokenizer
bert_model_name = "bert-base-multilingual-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

#### Load datasets

In [3]:
train_path = "data/train.parquet"
dev_path = "data/dev.parquet"
test_path = "data/test.parquet"

In [4]:
filter = 'Legal'

In [5]:
train_dataset = nu.NERdataset(dataset_path=train_path, tokenizer=bert_tokenizer, filter=filter)
#dev_dataset = nu.NERdataset(dataset_path=dev_path, tokenizer=bert_tokenizer)
test_dataset = nu.NERdataset(dataset_path=test_path, tokenizer=bert_tokenizer, filter=filter)

In [6]:
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

#### Get pretrained model

In [7]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
# Config
bert_model_name = "bert-base-multilingual-cased"
bert_config = AutoConfig.from_pretrained(
    bert_model_name, 
    num_labels=len(train_dataset.tags), 
    id2label=train_dataset.index2tag, 
    label2id=train_dataset.tag2index
)

model = BertForTokenClassification.from_pretrained(bert_model_name, config=bert_config, tags=train_dataset.tags).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Load model
model.load_state_dict(torch.load("Trained_models/test_model", map_location=device))

<All keys matched successfully>

### Active learning

In [10]:
def random_query(data_loader, query_size=10):

    sample_idx = []

    for batch in data_loader:

        _, _, idx = batch
        sample_idx.extend(idx.tolist())

        if len(sample_idx) >= query_size:
            break
        
    return sample_idx[0:query_size]

In [11]:
def query_the_oracle(model, device, dataset, query_size=10, query_strategy='random', 
                     interactive=True, pool_size=0, batch_size=16, num_workers=0):
    
    unlabeled_idx = np.nonzero(dataset.unlabeled_mask)[0]

    # Pool based sampeling
    if pool_size > 0:
        pool_idx = random.sample(range(1, len(unlabeled_idx)), pool_size)
        pool_loader = DataLoader(dataset, batcg_size=batch_size, num_workers=num_workers,
                                 sampler=SubsetRandomSampler(unlabeled_idx[pool_idx]))
    else:
        pool_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers,
                                 sampler=SubsetRandomSampler(unlabeled_idx))
    
    # Strategies
    if query_strategy == 'margin':
        #sample_idx = margin_query(model, device, pool_loader, query_size)
        print("Method not implemented yet")
        return
    else:
        sample_idx = q.random_query(pool_loader, query_size)
    
    # Move observation to the pool of labeled samples
    for sample in sample_idx:
        dataset.unlabeled_mask[sample] = 0

#### Active learning loop 

In [12]:
num_queries = 10
batch_size = 16
query_size = 5
query_strategy='random'
pool_size=10

num_epochs = 1
learning_rate = 1e-05
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

for query in range(num_queries):

    query_the_oracle(model, device, train_dataset, query_size, query_strategy, pool_size)

    labeled_idx = np.where(train_dataset.unlabeled_mask == 0)[0]
    labeled_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0, sampler=SubsetRandomSampler(labeled_idx))

    # train model
    previous_test_acc = 0
    current_test_acc = 1
    while current_test_acc > previous_test_acc:
        previous_test_acc = current_test_acc
        print("Fitting")
        model.fit(num_epochs, labeled_loader, device, optimizer)
        print("Testing")
        model.test(test_loader, device)
        
        train_loss = model.training_loss[-1]
        val_acc = model.training_acc[-1]
        print(train_loss)
        print(val_acc)

    # test model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fitting


  0%|          | 0/1 [00:00<?, ?it/s]

0


100%|██████████| 1/1 [00:01<00:00,  1.87s/it]


1 of 1 epochs
Testing
0.3794887363910675
0.042011834319526625
Fitting


  0%|          | 0/1 [00:00<?, ?it/s]

0


100%|██████████| 1/1 [00:14<00:00, 14.51s/it]


1 of 1 epochs
Testing
0.16287675499916077
0.07633136094674556
Fitting


  0%|          | 0/1 [00:00<?, ?it/s]

0


100%|██████████| 1/1 [00:28<00:00, 28.40s/it]


1 of 1 epochs
Testing
0.17674221098423004
0.13136094674556212
Fitting


  0%|          | 0/2 [00:00<?, ?it/s]

0


  0%|          | 0/2 [00:11<?, ?it/s]


RuntimeError: MPS backend out of memory (MPS allocated: 16.86 GB, other allocations: 1.24 GB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

[0.2553871273994446]