In [50]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Seed shared across the notebook: used as random_state for the data splits, as the
# Trainer seed, and to seed torch / numpy / random inside train_model.
random_seed = 1337

In [16]:
def get_training_data(dataset_path:str, test_split_ratio:float=0.1,verbose=False):
    """Load the labelled dataset and split it into train / validation / test lists.

    The JSON file is read with ``pd.read_json`` and must provide the columns
    ``text``, ``label``, ``displayTextRangeStart`` and ``getDisplayTextRangeEnd``.
    The model input is the slice ``text[displayTextRangeStart:getDisplayTextRangeEnd]``
    and the training label is ``label - 1``.

    Args:
        dataset_path: path (or anything ``pd.read_json`` accepts) of the dataset.
        test_split_ratio: fraction of *all* samples used for the test split; the
            validation split is sized to the same fraction of all samples, so the
            value must lie strictly between 0 and 0.5.
        verbose: when True, print the length of the longest raw and display text.

    Returns:
        ``X_train, y_train, X_val, y_val, X_test, y_test`` as Python lists.

    Raises:
        ValueError: if ``test_split_ratio`` leaves no room for both hold-out splits.
    """
    if not 0 < test_split_ratio < 0.5:
        raise ValueError(
            f"test_split_ratio must be in (0, 0.5) so that test and validation splits fit, got {test_split_ratio!r}"
        )

    data = pd.read_json(dataset_path)
    data["label_train"] = data["label"] - 1
    data["display_text"] = [
        text[start:end]
        for text, start, end in zip(data["text"], data["displayTextRangeStart"], data["getDisplayTextRangeEnd"])
    ]

    if verbose:
        # Measure actual string lengths. np.argmax over a column of strings picks
        # the lexicographically largest string, not the longest one.
        print("max text length", max(map(len, data["text"]), default=0))
        print("max display text length", max(map(len, data["display_text"]), default=0))

    X = data.display_text.to_list()
    y = data.label_train.to_list()

    # First carve out the test split, then take a validation split from the
    # remainder. The second ratio is rescaled so that the validation split is
    # test_split_ratio of the *whole* dataset rather than of the remainder.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split_ratio, random_state=random_seed, shuffle=True)
    val_ratio = test_split_ratio * len(X) / len(X_train)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio, random_state=random_seed, shuffle=True)
    return X_train, y_train, X_val, y_val, X_test, y_test
    

In [80]:
def compute_metrics(pred, labels):
    """Score class predictions against the reference labels.

    ``pred`` holds one row of per-class scores per sample; the predicted class
    is the arg-max along axis 1. Precision, recall and F1 use weighted averaging.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    predicted_classes = np.argmax(pred, axis=1)
    weighted_kwargs = {"y_true": labels, "y_pred": predicted_classes, "average": "weighted"}
    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted_classes),
        "precision": precision_score(**weighted_kwargs),
        "recall": recall_score(**weighted_kwargs),
        "f1": f1_score(**weighted_kwargs),
    }

def train_model(X_train, y_train, X_val, y_val, X_test, y_test, batch_size=100, epochs=3, model=None, model_name = 'distilbert-base-uncased'):
    """Fine-tune a DistilBERT sequence classifier and evaluate it on the test split.

    Args:
        X_train, X_val, X_test: texts handed to the tokenizer.
        y_train, y_val, y_test: labels aligned with the corresponding texts.
        batch_size: per-device batch size for both training and evaluation.
        epochs: number of training epochs.
        model: model to keep training; when None a fresh
            ``DistilBertForSequenceClassification`` with 4 labels is loaded
            from ``model_name``.
        model_name: checkpoint name used for the tokenizer and, if needed, the model.

    Returns:
        ``(metrics, trainer, model)``. ``metrics`` is the dict returned by
        ``trainer.evaluate`` on the test split; because the metric prefix is the
        empty string its keys look like "_loss", "_accuracy", ...

    Side effects:
        Raises the level of all existing loggers to CRITICAL, silences warnings,
        sets ``CUBLAS_WORKSPACE_CONFIG`` and reseeds torch / numpy / random.
    """
    # Shrink the validation set (evaluated after every epoch).
    # NOTE(review): the guard fires when the validation set is *smaller* than
    # len(y_train) / ratio and then cuts it to 1/ratio of its size, which looks
    # inverted relative to a "validation set too long" intent. Behaviour is kept
    # unchanged here; confirm the intended condition before altering it.
    ratio = 4
    if len(y_train) / ratio > len(y_val):
        X_val = X_val[0:math.ceil(len(X_val) / ratio)]
        y_val = y_val[0:math.ceil(len(y_val) / ratio)]

    # BEGIN disable logging
    import logging
    def set_global_logging_level(level=logging.ERROR, prefices=("",)):
        """Set ``level`` on every already-registered logger whose name matches a prefix."""
        import re
        prefix_re = re.compile(fr'^(?:{ "|".join(prefices) })')
        for name in logging.root.manager.loggerDict:
            if re.match(prefix_re, name):
                logging.getLogger(name).setLevel(level)
    set_global_logging_level(logging.CRITICAL) # disable INFO and DEBUG logging everywhere

    import warnings
    warnings.filterwarnings("ignore")
    # END disable logging

    # BEGIN Set determinism !! must be inside function in every loop to work
    from os import environ
    environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    # !! important !! import torch after setting cublas deterministic or it will not work !!
    import torch
    from transformers import TrainingArguments, Trainer, DistilBertTokenizer, DistilBertForSequenceClassification
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    import random
    random.seed(random_seed)
    # END Set determinism

    class Dataset(torch.utils.data.Dataset):
        """Torch dataset wrapping tokenizer encodings and optional labels."""
        def __init__(self, encodings, labels=None):
            self.encodings = encodings
            self.labels = labels
        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            # Explicit None check: a truthiness test raises for multi-element
            # arrays/tensors and would treat an empty label container as "no labels".
            if self.labels is not None: item["labels"] = torch.tensor(self.labels[idx])
            return item
        def __len__(self):
            return len(self.encodings["input_ids"])

    # create tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    # create datasets
    train_dataset = Dataset(tokenizer(X_train, truncation=True, padding=True, max_length=512), y_train)
    val_dataset = Dataset(tokenizer(X_val, truncation=True, padding=True, max_length=512), y_val)
    test_dataset = Dataset(tokenizer(X_test, padding=True, truncation=True, max_length=512), y_test)

    # create model (a passed-in model is trained further instead)
    if model is None:
        model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=4)

    # training settings
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        eval_steps=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        seed=random_seed,
        load_best_model_at_end=False
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda p: compute_metrics(p[0], p[1])
    )
    # disable print log
    from transformers.trainer_callback import PrinterCallback
    trainer.remove_callback(PrinterCallback)

    # Train
    trainer.train()

    # Test
    metrics = trainer.evaluate(test_dataset, metric_key_prefix="")

    return metrics, trainer, trainer.model

In [67]:
# Build the train / validation / test splits once (20% of the data for the test split),
# then train two models on identical inputs. The two metric dicts displayed below agree
# on every score (only the timing fields differ), i.e. the training run is reproducible.
# The split variables defined here are reused as globals by later cells.
X_train, y_train, X_val, y_val, X_test, y_test = get_training_data('data/dataset_1.json', test_split_ratio=0.2)
metrics1, trainer1, model1 = train_model(X_train, y_train, X_val, y_val, X_test, y_test)
metrics2, trainer2, model2 = train_model(X_train, y_train, X_val, y_val, X_test, y_test)
metrics1, metrics2

({'_loss': 0.34015172719955444,
  '_accuracy': 0.871264367816092,
  '_precision': 0.8601053715194783,
  '_recall': 0.871264367816092,
  '_f1': 0.8568178278784206,
  '_runtime': 0.9449,
  '_samples_per_second': 1381.04,
  '_steps_per_second': 14.816,
  'epoch': 3.0},
 {'_loss': 0.34015172719955444,
  '_accuracy': 0.871264367816092,
  '_precision': 0.8601053715194783,
  '_recall': 0.871264367816092,
  '_f1': 0.8568178278784206,
  '_runtime': 0.9616,
  '_samples_per_second': 1357.094,
  '_steps_per_second': 14.559,
  'epoch': 3.0})

In [68]:
# Sizes of the validation and test splits, shown as a (validation, test) pair.
tuple(map(len, (X_val, X_test)))

(1305, 1305)

In [59]:
def sbert_tokenize(sentences, verbose=False, bert_model_name='all-distilroberta-v1'):
    """Encode sentences into Sentence-BERT embeddings.

    Args:
        sentences: sequence of strings to embed.
        verbose: forwarded as ``show_progress_bar`` to ``SentenceTransformer.encode``.
        bert_model_name: name of the sentence-transformers model to load.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``sentences``
        (one embedding per input sentence).
    """
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(bert_model_name)
    # The model's own max_seq_length is intentionally left untouched. The earlier
    # code assigned np.argmax(sentences) to it, which is the *index* of the
    # lexicographically largest sentence, an arbitrary number (possibly 0)
    # unrelated to any token length.
    return model.encode(sentences, show_progress_bar=verbose)

In [142]:
def get_sbert_centroid_args(sentences, num_labels:int, bert_model_name='all-distilroberta-v1', verbose=False):
    """Pick one representative sentence index per K-Means cluster of SBERT embeddings.

    The sentences are embedded with a sentence-transformers model, clustered
    into ``num_labels`` clusters, and for every cluster the member closest
    (Euclidean distance) to the cluster centre is selected.

    Args:
        sentences: sequence of strings to choose from.
        num_labels: number of clusters, i.e. the number of indices requested.
        bert_model_name: name of the sentence-transformers model to load.
        verbose: forwarded as ``show_progress_bar`` to the encoder.

    Returns:
        List of integer indices into ``sentences``, ordered by cluster id.
        Empty input yields ``[]``; when there are fewer sentences than
        ``num_labels`` every index is returned. Clusters that end up without
        members are skipped, so the list may be shorter than ``num_labels``.
    """
    total = len(sentences)
    if total <= 0: return []

    # Fewer sentences than requested picks: nothing to sample, return all indices.
    if total < num_labels: return list(range(total))

    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import KMeans

    model = SentenceTransformer(bert_model_name)
    # The model's own max_seq_length is kept. The earlier code assigned
    # np.argmax(sentences) to it, which is the index of the lexicographically
    # largest sentence, not a sequence length.
    embeddings = np.asarray(model.encode(sentences, show_progress_bar=verbose))

    clustering_model = KMeans(n_clusters=num_labels, random_state=1337)
    clustering_model.fit(embeddings)
    cluster_assignment = np.asarray(clustering_model.labels_)

    centroids = []
    for cluster_id, center in enumerate(clustering_model.cluster_centers_):
        member_ids = np.flatnonzero(cluster_assignment == cluster_id)
        if member_ids.size == 0:
            # A cluster can be left without members (e.g. duplicate points);
            # indexing it used to raise KeyError.
            continue
        # Distance of every member to the centre; argmin keeps the first
        # (lowest-index) member on ties.
        distances = np.linalg.norm(embeddings[member_ids] - center, axis=1)
        centroids.append(int(member_ids[np.argmin(distances)]))
    return centroids

In [119]:
# Five cluster-representative training sentences, listed by ascending index.
centroid_args = sorted(get_sbert_centroid_args(sentences=X_train, num_labels=5))
centroid_args

[400, 1582, 2095, 2950, 3438]

In [143]:
def get_random_sampling_args(embedding_list, num_labels:int, verbose=False):
    """Draw ``num_labels`` distinct random indices into ``embedding_list``.

    The draw uses a private generator seeded with 1337, so the same pool size
    and ``num_labels`` always give the same indices and the process-wide
    ``random`` state is left untouched.

    Args:
        embedding_list: any sized collection; only its length is used.
        num_labels: number of indices to draw.
        verbose: unused, kept for signature parity with the other samplers.

    Returns:
        List of indices. Empty input yields ``[]``; when the pool holds fewer
        than ``num_labels`` items every index is returned in order.
    """
    l = len(embedding_list)
    if l <= 0: return []
    # Fewer items than requested picks: nothing to sample, return all indices.
    if l < num_labels: return list(range(l))
    import random
    # random.Random(1337) yields the same sequence as random.seed(1337) followed by
    # random.sample, but without reseeding the global generator as a side effect.
    return random.Random(1337).sample(range(l), num_labels)

In [144]:
# Five randomly drawn training-sample indices, listed in ascending order.
random_args = sorted(get_random_sampling_args(embedding_list=X_train, num_labels=5))
random_args

[1498, 2184, 2530, 2907, 3790]

In [145]:
def apply_active_learning(algorithm, source, source_y, batch_size=200, epochs=3, continuous_mode=False):
    """Simulate pool-based active learning and record the test metrics per round.

    Every round ``algorithm(source, batch_size)`` returns indices into the
    remaining pool; those items are moved from the pool into the labelled set
    and ``train_model`` is called.

    * ``continuous_mode=False``: the labelled set accumulates and a fresh model
      is trained on all of it each round.
    * ``continuous_mode=True``: only the newly picked items are used and the
      model returned by the previous round is trained further.

    The validation and test data are read from the notebook-level variables
    ``X_val``, ``y_val``, ``X_test`` and ``y_test``.

    Args:
        algorithm: callable ``(pool, n) -> list of indices into pool``.
        source: pool of samples (copied, the caller's sequence is not modified).
        source_y: labels aligned with ``source`` (copied as well).
        batch_size: number of samples requested from ``algorithm`` per round.
        epochs: training epochs per round.
        continuous_mode: see above.

    Returns:
        List with one metrics dict per round; each dict additionally carries
        ``"trained_samples"``, the cumulative number of samples drawn so far.
    """
    res = []
    source = list(source)
    source_y = list(source_y)
    trained_total = 0
    samples = []
    samples_y = []
    model = None
    while source:
        if continuous_mode:
            samples = []
            samples_y = []

        # Deduplicate and pop from the highest index down so that earlier pops
        # do not shift the positions of the indices still to be removed.
        pick_args = sorted(set(algorithm(source, batch_size)), reverse=True)
        if not pick_args:
            # Nothing selected although the pool is not empty: stop instead of
            # looping forever.
            break

        # transfer the picked items from the pool to the labelled set
        for d in pick_args:
            samples.append(source.pop(d))
            samples_y.append(source_y.pop(d))
        trained_total += len(pick_args)

        metric, _, model = train_model(samples, samples_y, X_val, y_val, X_test, y_test, epochs=epochs, model=model if continuous_mode else None)

        metric["trained_samples"] = trained_total
        res.append(metric)
    return res

In [147]:
# Random-sampling baseline, run once per mode: first retraining from scratch on the
# growing labelled set, then continuing to train the same model on each new batch.
res_rand, res_rand_continuous = (
    apply_active_learning(get_random_sampling_args, X_train, y_train, continuous_mode=mode)
    for mode in (False, True)
)
res_rand, res_rand_continuous

([{'_loss': 0.9344017505645752,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.6169169565919467,
   '_recall': 0.7854406130268199,
   '_f1': 0.6910528998733824,
   '_runtime': 0.938,
   '_samples_per_second': 1391.186,
   '_steps_per_second': 14.925,
   'epoch': 3.0,
   'trained_samples': 200},
  {'_loss': 0.7494674921035767,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.6169169565919467,
   '_recall': 0.7854406130268199,
   '_f1': 0.6910528998733824,
   '_runtime': 0.943,
   '_samples_per_second': 1383.869,
   '_steps_per_second': 14.846,
   'epoch': 3.0,
   'trained_samples': 400},
  {'_loss': 0.6539126038551331,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.6169169565919467,
   '_recall': 0.7854406130268199,
   '_f1': 0.6910528998733824,
   '_runtime': 0.9546,
   '_samples_per_second': 1367.081,
   '_steps_per_second': 14.666,
   'epoch': 3.0,
   'trained_samples': 600},
  {'_loss': 0.6080392003059387,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.

In [148]:
# SBERT-centroid sampling, run once per mode: first retraining from scratch on the
# growing labelled set, then continuing to train the same model on each new batch.
res_sbert, res_sbert_continuous = (
    apply_active_learning(get_sbert_centroid_args, X_train, y_train, continuous_mode=mode)
    for mode in (False, True)
)
res_sbert, res_sbert_continuous

([{'_loss': 0.9355064630508423,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.6169169565919467,
   '_recall': 0.7854406130268199,
   '_f1': 0.6910528998733824,
   '_runtime': 0.9512,
   '_samples_per_second': 1371.886,
   '_steps_per_second': 14.718,
   'epoch': 3.0,
   'trained_samples': 200},
  {'_loss': 0.747168242931366,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.6169169565919467,
   '_recall': 0.7854406130268199,
   '_f1': 0.6910528998733824,
   '_runtime': 0.9349,
   '_samples_per_second': 1395.895,
   '_steps_per_second': 14.975,
   'epoch': 3.0,
   'trained_samples': 400},
  {'_loss': 0.6651548147201538,
   '_accuracy': 0.7854406130268199,
   '_precision': 0.6169169565919467,
   '_recall': 0.7854406130268199,
   '_f1': 0.6910528998733824,
   '_runtime': 0.9504,
   '_samples_per_second': 1373.172,
   '_steps_per_second': 14.731,
   'epoch': 3.0,
   'trained_samples': 600},
  {'_loss': 0.6011356115341187,
   '_accuracy': 0.7854406130268199,
   '_precision': 0