In [3]:
import json

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, processors, Regex
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence, PreTokenizer, Split
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

In [4]:
# Loading Data

DATA_FILES = [
    "../data/sbd_adjudicatory_dec/data_set/intellectual_property.json",
    "../data/sbd_adjudicatory_dec/data_set/bva.json",
    "../data/sbd_adjudicatory_dec/data_set/scotus.json",
]

VAL_DATA_FILE = [
    "../data/sbd_adjudicatory_dec/data_set/cyber_crime.json",
]

CONTEXT_WINDOW = 6

In [5]:
sorted(set((3,2,1)))

[1, 2, 3]

In [6]:
def read_texts_from_json_files(file_paths):
    texts = []
    files = []
    keys = []
    all_ends = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for key in data:
                if 'text' in data[key]:
                    texts.append(data[key]['text'])
                    files.append(file_path)
                    keys.append(key)

                file_ends = []
                for annotation in data[key]['annotations']:
                    file_ends.append(annotation['end'])
                all_ends.append(sorted(list(set(file_ends))))
    return texts, all_ends, files, keys

def label_tokens(offsets, ends):
    ends_idx = 0
    labels = []
    encoded_end_idxes = []

    for offsets_idx, offset in enumerate(offsets):
        left, right = offset
        if (ends_idx < len(ends)) and (left <= (ends[ends_idx]-1) < right):
            labels.append(True)
            encoded_end_idxes.append(offsets_idx)
            ends_idx += 1
        else:
            labels.append(False)

    return encoded_end_idxes, labels

class TextDataset(Dataset):
    def __init__(self, texts, ends, tokenizer, padding=CONTEXT_WINDOW, device='cpu'):
        self.texts = texts
        self.ends = ends
        self.tokenizer = tokenizer
        self.device = device
        self.encoded_texts, self.labels = self._encode(texts, ends, tokenizer, padding)
        self.window_ids, self.windows_labels = self._extract_contexts(
            self.encoded_texts,
            self.labels
        )

    def _encode(self, train_texts, ends, tokenizer, padding=CONTEXT_WINDOW):
        encoded_texts = []
        labels = []
        pad_id = tokenizer.token_to_id("[PAD]")
        for i, text in enumerate(train_texts):
            encoded = tokenizer.encode(text)
            encoded.pad(len(encoded) + padding, direction="left", pad_id=pad_id)
            encoded.pad(len(encoded) + padding, direction="right", pad_id=pad_id)
            encoded_texts.append(encoded)
            _, labeled_tokens = label_tokens(encoded.offsets, ends[i])
            labels.append(labeled_tokens)

        return encoded_texts, labels

    def _is_end_of_line(self, tokens, i):
        num_tokens = len(tokens)
        if i >= num_tokens:
            raise ValueError()
        elif (i < (num_tokens - 1)) and (tokens[i] != "\n") and (tokens[i+1] == "\n"):
            return True
        return False

    def _extract_contexts(
        self,
        encoded_texts,
        labels,
        target_tokens=['.', '"', ']', ')', ':', '"', "'", '*', '>', ';'],
        # target_tokens=['.'],
        context_size=CONTEXT_WINDOW
    ):
        window_ids = []
        window_labels = []
        for i, encoded_text in enumerate(encoded_texts):
            tokens = encoded_text.tokens
            num_tokens = len(tokens)
            for j in range(num_tokens):
                token = tokens[j]
                if (token in target_tokens) or self._is_end_of_line(tokens, j) or labels[i][j]:
                    start = max(0, j - context_size)
                    end = min(len(encoded_text), j + context_size + 1)
                    window_ids.append(encoded_text.ids[start:end])
                    # window_tokens.append(encoded_text.tokens[start:end])
                    window_labels.append(labels[i][j])
                
        return window_ids, window_labels

    def __len__(self):
        return len(self.window_ids)

    def __getitem__(self, idx):
        window_ids = torch.tensor(self.window_ids[idx], dtype=torch.int32, device=self.device)
        label = torch.tensor(self.windows_labels[idx], dtype=torch.float32, device=self.device)
        return window_ids, label

In [7]:
# Load Data

train_texts, train_ends, train_files, train_keys = read_texts_from_json_files(DATA_FILES)
val_texts, val_ends, val_files, val_keys = read_texts_from_json_files(VAL_DATA_FILE)

In [8]:
# Prepare Tokenizer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(vocab_size=300, special_tokens=["[UNK]", "[PAD]"])

# Define the sequence of pre-tokenizers
tokenizer.pre_tokenizer = Sequence([
    Split(pattern="\n", behavior="isolated"),
    Split(pattern=Regex(r'\w+|[^\w\t ]+'), behavior="removed", invert=True), # Like the Whitespace() pre_tokenizer, but ignores newline "\n" characters, as we want to tokenize these,
    Punctuation()
])

tokenizer.train_from_iterator(train_texts, trainer=trainer)






In [9]:
# Prepare torch dataloaders

train_dataset = TextDataset(train_texts, train_ends, tokenizer)
val_dataset = TextDataset(val_texts, val_ends, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

In [10]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size=64):
        super(CNNModel, self).__init__()
        kernel_size = 5
        conv_out_channels = 6
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.conv1d = nn.Conv1d(in_channels=embedding_size, out_channels=conv_out_channels, kernel_size=kernel_size)
        self.fc1 = nn.Linear(((CONTEXT_WINDOW*2+1)-kernel_size+1)*conv_out_channels, 128)
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        x = self.conv1d(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        output = torch.sigmoid(x)
        return output

def predict(model, dataloader):
    all_predictions = []
    all_labels = []
    
    for batch, labels in dataloader:
        
        # Forward pass
        predictions = model(batch)
        
        all_predictions.append(predictions)
        all_labels.append(labels)

    all_predictions = torch.cat(all_predictions)
    all_labels = torch.cat(all_labels)
    return all_predictions, all_labels

def calculate_metrics(preds, targets):
    # Ensure the predictions and targets are torch tensors
    if not isinstance(preds, torch.Tensor):
        preds = torch.tensor(preds)
    if not isinstance(targets, torch.Tensor):
        targets = torch.tensor(targets)

    # Threshold predictions to binary values (assuming binary classification with threshold of 0.5)
    preds = (preds >= 0.5).float()

    # Calculate True Positives, False Positives, True Negatives, and False Negatives
    TP = ((preds == 1) & (targets == 1)).sum().item()
    FP = ((preds == 1) & (targets == 0)).sum().item()
    TN = ((preds == 0) & (targets == 0)).sum().item()
    FN = ((preds == 0) & (targets == 1)).sum().item()

    # Calculate Accuracy
    accuracy = (TP + TN) / (TP + FP + TN + FN)

    # Calculate Precision, Recall, and F1 Score
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Confusion Matrix
    confusion_matrix = torch.tensor([[TN, FP], [FN, TP]])

    return {
        'True Positives': TP,
        'False Positives': FP,
        'True Negatives': TN,
        'False Negatives': FN,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score,
        'Confusion Matrix': confusion_matrix
    }

In [11]:
# Instantiate the model, define the loss function and the optimizer
model = CNNModel(vocab_size=tokenizer.get_vocab_size())
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [12]:
num_epochs = 5

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}")
    model.train()
    for batch, labels in train_dataloader:
        
        # Forward pass
        predictions = model(batch)

        # Calculate loss
        loss = criterion(predictions, labels.view(-1, 1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        train_predictions, train_labels = predict(model, train_dataloader)
        val_predictions, val_labels = predict(model, val_dataloader)
        val_metrics = calculate_metrics(val_predictions.flatten(), val_labels)
        train_metrics = calculate_metrics(train_predictions.flatten(), train_labels)
        print(f"val f1: {val_metrics["F1 Score"]}")
        print(f"train f1: {train_metrics["F1 Score"]}")

Epoch: 1


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


val f1: 0.9277018633540374
train f1: 0.9652982535722386
Epoch: 2
val f1: 0.9386763056213386
train f1: 0.9783544845287323
Epoch: 3
val f1: 0.9407232509009568
train f1: 0.9834442737668544
Epoch: 4
val f1: 0.9415863141524106
train f1: 0.9861559101418662
Epoch: 5
val f1: 0.9449853943524831
train f1: 0.987829441969985


In [13]:
# Simple Investigation of Ensemble Efficacy

In [14]:
def prepare_ensemble_model(criterion, num_epochs=5):
    # Instantiate the model, define the loss function and the optimizer
    model = CNNModel(vocab_size=tokenizer.get_vocab_size())
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(num_epochs):
        print(f"Epoch: {epoch+1}")
        model.train()
        for batch, labels in train_dataloader:
            
            # Forward pass
            predictions = model(batch)
    
            # Calculate loss
            loss = criterion(predictions, labels.view(-1, 1))
    
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
    model.eval()
    with torch.no_grad():
        train_predictions, train_labels = predict(model, train_dataloader)
        val_predictions, val_labels = predict(model, val_dataloader)
        val_metrics = calculate_metrics(val_predictions.flatten(), val_labels)
        print(f"val f1: {val_metrics["F1 Score"]}")

    return model

In [15]:
ensemble_models = []
for i in range(5):
    print(f"Preparing model {i}")
    ensemble_models.append(
        prepare_ensemble_model(criterion)
    )

Preparing model 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
val f1: 0.9423932651834035
Preparing model 1
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
val f1: 0.9380816478471276
Preparing model 2
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
val f1: 0.9489248968024152
Preparing model 3
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
val f1: 0.9406081625069672
Preparing model 4
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
val f1: 0.9418877236273905


In [16]:
def ensemble_predict(ensemble, dataloader):
    ensembled_predictions = []
    for model in ensemble:
        pred, _ = predict(model, dataloader)
        ensembled_predictions.append(pred)
    return torch.stack(ensembled_predictions)

ensembled_predictions = ensemble_predict(ensemble_models, val_dataloader)

In [19]:
ensembled_predictions.mean(axis=0).flatten().shape

torch.Size([27160])

In [18]:
metrics = calculate_metrics(ensembled_predictions.mean(axis=0).flatten(), labels)
metrics

RuntimeError: The size of tensor a (27160) must match the size of tensor b (29) at non-singleton dimension 0

In [None]:
# Investigation of ensemble uncertainty metrics

In [None]:
correct_prediction = ((ensembled_predictions.mean(axis=0).flatten() > 0.5) == labels.type(torch.bool))

In [None]:
ensembled_predictions.var(axis=0).flatten()[correct_prediction].mean()

In [None]:
ensembled_predictions.var(axis=0).flatten()[correct_prediction].var()

In [None]:
ensembled_predictions.var(axis=0).flatten()[~correct_prediction].mean()

In [None]:
ensembled_predictions.var(axis=0).flatten()[~correct_prediction].var()

In [None]:
uncertainty_mask = ensembled_predictions.var(axis=0).flatten() > 0.04

In [None]:
uncertainty_mask.sum()

In [None]:
metrics = calculate_metrics(ensembled_predictions.mean(axis=0).flatten()[~uncertainty_mask], labels[~uncertainty_mask])
metrics