In [1]:
import json

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, processors, Regex
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence, PreTokenizer, Split
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

In [2]:
# DATA_FILES = [
#     "../data/sbd_adjudicatory_dec/data_set/intellectual_property.json",
#     "../data/sbd_adjudicatory_dec/data_set/bva.json",
#     "../data/sbd_adjudicatory_dec/data_set/scotus.json",
# ]

# TEST_DATA_FILE = [
#     "../data/sbd_adjudicatory_dec/data_set/cyber_crime.json",
# ]

DATA_FILES = [
    "../../SBD/data/annotations/20180415_bva.json",
    "../../SBD/data/annotations/20180415_intellectual_property.json",
    "../../SBD/data/annotations/20180415_scotus.json",
]

TEST_DATA_FILE = [
    "../../SBD/data/annotations/20180415_cyber_crime.json"
]

CONTEXT_WINDOW = 6

In [3]:
def read_texts_from_json_files(file_paths):
    texts = []
    files = []
    keys = []
    all_ends = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for key in data:
                if 'text' in data[key]:
                    texts.append(data[key]['text'])
                    files.append(file_path)
                    keys.append(key)

                file_ends = []
                for annotation in data[key]['annotations']:
                    file_ends.append(annotation['end'])
                all_ends.append(sorted(list(set(file_ends))))
    return texts, all_ends, files, keys

train_texts, train_ends, train_files, train_keys = read_texts_from_json_files(DATA_FILES)
val_texts, val_ends, val_files, val_keys = read_texts_from_json_files(TEST_DATA_FILE)

In [4]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(vocab_size=300, special_tokens=["[UNK]", "[PAD]"])

# Define the sequence of pre-tokenizers
tokenizer.pre_tokenizer = Sequence([
    Split(pattern="\n", behavior="isolated"),
    Split(pattern=Regex(r'\w+|[^\w\t ]+'), behavior="removed", invert=True), # Like the Whitespace() pre_tokenizer, but ignores newline "\n" characters, as we want to tokenize these,
    Punctuation()
])

tokenizer.train_from_iterator(train_texts, trainer=trainer)






In [5]:
# encoded = tokenizer.encode(train_texts[0])
# encoded_end_idxes, labels = label_tokens(encoded.offsets, ends[0])

In [6]:
def label_tokens(offsets, ends):
    ends_idx = 0
    labels = []
    encoded_end_idxes = []

    for offsets_idx, offset in enumerate(offsets):
        left, right = offset
        if (ends_idx < len(ends)) and (left <= (ends[ends_idx]-1) < right):
            labels.append(True)
            encoded_end_idxes.append(offsets_idx)
            ends_idx += 1
        else:
            labels.append(False)

    return encoded_end_idxes, labels

class TextDataset(Dataset):
    def __init__(self, texts, ends, tokenizer, padding=6, device='cpu'):
        self.texts = texts
        self.ends = ends
        self.tokenizer = tokenizer
        self.device = device
        self.encoded_texts, self.labels = self._encode(texts, ends, tokenizer, padding)
        self.window_ids, self.windows_labels = self._extract_contexts(
            self.encoded_texts,
            self.labels
        )

    def _encode(self, train_texts, ends, tokenizer, padding=6):
        encoded_texts = []
        labels = []
        pad_id = tokenizer.token_to_id("[PAD]")
        for i, text in enumerate(train_texts):
            encoded = tokenizer.encode(text)
            encoded.pad(len(encoded) + padding, direction="left", pad_id=pad_id)
            encoded.pad(len(encoded) + padding, direction="right", pad_id=pad_id)
            encoded_texts.append(encoded)
            _, labeled_tokens = label_tokens(encoded.offsets, ends[i])
            labels.append(labeled_tokens)

        return encoded_texts, labels

    def _is_end_of_line(self, tokens, i):
        num_tokens = len(tokens)
        if i >= num_tokens:
            raise ValueError()
        elif (i < (num_tokens - 1)) and (tokens[i] != "\n") and (tokens[i+1] == "\n"):
            return True
        return False

    def _extract_contexts(
        self,
        encoded_texts,
        labels,
        target_tokens=['.', '"', ']', ')', ':', '"', "'", '*', '>', ';'],
        # target_tokens=['.'],
        context_size=CONTEXT_WINDOW
    ):
        window_ids = []
        window_labels = []
        for i, encoded_text in enumerate(encoded_texts):
            tokens = encoded_text.tokens
            num_tokens = len(tokens)
            for j in range(num_tokens):
                token = tokens[j]
                if (token in target_tokens) or self._is_end_of_line(tokens, j) or labels[i][j]:
                    start = max(0, j - context_size)
                    end = min(len(encoded_text), j + context_size + 1)
                    window_ids.append(encoded_text.ids[start:end])
                    # window_tokens.append(encoded_text.tokens[start:end])
                    window_labels.append(labels[i][j])
                
        return window_ids, window_labels

    def __len__(self):
        return len(self.window_ids)

    def __getitem__(self, idx):
        window_ids = torch.tensor(self.window_ids[idx], dtype=torch.int32, device=self.device)
        label = torch.tensor(self.windows_labels[idx], dtype=torch.float32, device=self.device)
        return window_ids, label

In [7]:
train_dataset = TextDataset(train_texts, train_ends, tokenizer)
val_dataset = TextDataset(val_texts, val_ends, tokenizer)

In [8]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

In [9]:
encoded_end_idxes, labels = label_tokens(val_dataset.encoded_texts[2].offsets, val_ends[2])

In [40]:
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size=64):
        super(CNNModel, self).__init__()
        kernel_size = 5
        conv_out_channels = 6
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.conv1d = nn.Conv1d(in_channels=embedding_size, out_channels=conv_out_channels, kernel_size=kernel_size)
        self.fc1 = nn.Linear(((CONTEXT_WINDOW*2+1)-kernel_size+1)*conv_out_channels, 128)
        self.fc2 = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        x = self.conv1d(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        output = torch.sigmoid(x)
        return output

In [41]:
# Instantiate the model, define the loss function and the optimizer
model = CNNModel(vocab_size=tokenizer.get_vocab_size())
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [44]:
def predict(model, dataloader):
    all_predictions = []
    all_labels = []
    
    for batch, labels in dataloader:
        
        # Forward pass
        predictions = model(batch)
        
        all_predictions.append(predictions)
        all_labels.append(labels)

    all_predictions = torch.cat(all_predictions)
    all_labels = torch.cat(all_labels)
    return all_predictions, all_labels

def calculate_metrics(preds, targets):
    # Ensure the predictions and targets are torch tensors
    if not isinstance(preds, torch.Tensor):
        preds = torch.tensor(preds)
    if not isinstance(targets, torch.Tensor):
        targets = torch.tensor(targets)

    # Threshold predictions to binary values (assuming binary classification with threshold of 0.5)
    preds = (preds >= 0.5).float()

    # Calculate True Positives, False Positives, True Negatives, and False Negatives
    TP = ((preds == 1) & (targets == 1)).sum().item()
    FP = ((preds == 1) & (targets == 0)).sum().item()
    TN = ((preds == 0) & (targets == 0)).sum().item()
    FN = ((preds == 0) & (targets == 1)).sum().item()

    # Calculate Accuracy
    accuracy = (TP + TN) / (TP + FP + TN + FN)

    # Calculate Precision, Recall, and F1 Score
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Confusion Matrix
    confusion_matrix = torch.tensor([[TN, FP], [FN, TP]])

    return {
        'True Positives': TP,
        'False Positives': FP,
        'True Negatives': TN,
        'False Negatives': FN,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1_score,
        'Confusion Matrix': confusion_matrix
    }

In [45]:
num_epochs = 50

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}")
    model.train()
    for batch, labels in train_dataloader:
        
        # Forward pass
        predictions = model(batch)

        # Calculate loss
        loss = criterion(predictions, labels.view(-1, 1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        predictions, labels = predict(model, val_dataloader)
        metrics = calculate_metrics(predictions.flatten(), labels)
        print(f"val f1: {metrics["F1 Score"]}")

Epoch: 1
val f1: 0.9660609701156331
Epoch: 2
val f1: 0.970375939849624
Epoch: 3
val f1: 0.9705728972658869
Epoch: 4
val f1: 0.969660648737733
Epoch: 5
val f1: 0.9686635255403484
Epoch: 6
val f1: 0.9671305001880406
Epoch: 7
val f1: 0.96749924766777
Epoch: 8
val f1: 0.9652992096349267
Epoch: 9
val f1: 0.9672869147659064
Epoch: 10
val f1: 0.9653817082388512
Epoch: 11
val f1: 0.9670296430732003
Epoch: 12
val f1: 0.9677370722146842
Epoch: 13
val f1: 0.9686206637068252
Epoch: 14


KeyboardInterrupt: 

In [46]:
model.eval()
predictions, labels = predict(model, val_dataloader)
metrics = calculate_metrics(predictions.flatten(), labels)

In [54]:
val_dataset.

tensor([False, False, False,  ..., False, False,  True])

In [55]:
(predictions > 0.5).flatten() != labels.type(torch.bool)

tensor([False, False, False,  ..., False, False, False])

In [27]:
metrics

{'True Positives': 6475,
 'False Positives': 165,
 'True Negatives': 13314,
 'False Negatives': 205,
 'Accuracy': 0.9816459149759412,
 'Precision': 0.9751506024096386,
 'Recall': 0.969311377245509,
 'F1 Score': 0.9722222222222222,
 'Confusion Matrix': tensor([[13314,   165],
         [  205,  6475]])}

In [18]:
metrics['True Positives'] + metrics['False Negatives']

6680

In [17]:
labels.sum()

tensor(6680.)

In [None]:
tokenizer.get_vocab_size()

In [None]:
# ROUGH STUFF BELOW, WANDER WITH CARE






In [13]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
class CustomModel(nn.Module):
    def __init__(self, pretrained_model_name):
        super(CustomModel, self).__init__()
        # Load the pretrained tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
        self.model = AutoModel.from_pretrained(pretrained_model_name)
        self.embedding_layer = self.model.get_input_embeddings()
        
        # Define additional layers or components if needed
        self.linear = nn.Linear(self.embedding_layer.embedding_dim, 10)  # Example linear layer

    def forward(self, text):
        # Tokenize the text
        tokens = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        input_ids = tokens['input_ids']
        
        # Get embeddings
        embeddings = self.embedding_layer(input_ids)

        return tokens, embeddings

In [15]:
pretrained_model_name = 'nlpaueb/legal-bert-base-uncased'

In [16]:
custom_model = CustomModel(pretrained_model_name)

text = "Hello, how are you?"
tokens, embeddings = custom_model(text)

print("Output shape:", output.shape)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

NameError: name 'output' is not defined

In [88]:
embeddings[0].shape

torch.Size([9, 768])

In [89]:
# custom_model.tokenizer.tokenize(train_texts[0])

In [82]:
tokens

{'input_ids': tensor([[  101, 18858,   185,   115,  1459,   244,   799,   124,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [80]:
embeddings

tensor([[[ 1.0771e-03,  1.3710e-02,  1.6612e-03,  ...,  1.1057e-04,
          -8.1636e-03, -2.3660e-03],
         [ 4.4711e-02, -8.5918e-02, -4.7011e-02,  ..., -5.0366e-02,
           1.5850e-02, -1.2906e-01],
         [-6.4795e-02, -4.4210e-02, -2.2164e-02,  ..., -4.0500e-02,
           1.1583e-02, -2.6889e-02],
         ...,
         [ 3.0790e-02, -8.1743e-02, -3.0175e-02,  ...,  3.8957e-02,
          -2.8992e-02, -2.0377e-02],
         [-2.0481e-02, -8.2805e-02, -4.5078e-02,  ...,  3.2213e-02,
           3.3308e-02,  1.5795e-03],
         [-3.8182e-02, -5.7226e-03,  8.3643e-04,  ..., -4.9915e-02,
           1.5197e-02,  1.6943e-02]]], grad_fn=<EmbeddingBackward0>)

In [75]:
print(tokens)

{'input_ids': tensor([[  101, 18858,   185,   115,  1459,   244,   799,   124,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [55]:
# Test the tokenizer
encoded = tokenizer.encode("Hello, how are you?")
print("Tokenized output:", encoded.tokens)  # Output: ['[CLS]', 'Hello', ',', 'how', 'are', 'you', '?', '[SEP]']

Tokenized output: ['H', 'e', 'l', 'l', 'o', ',', 'h', 'ow', 'a', 're', 'y', 'ou', '?']


In [56]:
tokenizer.get_vocab_size()

200

In [57]:
# Test the tokenizer
encoded = tokenizer.encode(train_texts[0])
# print("Tokenized output:", encoded.tokens)  # Output: ['[CLS]', 'Hello', ',', 'how', 'are', 'you', '?', '[SEP]']

Tokenized output: ['A', 'pp', 'le', 'C', 'o', 'm', 'p', 'u', 'ter', ',', 'I', 'n', 'c', '.', 'v', '.', 'F', 'r', 'an', 'k', 'l', 'in', 'C', 'o', 'm', 'p', 'u', 'ter', 'C', 'or', 'p', 'or', 'ation', 'U', '.', 'S', '.', 'C', 'ourt', 'of', 'A', 'pp', 'e', 'al', 's', 'T', 'h', 'ir', 'd', 'C', 'ir', 'c', 'u', 'it', 'A', 'u', 'g', 'u', 'st', '3', '0', ',', '19', '8', '3', '7', '1', '4', 'F', '.', '2', 'd', '1', '2', '4', '0', ',', '2', '19', 'U', 'S', 'P', 'Q', '1', '1', '3', '[', 'E', 'd', 'it', 'or', "'", 's', 'not', 'e', ':', 'T', 'h', 'is', 'c', 'as', 'e', 'is', 'dis', 'c', 'us', 's', 'ed', 'in', 'L', 'e', 'g', 'al', 'P', 'ro', 't', 'ec', 'tion', 'of', 'D', 'i', 'g', 'it', 'al', 'I', 'n', 'for', 'm', 'ation', 'in', ':', 'C', 'h', 'a', 'p', 'ter', '2', ',', 'S', 'ec', 'tion', 'I', 'I', '.', 'B', '.', '2', '.', '(', 'A', 'pp', 'le', 'v', '.', 'F', 'r', 'an', 'k', 'l', 'in', ')', '.', ']', 'S', 'l', 'o', 'v', 'it', 'er', ',', 'C', 'ir', 'c', 'u', 'it', 'J', 'u', 'd', 'g', 'e', '.', 'I', '.'

In [None]:
number_periods = []

for dataset in full_train_data:
    for key in dataset.keys():
        text = dataset[key]['text']
        ch = '.'
        period_idxes = [i for i, ltr in enumerate(text) if ltr == ch]
        dataset[key]['period_idxes'] = period_idxes
        dataset[key]['contexts'] = []
        for period_idx in period_idxes:
            left_idx = max(period_idx-CONTEXT_WINDOW, 0)
            left_context = text[left_idx:period_idx] 
            right_context = text[period_idx+1: period_idx+CONTEXT_WINDOW]
            dataset[key]['contexts'].append({
                'period_idx': period_idx
                'left_context': left_context,
                'right_context': right_context
            })
        number_periods.append(len(period_idxes))

Input shape: torch.Size([32, 1, 100])
After conv1d: torch.Size([32, 6, 96])
After global max pool: torch.Size([32, 6, 96])
After flattening: torch.Size([32, 576])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x576 and 6x250)