In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [74]:
# load data
train = torch.load('../embeddings/legal-bert-base-uncased/emb_tr_cpu.pkl')
test = torch.load('../embeddings/legal-bert-base-uncased/emb_test_cpu.pkl')
dev = torch.load('../embeddings/legal-bert-base-uncased/emb_dev_cpu.pkl')

In [75]:
# load labels
train_labels = pd.read_pickle('../ECHR_Dataset_Tokenized/legal-bert-base-uncased/df_train_tokenized.pkl')['label']
test_labels = pd.read_pickle('../ECHR_Dataset_Tokenized/legal-bert-base-uncased/df_test_tokenized.pkl')['label']
dev_labels = pd.read_pickle('../ECHR_Dataset_Tokenized/legal-bert-base-uncased/df_dev_tokenized.pkl')['label']


In [76]:
# save labels
pd.to_pickle(train_labels, 'train_labels.pkl')
pd.to_pickle(test_labels, 'test_labels.pkl')
pd.to_pickle(dev_labels, 'dev_labels.pkl')

In [77]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# train labels to tensor to device
train = [x.to(device) for x in train]
test = [x.to(device) for x in test]
dev = [x.to(device) for x in dev]
train_labels = torch.tensor(train_labels.values).to(device)
test_labels = torch.tensor(test_labels.values).to(device)
dev_labels = torch.tensor(dev_labels.values).to(device)

In [86]:
# create a torch dataset
class ECHRDataset(Dataset):
    def __init__(self, data, attention_mask, labels):
        self.data = data
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.attention_mask[idx], self.labels[idx]


In [79]:
# pad the data to be of the same shape
def pad_data(data, max_len):
    padded_data = []
    attention_masks = []
    for i in range(len(data)):
        attention_masks.append([1] * data[i].shape[0] + [0] * (max_len - data[i].shape[0]))
        padded_data.append(F.pad(data[i], (0, 0, 0, max_len - data[i].shape[0])))
    print(len(attention_masks))
    return torch.stack(padded_data), torch.tensor(attention_masks)


In [80]:
# pad the data
max_len_train = max([x.shape[0] for x in train])
max_len_test = max([x.shape[0] for x in test])
max_len_dev = max([x.shape[0] for x in dev])
train, train_attention_masks = pad_data(train, max_len_train)
test, test_attention_masks = pad_data(test, max_len_test)
dev, dev_attention_masks = pad_data(dev, max_len_dev)

7100
2998
1380


In [87]:
# create the datasets
train_dataset = ECHRDataset(train, train_attention_masks, train_labels)
test_dataset = ECHRDataset(test, test_attention_masks, test_labels)
dev_dataset = ECHRDataset(dev, dev_attention_masks, dev_labels)

In [88]:
train_dataset[0]

(tensor([[-0.1817,  0.5268, -0.7749,  ...,  0.3693,  0.6814, -0.4771],
         [-0.5845,  0.3973,  0.9896,  ...,  0.2682,  0.1414, -0.6813],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]),
 tensor([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 tensor(0))

In [92]:
# save the datasets
path_to_datasets = '../ds_with_attention_mask/'
import os
if not os.path.exists(path_to_datasets):
    os.makedirs(path_to_datasets)
torch.save(train_dataset, path_to_datasets+'train_dataset.pt')
torch.save(test_dataset, path_to_datasets+'test_dataset.pt')
torch.save(dev_dataset, path_to_datasets+'dev_dataset.pt')

In [114]:
class AttentionMLP(nn.Module):
    def __init__(self, input_dim, hidden_sizes, dropout=0, weight_decay=0.01):
        super(AttentionMLP, self).__init__()
        # vector for query attention
        self.selector = nn.parameter.Parameter(torch.randn(input_dim, 1))
        self.Value= nn.Linear(input_dim, input_dim, bias=False)
        self.Key = nn.Linear(input_dim, input_dim, bias=False)
        # mlp layers
        layers = []
        for i in range(len(hidden_sizes) - 1):
            layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(hidden_sizes[-1], 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, attention_mask=None):
        # attention
        key = self.Key(x)

        value = self.Value(x)

        non_normalized_attention = torch.matmul(key, self.selector)
        if attention_mask is not None:
            attention_mask=attention_mask.unsqueeze(2)

            non_normalized_attention = non_normalized_attention.masked_fill(attention_mask == 0, -1e9)
        attention = F.softmax(non_normalized_attention, dim=1)
        # permute the attention to match the shape of the value
        attention = attention.permute(0, 2, 1)

        x = torch.matmul(attention, value)

        # mlp
        x = self.mlp(x)
        x = self.output(x)
        x = self.sigmoid(x)
        return x.squeeze()


In [115]:
# create the model
model = AttentionMLP(768, [768, 256, 64])
model = model.to(device)

In [116]:
# create the loss function
criterion = nn.BCELoss()


In [117]:
# create the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [118]:
# create the dataloader for the training set
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
next(iter(train_dataloader))

[tensor([[[-0.8513,  0.5029,  0.9999,  ...,  0.3402,  0.1261, -0.1561],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-0.4650,  0.8310, -0.9921,  ...,  0.4154,  0.6513, -0.6376],
          [-0.6296,  0.7994, -0.8645,  ...,  0.3450,  0.6652, -0.6714],
          [-0.7047,  0.6992, -0.7457,  ...,  0.2913,  0.6056, -0.7340],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[-0.5858,  0.7784, -0.3366,  ...,  0.2684,  0.5538, -0.4406],
          [-0.7407,  0.6959,

In [119]:
next(iter(train_dataloader))[1].shape

torch.Size([32, 102])

In [122]:
# train the model
from tqdm import tqdm
train_loss = []
val_loss = []
for epoch in range(10):
    running_loss = 0.0
    for i, data in tqdm(enumerate(train_dataloader, 0)):
        inputs, att_masks, labels = data

        optimizer.zero_grad()
        outputs = model(inputs, att_masks)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch + 1}, loss: {running_loss / len(train_dataloader)}')
    train_loss.append(running_loss / len(train_dataloader))

    # validate the model
    with torch.no_grad():
        outputs = model(dev)
        loss = criterion(outputs, dev_labels.float())
        print(f'Validation loss: {loss.item()}')
        accuracy = ((outputs > 0.5) == dev_labels).sum().item() / len(dev_labels)
        print(f'Validation accuracy: {accuracy}')
        val_loss.append(loss.item())


222it [01:19,  2.81it/s]


Epoch 1, loss: 0.6327354576405104
Validation loss: 0.6937826871871948
Validation accuracy: 0.5007246376811594


222it [01:19,  2.80it/s]


Epoch 2, loss: 0.6025752737983927
Validation loss: 0.6662874221801758
Validation accuracy: 0.5963768115942029


29it [00:11,  2.52it/s]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000002B39356D2D0>>
Traceback (most recent call last):
  File "C:\Users\HUAWEI\PycharmProjects\hlt-project2\venv\lib\site-packages\ipykernel\ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
# print traianing and validation loss
plt.plot(train_loss, label='Training loss')
plt.plot(val_loss, label='Validation loss')
plt.legend()
plt.show()

In [None]:
# test the model
with torch.no_grad():
    outputs = model(test)
    loss = criterion(outputs, test_labels.float())
    print(f'Test loss: {loss.item()}')
    from sklearn.metrics import classification_report
    print(classification_report(test_labels.cpu().numpy(), (outputs > 0.5).cpu().numpy()))