## Key Points
- TASK: Binary Classification
- DATASET: ECHR
- MODEL: H-BERT, LEGAL-BERT, ...

In [2]:
import pandas as pd
from src.utils import load_ECHR

# load train, dev and test dataset from json to pandas dataframe
df_train, df_dev, df_test = load_ECHR()

In [3]:
# add a column with 0/1 labels to the dataframe 0 if VIOLATED_ARTICLE is empty, 1 otherwise
df_train['label'] = df_train['VIOLATED_ARTICLES'].apply(lambda x: 0 if x == [] else 1)
df_dev['label'] = df_dev['VIOLATED_ARTICLES'].apply(lambda x: 0 if x == [] else 1)
df_test['label'] = df_test['VIOLATED_ARTICLES'].apply(lambda x: 0 if x == [] else 1)

# print VIOLATED_ARTICLES and labels
print(df_train[['VIOLATED_ARTICLES', 'label']].sample(10))

     VIOLATED_ARTICLES  label
1645                []      0
4100                []      0
676                [6]      1
3280                []      0
1481               [2]      1
6652                []      0
1819                []      0
1032                []      0
249                [5]      1
3014                []      0


In [4]:
# print len of each data set
print("Training set has {} samples.".format(df_train.shape[0]))
print("Validation set has {} samples.".format(df_dev.shape[0]))
print("Test set has {} samples.".format(df_test.shape[0]))

print("--------------------")

# check if train, dev and test are balanced
print("Training set has {} positive samples.".format(df_train[df_train['label'] == 1].shape[0]))
print("Training set has {} negative samples.".format(df_train[df_train['label'] == 0].shape[0]))
print("Validation set has {} positive samples.".format(df_dev[df_dev['label'] == 1].shape[0]))
print("Validation set has {} negative samples.".format(df_dev[df_dev['label'] == 0].shape[0]))
print("Test set has {} positive samples.".format(df_test[df_test['label'] == 1].shape[0]))
print("Test set has {} negative samples.".format(df_test[df_test['label'] == 0].shape[0]))


Training set has 7100 samples.
Validation set has 1380 samples.
Test set has 2998 samples.
--------------------
Training set has 3551 positive samples.
Training set has 3549 negative samples.
Validation set has 690 positive samples.
Validation set has 690 negative samples.
Test set has 1974 positive samples.
Test set has 1024 negative samples.


### Use only train set to try classification

In [5]:
# from df_train take only the columns that are needed for classification, and drop the rest, we need only 'FACTS' and 'labels'
train_data = df_train[['TEXT', 'label']]

print(train_data.head())
type(train_data['TEXT'])

                                                TEXT  label
0  [7. On 28 September 1994 the applicant's husba...      0
1  [8. The applicant was born in 1974 and lives i...      0
2  [5. The first applicant, Mr Ivan Dvořáček, was...      1
3  [4. The applicant was born in 1959 and lives i...      1
4  [6. The applicant was born in 1946., 7. On 14 ...      1


pandas.core.series.Series

In [6]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [7]:
# Set random seed
import numpy as np
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [8]:
# Split data into train and test
from sklearn.model_selection import train_test_split
train, val = train_test_split(train_data, test_size=0.2, random_state=42)

In [9]:
text = train['TEXT'].values
labels = train['label'].values

sentences = [" ".join(t) for t in text]

print(sentences[0])
print(text.shape)
print(sentences.__len__())
type(sentences[0])


The applicant, Mr Dafče Jančev, is a Macedonian national who was born in 1951 and lives in the village Dolni Disan, Negotino. He was represented before the Court by Mr M. Mančev, a lawyer practising in Kavadarci, the former Yugoslav Republic of Macedonia. The facts of the case, as submitted by the applicant, may be summarised as follows. The applicant and Mr Dz.I. (“the plaintiff”) are neighbours whose plots of land are adjacent. On 16 February 2008 the applicant constructed a wall, a meter long and 90 cm high, and put three concrete bricks on a passage that the plaintiff used to access his property. The plaintiff brought a civil action requesting the Negotino Court of First Instance (“the first-instance court”) to establish that the applicant disturbed his possession (смеќавање на владение) and to order reinstatement in previous state. On 10 November 2008 the first-instance court allowed the plaintiff’s claim and ordered the applicant to demolish the wall and remove the bricks. The tr

str

In [10]:
# load legal bert model and classify the legal documents
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# encode train data 
token_ids = []
attention_masks = []
for fact in sentences:
    encoding_dict = tokenizer.encode_plus(
                        fact,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                    )
    token_ids.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

token_ids = torch.cat(token_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', token_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  The applicant, Mr Dafče Jančev, is a Macedonian national who was born in 1951 and lives in the village Dolni Disan, Negotino. He was represented before the Court by Mr M. Mančev, a lawyer practising in Kavadarci, the former Yugoslav Republic of Macedonia. The facts of the case, as submitted by the applicant, may be summarised as follows. The applicant and Mr Dz.I. (“the plaintiff”) are neighbours whose plots of land are adjacent. On 16 February 2008 the applicant constructed a wall, a meter long and 90 cm high, and put three concrete bricks on a passage that the plaintiff used to access his property. The plaintiff brought a civil action requesting the Negotino Court of First Instance (“the first-instance court”) to establish that the applicant disturbed his possession (смеќавање на владение) and to order reinstatement in previous state. On 10 November 2008 the first-instance court allowed the plaintiff’s claim and ordered the applicant to demolish the wall and remove the bri

In [11]:
labels = labels.T
labels.shape

  labels = labels.T


torch.Size([5680])

In [12]:
# save tensor to file
torch.save(token_ids, 'token_ids.pt')
torch.save(attention_masks, 'attention_masks.pt')
torch.save(labels, 'labels.pt')

In [11]:
# decode token_ids to text
tokenizer.decode(token_ids[0])
token_ids.shape

torch.Size([5680, 512])

In [2]:
# load tensor from file pt
import torch
token_ids = torch.load('encoding/token_ids.pt')
attention_masks = torch.load('encoding/attention_masks.pt')
labels = torch.load('encoding/labels.pt')


In [39]:
# progress bar
from tqdm import tqdm
from tqdm.notebook import tqdm

def train(num_epochs, model, data_loader, loss_fn, optimizer, att_mask):

    for epoch in range(num_epochs):

        loop = tqdm(enumerate(data_loader), total=len(data_loader), leave=False)
        for batch in data_loader:
            
            inputs = batch[0].to(mps_device)
            att_mask = batch[1].to(mps_device)
            labels = batch[2].to(mps_device)
            optimizer.zero_grad()

            outputs = model(inputs, attention_mask=att_mask)
            loss = loss_fn(outputs.logits, labels)

            loss.backward()
            optimizer.step()

            # update progress bar
            loop.set_description(f"Epoch [{epoch}/{num_epochs}]")
            loop.set_postfix(loss=loss.item())

In [33]:
# function to evaluate model 
from sklearn.metrics import accuracy_score

def evaluate_model(model, data_loader, device, metric_fn=accuracy_score):

    with torch.no_grad():
        accuracy = []
        
        for batch in data_loader:

            inputs = batch[0].to(mps_device)
            att_mask = batch[1].to(mps_device)
            labels = batch[2].to(mps_device)

            outputs = model(inputs, attention_mask=att_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            accuracy.append( metric_fn(labels.cpu(), predictions.cpu()))

    return accuracy.mean()

In [37]:
from torch.utils.data import DataLoader

# unify token_ids and labels 
train_tensor = torch.utils.data.TensorDataset(token_ids, attention_masks, labels)

train_data_loader = DataLoader(train_tensor, batch_size=16, shuffle=True)


In [35]:
for batch in train_data_loader:
    print(batch[0].shape)
    print(batch[1].shape)
    print(batch[2].shape)
    break

torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16])


In [40]:
#### -------- TRAIN ON COLAB -------- ###
model_bert_class = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model_bert_class.to(mps_device)
crossentropy = torch.nn.CrossEntropyLoss()
adam = torch.optim.Adam(model_bert_class.parameters(), lr=0.001)

# train
train(10,model_bert_class,train_data_loader,crossentropy,adam,attention_masks)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/355 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 16.70 GB, other allocations: 1.25 GB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).