# Predicting Judicial Decisions of the European Court of Human Rights

In [1]:
from fastai import *
from fastai.text import *
from fastai.utils.mem import gpu_mem_get_free_no_cache
from sklearn.model_selection import train_test_split

In [2]:
import numpy as np
import re
import os
import copy
import torch

In [3]:
def read_dataset(PATH):
    """Walk ``PATH`` recursively and collect its case files into per-article datasets.

    Every file whose full path does not contain the substring ``"both"`` is
    read as UTF-8 and passed to ``add_file_to_dataset`` together with its path.

    Parameters
    ----------
    PATH : str
        Root directory to walk.

    Returns
    -------
    tuple
        ``(X_dataset, Y_dataset)``: the two dictionaries as filled in by
        ``add_file_to_dataset`` (texts and labels, keyed by article).
    """
    texts_by_article, labels_by_article = {}, {}
    for root, _subdirs, filenames in os.walk(PATH):
        for name in filenames:
            fullpath = os.path.join(root, name)
            # Paths containing "both" are excluded from the dataset.
            if "both" in fullpath:
                continue
            with open(fullpath, 'r', encoding="utf8") as handle:
                raw_text = handle.read()
            texts_by_article, labels_by_article = add_file_to_dataset(
                fullpath, texts_by_article, labels_by_article, raw_text)

    return texts_by_article, labels_by_article

In [4]:
def add_file_to_dataset(fullpath, x_dataset, y_dataset, file):
    """Add one case file to the per-article text and label dictionaries.

    Parameters
    ----------
    fullpath : str
        Path of the case file. The article key is taken from it with
        ``extract_article`` and the label from whether it contains
        ``"non-violation"``.
    x_dataset : dict
        Maps an article key to the list of preprocessed case texts.
        Updated in place.
    y_dataset : dict
        Maps an article key to the list of labels (0 when the path contains
        ``"non-violation"``, 1 otherwise). Updated in place.
    file : str
        Raw text of the case file; it is run through ``preprocess`` first.

    Returns
    -------
    tuple
        The same ``(x_dataset, y_dataset)`` objects that were passed in.
    """
    article = extract_article(fullpath)
    text = preprocess(file)
    label = 0 if "non-violation" in fullpath else 1
    # Append in place: rebuilding the list with `lst + [item]` for every file
    # copies the whole list each time (quadratic in the number of files).
    # setdefault also keeps the two dictionaries independent, so a key that is
    # missing from only one of them no longer resets or breaks the other.
    x_dataset.setdefault(article, []).append(text)
    y_dataset.setdefault(article, []).append(label)
    return x_dataset, y_dataset

We use a regular expression to extract the article identifier (e.g. `Article10`) from the file's full path, and append the file's text to the list stored under that article.

In [5]:
def extract_article(path):
    """Return the first ``Article<digits>`` token found in ``path``.

    For example, a path containing ``...\\Article10\\...`` yields ``"Article10"``.
    Raises ``AttributeError`` when ``path`` contains no such token, because
    ``re.search`` then returns ``None``.
    """
    return re.search(r"(Article\d+)", path).group(1)

### Preprocessing 

In [6]:
def preprocess(file):
    """Preprocess the raw text of one case file.

    The only preprocessing step is ``extract_paragraphs``; its result is
    returned unchanged.
    """
    return extract_paragraphs(file)

In [7]:
def extract_paragraphs(file):
    """Return the procedure and facts sections of a judgment text.

    Control characters and characters in the range ``\\x7f-\\xff`` are removed
    first. The text is then searched (case-insensitively, with ``.`` matching
    newlines) for an optional ``PROCEDURE`` section followed by the facts
    section, up to the start of the next section (e.g. ``THE LAW``).

    Returns the procedure section (if matched) concatenated with the facts
    section. Raises ``AttributeError`` when the pattern does not match at all,
    because ``re.search`` then returns ``None``.
    """
    cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', file)
    # Adjacent raw literals are concatenated into one pattern string.
    section_pattern = (
        r'(PROCEDURE\s*\n.+?)?'
        r'((THE CIRCUMSTANCES OF THE CASE\s*\n.+?RELEVANT DOMESTIC LAW.+?)'
        r'|(\n(AS TO THE FACTS|THE FACTS|FACTS)\s*\n.+?))'
        r'(\nIII\.|THE LAW\s*\n|PROCEEDINGS BEFORE THE COMMISSION\s*\n'
        r'|ALLEGED VIOLATION OF ARTICLE [0-9]+ OF THE CONVENTION \s*\n)'
    )
    match = re.search(section_pattern, cleaned, re.S | re.IGNORECASE)
    procedure = match.group(1)  # None when there is no PROCEDURE section
    facts = match.group(2)
    prefix = "" if procedure is None else procedure
    return prefix + facts

### Loading the data

In [8]:
# Root folder of the dataset, relative to the working directory.
# os.path.join keeps the path valid on non-Windows systems too.
base_path = os.path.join("Datasets", "Human rights dataset")

In [9]:
# Read the training and test splits into per-article dictionaries.
# os.path.join replaces the hard-coded "\\" separator so the cell is not Windows-only.
X_train_docs, Y_train_docs = read_dataset(os.path.join(base_path, "train"))
X_test_docs, Y_test_docs = read_dataset(os.path.join(base_path, "test20"))

In [10]:
# Articles found in the training split.
X_train_docs.keys()

dict_keys(['Article10', 'Article11', 'Article12', 'Article13', 'Article14', 'Article18', 'Article2', 'Article3', 'Article4', 'Article5', 'Article6', 'Article7', 'Article8'])

In [11]:
def select_articles(train_set, threshold=50):
    """Return a deep copy of ``train_set`` without the sparsely populated articles.

    Parameters
    ----------
    train_set : dict
        Maps an article key to the list of items for that article.
    threshold : int, optional
        Articles with ``threshold`` or fewer items are dropped; only articles
        with strictly more items are kept. Defaults to 50.

    Returns
    -------
    dict
        A new dictionary (``train_set`` itself is not modified) holding deep
        copies of the retained entries, in the original key order.
    """
    # Filter first so entries that are dropped anyway are never deep-copied.
    retained = {
        article: items
        for article, items in train_set.items()
        if len(items) > threshold
    }
    return copy.deepcopy(retained)

In [12]:
# Drop the articles with too few training cases.
# NOTE(review): only the texts are filtered here; Y_train_docs and the test
# dictionaries keep every article, and the cells below pick the articles by
# explicit key.
X_train_docs = select_articles(X_train_docs)

In [13]:
# Articles that remain after the selection.
X_train_docs.keys()

dict_keys(['Article10', 'Article11', 'Article13', 'Article14', 'Article2', 'Article3', 'Article5', 'Article6', 'Article8'])

### Combining all the selected articles into single train and test sets

In [14]:
# Concatenate the training texts of the selected articles in a fixed article
# order; the label list must use the same order to stay aligned.
X_train = [
    case
    for number in (2, 3, 5, 6, 8, 10, 11, 13, 14)
    for case in X_train_docs[f"Article{number}"]
]

In [15]:
# Concatenate the test texts of the selected articles in a fixed article
# order; the label list must use the same order to stay aligned.
X_test = [
    case
    for number in (2, 3, 5, 6, 8, 10, 11, 13, 14)
    for case in X_test_docs[f"Article{number}"]
]

In [16]:
# Concatenate the training labels of the selected articles in a fixed article
# order; the text list must use the same order to stay aligned.
Y_train = [
    label
    for number in (2, 3, 5, 6, 8, 10, 11, 13, 14)
    for label in Y_train_docs[f"Article{number}"]
]

In [17]:
# Concatenate the test labels of the selected articles in a fixed article
# order; the text list must use the same order to stay aligned.
Y_test = [
    label
    for number in (2, 3, 5, 6, 8, 10, 11, 13, 14)
    for label in Y_test_docs[f"Article{number}"]
]

In [18]:
# Sanity check: texts and labels must have matching sizes in each split.
tuple(len(part) for part in (X_train, Y_train, X_test, Y_test))

(3131, 3131, 784, 784)

### Creating the Classifier DataBunches

Credit to fast.ai for explaining and providing the code this section is based on (https://github.com/fastai/course-nlp/blob/master/5-nn-imdb.ipynb).

In [22]:
# Folder (under the working directory) where the DataBunches are stored.
# os.path.join replaces the hard-coded "\\" separator so the cell is not Windows-only.
path = os.path.join(os.getcwd(), "Data")

In [20]:
# dtype=object stores each case as a reference to its Python str. Without it
# NumPy builds a fixed-width unicode array in which every element is padded to
# the length of the longest judgment, which wastes a lot of memory.
X_train_np = np.array(X_train, dtype=object)
Y_train_np = np.array(Y_train)

X_test_np = np.array(X_test, dtype=object)
Y_test_np = np.array(Y_test)

# One row per case: the extracted text and its 0/1 label.
train_cases = {'Case': X_train_np, 'Label': Y_train_np}
all_train_df = DataFrame(train_cases, columns=['Case', 'Label'])

test_cases = {'Case': X_test_np, 'Label': Y_test_np}
test_df = DataFrame(test_cases, columns=['Case', 'Label'])

In [23]:
# Language-model DataBunch built from all training cases.
# split_none() creates no validation set, which is why the training tables
# further down show "#na#" for the validation loss.
data_lm = (TextList.from_df(all_train_df, cols='Case')
          .split_none()
          .label_for_lm()
          .databunch(bs=16, num_workers=1))
# os.path.join replaces the hard-coded "\\" separator (same result on Windows).
data_lm.save(os.path.join(path, 'data_lm_export_test20.pkl'))

In [24]:
# Classification DataBunch over the same training cases, reusing the
# language-model vocabulary so the fine-tuned encoder can be loaded later.
# split_none() creates no validation set here either.
data_clas = (TextList.from_df(all_train_df, cols='Case', vocab=data_lm.vocab)
            .split_none()
            .label_from_df(cols='Label')
            .databunch(bs=16, num_workers=1))
# os.path.join replaces the hard-coded "\\" separator (same result on Windows).
data_clas.save(os.path.join(path, 'data_clas_export_test20.pkl'))

In [25]:
# Both DataBunches should report the same vocabulary size.
tuple(len(bunch.vocab.itos) for bunch in (data_clas, data_lm))

(48128, 48128)

In [26]:
# Classification DataBunch whose validation set is the held-out test cases;
# it shares the language-model vocabulary.
data_clas_test = TextClasDataBunch.from_df(
    path,
    train_df=all_train_df,
    valid_df=test_df,
    vocab=data_lm.vocab,
    text_cols='Case',
    label_cols='Label',
    bs=16,
)
data_clas_test.save('data_clas_test_export_test20.pkl')

### Loading the DataBunches

In [3]:
# File names of the three saved DataBunches: language model, classifier,
# and classifier with the test set as validation data.
lm_filename, clas_filename, clas_test_filename = (
    'data_lm_export_test20.pkl',
    'data_clas_export_test20.pkl',
    'data_clas_test_export_test20.pkl',
)

In [4]:
# The saved DataBunches live in the "Data" folder under the working directory.
# os.path.join replaces the hard-coded "\\" separator so the cell is not Windows-only.
base_path = os.getcwd()
path = os.path.join(base_path, "Data")

In [5]:
# Batch size shared by the three DataBunches and by the learning-rate scaling below.
bs = 16
# Reload the saved DataBunches in the order: language model, classifier, classifier + test.
# NOTE(review): the files are ".pkl" exports, presumably pickle-based; only load files you produced yourself.
data_lm, data_clas, data_clas_test = (
    load_data(path, filename, bs=bs)
    for filename in (lm_filename, clas_filename, clas_test_filename)
)

In [6]:
# Select GPU 0 and display the free GPU memory reported by fastai's helper.
torch.cuda.set_device(0)
gpu_mem_get_free_no_cache()

4021

### Creating and Fine-tuning the Language Model

In [8]:
# Language-model learner with the AWD_LSTM architecture on the language-model DataBunch.
learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
# Explicitly move the model to the GPU.
learn_lm.model = learn_lm.model.cuda()

In [9]:
# Base learning rate of 1e-2, scaled by the batch size relative to 48.
lr = 1e-2 * (bs / 48)
# Switch the learner to mixed-precision training; ";" suppresses the cell output.
learn_lm.to_fp16();

In [10]:
# One one-cycle epoch at ten times the base learning rate, before unfreezing.
learn_lm.fit_one_cycle(1, lr*10, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,3.677093,#na#,1:59:35,


In [11]:
# Unfreeze the learner and train three more one-cycle epochs at the base learning rate.
learn_lm.unfreeze()
learn_lm.fit_one_cycle(3, lr, moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,2.925367,#na#,2:01:17,
1,2.82362,#na#,2:00:36,
2,2.680472,#na#,1:59:19,


In [12]:
# Checkpoint names for the fine-tuned language model and for its encoder.
model_save, enc_save = "fine_tuned_test20", "fine_tuned_enc_test20"

In [13]:
# Save the fine-tuned language model, and separately its encoder for the classifier to load.
learn_lm.save(model_save)
learn_lm.save_encoder(enc_save)

### Classification

In [7]:
# Same checkpoint names as in the language-model section, repeated here;
# presumably so this section can run in a fresh kernel (the execution counts restart).
model_save, enc_save = "fine_tuned_test20", "fine_tuned_enc_test20"

In [8]:
# Text classifier learner (AWD_LSTM, mixed precision) on the classification DataBunch.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5).to_fp16()
# Load the encoder saved from the fine-tuned language model, then freeze the learner.
learn_c.load_encoder(enc_save)
learn_c.freeze()

In [9]:
# Validation metrics for the classifier.
# With one label per case, micro-averaged precision, recall and F-beta all
# equal accuracy, so they added no information. 'binary' averaging scores the
# positive class (label 1), and beta=1 makes FBeta the F1 score, consistent
# with the sklearn precision/recall/F1 reported in the Testing section.
learn_c.metrics = [accuracy,
                   Precision(average='binary'),
                   Recall(average='binary'),
                   FBeta(average='binary', beta=1)]

In [12]:
# Stage 1: one one-cycle epoch at learning rate 1e-2 with the learner frozen.
learn_c.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.608677,#na#,39:31,,,,


In [13]:
# Stage 2: freeze_to(-2), then one epoch with a learning-rate slice from 1e-2/2.6**4 up to 1e-2.
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(1, slice(1e-2/(2.6**4), 1e-2), moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.640671,#na#,33:09,,,,


In [14]:
# Checkpoint after the second stage.
learn_c.save('2nd_test20')

In [10]:
# Restore the second-stage checkpoint; the learner repr is displayed as the cell output.
learn_c.load('2nd_test20')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3131 items)
x: TextList
xxbos xxup procedure 
  1.the case originated in an application ( no . xxunk / 03 ) against the xxmaj republic of xxmaj bulgaria lodged with the xxmaj court under xxmaj article 34 of the xxmaj convention for the xxmaj protection of xxmaj human xxmaj rights and xxmaj fundamental xxmaj freedoms ( “ the xxmaj convention ” ) by a xxmaj bulgarian national , xxmaj mr xxmaj xxunk xxmaj xxunk xxmaj pankov ( “ the applicant ” ) , on 28 xxmaj march 2003 . 
  2.the applicant was represented by xxmaj mr xxup y. xxmaj grozev , a lawyer practising in xxmaj sofia and by xxmaj mr xxup b. xxmaj boev , formerly a lawyer practising in xxmaj sofia , who on 7october 2008 was granted leave under xxmaj rule 36 4 ( a ) in fine of the xxmaj rules of xxmaj court to continue representing the applicant . xxmaj the xxmaj bulgarian xxmaj government ( “ the xxmaj government ” ) were represented by their xxmaj agents , xxmaj ms xxup n. xxma

In [11]:
# Stage 3: freeze_to(-3), then one epoch with a learning-rate slice from 5e-3/2.6**4 up to 5e-3.
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(1, slice(5e-3/(2.6**4), 5e-3), moms=(0.7, 0.6))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time
0,0.586339,#na#,36:51,,,,


In [12]:
# Checkpoint after the third stage.
learn_c.save('3rd_test20')

In [10]:
# Restore the third-stage checkpoint; the learner repr is displayed as the cell output.
learn_c.load('3rd_test20')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3131 items)
x: TextList
xxbos xxup procedure 
  1.the case originated in an application ( no . xxunk / 03 ) against the xxmaj republic of xxmaj bulgaria lodged with the xxmaj court under xxmaj article 34 of the xxmaj convention for the xxmaj protection of xxmaj human xxmaj rights and xxmaj fundamental xxmaj freedoms ( “ the xxmaj convention ” ) by a xxmaj bulgarian national , xxmaj mr xxmaj xxunk xxmaj xxunk xxmaj pankov ( “ the applicant ” ) , on 28 xxmaj march 2003 . 
  2.the applicant was represented by xxmaj mr xxup y. xxmaj grozev , a lawyer practising in xxmaj sofia and by xxmaj mr xxup b. xxmaj boev , formerly a lawyer practising in xxmaj sofia , who on 7october 2008 was granted leave under xxmaj rule 36 4 ( a ) in fine of the xxmaj rules of xxmaj court to continue representing the applicant . xxmaj the xxmaj bulgarian xxmaj government ( “ the xxmaj government ” ) were represented by their xxmaj agents , xxmaj ms xxup n. xxma

In [11]:
# Final stage: unfreeze the learner and train one epoch with a learning-rate slice from 2e-3/2.6**4 up to 2e-3.
# NOTE(review): the recorded run of this cell failed with "CUDA out of memory"
# on GPU 0 (4.00 GiB total capacity), so no epoch results were produced here.
learn_c.unfreeze()
learn_c.fit_one_cycle(1, slice(2e-3/(2.6**4), 2e-3), moms=(0.9, 0.8))

epoch,train_loss,valid_loss,accuracy,precision,recall,f_beta,time


RuntimeError: CUDA out of memory. Tried to allocate 34.00 MiB (GPU 0; 4.00 GiB total capacity; 2.76 GiB already allocated; 12.35 MiB free; 144.20 MiB cached)

In [None]:
# Save the final classifier checkpoint that the Testing section loads.
# NOTE(review): this cell has no execution count in the saved notebook and the
# preceding fit raised a CUDA out-of-memory error; confirm that the
# 'clas_test20' checkpoint comes from a completed run.
learn_c.save('clas_test20')

### Testing

In [0]:
# Make the classifier DataBunch use the language-model vocabulary.
data_clas.vocab.itos = data_lm.vocab.itos
# (Left disabled by the author: the same assignment for data_clas_test.)
# data_clas_test.vocab.itos = data_lm.vocab.itos
# Rebuild the classifier and load the final checkpoint.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5).to_fp16()
learn_c.load('clas_test20')
# data_clas has no validation set, so borrow the validation DataLoader of
# data_clas_test (built from the test cases); get_preds on DatasetType.Valid
# then evaluates the test set.
data_clas.valid_dl = data_clas_test.valid_dl

In [0]:
# Predictions and true labels for the validation DataLoader, which holds the test cases.
y_test_pred, y_true = learn_c.get_preds(ds_type=DatasetType.Valid, ordered=True)

In [0]:
# Predicted class = index of the highest score along dimension 1.
preds = y_test_pred.argmax(dim=1)

In [0]:
import sklearn.metrics as sm
# Compute performance metrics of the classifier on the test set.
# The predictions come from `learn_c`, the AWD_LSTM text classifier, not from
# a logistic regression, so the header names the model actually evaluated.
print("AWD-LSTM classifier performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, preds), 2))
print("Accuracy score =", sm.accuracy_score(y_true, preds))
print("Recall score =", sm.recall_score(y_true, preds))
print("Precision score =", sm.precision_score(y_true, preds))
print("F1 score =", sm.f1_score(y_true, preds))

Logistic regression performance:
Mean absolute error = 0.13
Accuracy score = 0.8698979591836735
Recall score = 0.8061224489795918
Precision score = 0.9239766081871345
F1 score = 0.8610354223433242
