# Predicting Judicial Decisions of the European Court of Human Rights

In [1]:
import numpy as np
import re
import os
import copy

In [2]:
import sklearn.metrics as sm
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [3]:
import spacy
from spacy.lang import en

In [4]:
scoring = {'accuracy': make_scorer(sm.accuracy_score),
           'precision': make_scorer(sm.precision_score),
           'recall': make_scorer(sm.recall_score),
           'f1': make_scorer(sm.f1_score)}

In [5]:
def read_dataset(PATH):
    X_dataset = {}
    Y_dataset = {}
    for path, dirs, files in os.walk(PATH):
        for filename in files:
            fullpath = os.path.join(path, filename)
            if "both" not in fullpath:
                with open(fullpath, 'r', encoding="utf8") as file:
                    X_dataset, Y_dataset = add_file_to_dataset(fullpath, X_dataset, Y_dataset, file.read())

    return X_dataset, Y_dataset       

In [6]:
def add_file_to_dataset(fullpath, x_dataset, y_dataset, file):
    article = extract_article(fullpath)
    file = preprocess(file)
    if article not in x_dataset.keys() :
        x_dataset[article] = []
        y_dataset[article] = []
    x_dataset[article] = x_dataset[article] + [file]
    label = 0 if "non-violation" in fullpath else 1
    y_dataset[article] = y_dataset[article] + [label]
    return x_dataset, y_dataset  

In [7]:
def extract_article(path): 
    pattern = r"(Article\d+)"
    result = re.search(pattern, path)
    article = result.group(1)
    return article

### Preprocessing 

In [8]:
def preprocess(file): 
    file = extract_paragraphs(file)
    return file

In [9]:
def extract_paragraphs(file): 
    file = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', file)
    pat = r'(PROCEDURE\s*\n.+?)?((THE CIRCUMSTANCES OF THE CASE\s*\n.+?RELEVANT DOMESTIC LAW.+?)|(\n(AS TO THE FACTS|THE FACTS)\s*\n.+?))(\nIII\.|THE LAW\s*\n|PROCEEDINGS BEFORE THE COMMISSION\s*\n|ALLEGED VIOLATION OF ARTICLE [0-9]+ OF THE CONVENTION \s*\n)'
    result = re.search(pat, file, re.S |  re.IGNORECASE)
    if result is None:
        print(repr(file))
    content = ""
    if result.group(1) is not None:
        content += result.group(1)
    content += result.group(2)
    return content

### Loading the data

In [10]:
base_path = "Datasets\\Human rights dataset"

In [11]:
X_train_docs, Y_train_docs = read_dataset(base_path + "\\train")
#X_extra_test_docs, Y_extra_test = read_dataset(base_path + "\\test_violations")

In [12]:
X_train_docs.keys()

dict_keys(['Article10', 'Article11', 'Article12', 'Article13', 'Article14', 'Article18', 'Article2', 'Article3', 'Article4', 'Article5', 'Article6', 'Article7', 'Article8'])

In [13]:
def select_articles(train_set):
    selected_training_set = copy.deepcopy(train_set)
    
    for key in train_set.keys():
        if len(train_set[key]) <= 50:
            selected_training_set.pop(key)
            continue
    return selected_training_set

In [14]:
X_train_docs = select_articles(X_train_docs)

In [15]:
X_train_docs.keys()

dict_keys(['Article10', 'Article11', 'Article13', 'Article14', 'Article2', 'Article3', 'Article5', 'Article6', 'Article8'])

### Combining all the articles according to class

In [16]:
X_train = X_train_docs["Article2"] + X_train_docs["Article3"] + X_train_docs["Article5"] + X_train_docs["Article6"] + X_train_docs["Article8"] + X_train_docs["Article10"] + X_train_docs["Article11"] + X_train_docs["Article13"] + X_train_docs["Article14"]

In [17]:
print(str(len(X_train_docs["Article2"])) + "+" + str(len(X_train_docs["Article3"])) + "+" + str(len(X_train_docs["Article5"])) + "+" + str(len(X_train_docs["Article6"])) + "+" + str(len(X_train_docs["Article8"])) + "+" + str(len(X_train_docs["Article10"])) + "+" + str(len(X_train_docs["Article11"])) + "+" + str(len(X_train_docs["Article13"])) + "+" + str(len(X_train_docs["Article14"])) + "=" + str(len(X_train)))

114+568+300+916+457+212+64+212+288=3131


In [18]:
Y_train = Y_train_docs["Article2"] + Y_train_docs["Article3"] + Y_train_docs["Article5"] + Y_train_docs["Article6"] + Y_train_docs["Article8"] + Y_train_docs["Article10"] + Y_train_docs["Article11"] + Y_train_docs["Article13"] + Y_train_docs["Article14"]

In [19]:
len(Y_train)

3131

In [20]:
# tokenize the doc and lemmatize its tokens
lemmatizer = spacy.lang.en.English()
def my_tokenizer(doc):
    tokens = lemmatizer(doc)
    return([token.lemma_ for token in tokens])

### Tokenization with TfIdfVectorizer and Classification with LR

In [21]:
vect = TfidfVectorizer(ngram_range=(2,4), lowercase=True, tokenizer=my_tokenizer, max_features=600000, min_df=3)
term_doc_matrix = vect.fit_transform(X_train).toarray()

In [22]:
print("The number of features: " + str(len(vect.get_feature_names())))

The number of features: 600000


#### Run #1

In [23]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [24]:
X_train_new.shape

(3131, 300000)

In [25]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [26]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.710
Precision: 0.730
Recall: 0.690
F1: 0.697


In [27]:
scores["fit_time"].mean(), scores["score_time"].mean()

(206.76720082759857, 18.61684639453888)

#### Run #2

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.738
Precision: 0.747
Recall: 0.720
F1: 0.733


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(185.85568478107453, 21.988081789016725)

#### Run #3

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.737
Precision: 0.745
Recall: 0.721
F1: 0.733


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(182.87630417346955, 16.60242943763733)

#### Run #4

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.736
Precision: 0.744
Recall: 0.720
F1: 0.731


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(188.40387692451478, 19.728974437713624)

#### Run #5

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.738
Precision: 0.748
Recall: 0.718
F1: 0.732


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(173.06495959758757, 16.428086495399477)

#### Run #6

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.737
Precision: 0.748
Recall: 0.715
F1: 0.730


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(219.6618717432022, 22.58730866909027)

#### Run #7

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.738
Precision: 0.747
Recall: 0.719
F1: 0.733


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(212.202121758461, 19.439407801628114)

#### Run #8

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.738
Precision: 0.749
Recall: 0.720
F1: 0.733


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(161.91309115886688, 11.865291166305543)

#### Run #9

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.735
Precision: 0.745
Recall: 0.716
F1: 0.729


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(186.56428835391998, 25.659661650657654)

#### Run #10

In [23]:
from sklearn.utils import shuffle

In [24]:
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [25]:
X_train_new = SelectKBest(chi2, k=300000).fit_transform(term_doc_matrix, Y_train)

In [26]:
X_train_new.shape

(3131, 300000)

In [27]:
classifier_instance = LogisticRegression(solver = 'lbfgs')        
scores = cross_validate(classifier_instance, term_doc_matrix, Y_train, cv=10, scoring=scoring)

In [28]:
print("Accuracy: %0.3f" % (scores["test_accuracy"].mean()))
print("Precision: %0.3f" % (scores["test_precision"].mean()))
print("Recall: %0.3f" % (scores["test_recall"].mean()))
print("F1: %0.3f" % (scores["test_f1"].mean()))

Accuracy: 0.741
Precision: 0.753
Recall: 0.719
F1: 0.735


In [29]:
scores["fit_time"].mean(), scores["score_time"].mean()

(193.98135352134705, 21.373994183540344)