# Predicting Judicial Decisions of the European Court of Human Rights

In [1]:
import numpy as np
import re
import os
import copy

In [2]:
import sklearn.metrics as sm
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [3]:
import spacy
from spacy.lang import en

In [4]:
# Scorer objects keyed by metric name, each wrapping sklearn.metrics.<name>_score.
# NOTE(review): presumably meant for cross_validate(..., scoring=scoring); the
# visible cells never use it.
scoring = {
    metric: make_scorer(getattr(sm, metric + "_score"))
    for metric in ("accuracy", "precision", "recall", "f1")
}

In [5]:
def read_dataset(PATH):
    """Recursively load every document under ``PATH`` into per-article datasets.

    Files whose full path contains the substring "both" are skipped. Every
    other file is read as UTF-8 and handed to ``add_file_to_dataset``, which
    derives the article key and the label from the path.

    Directories and files are visited in sorted order so the document order is
    identical on every run and platform (``os.walk`` gives no ordering
    guarantee of its own).

    Args:
        PATH: Root directory of the dataset split to load.

    Returns:
        Tuple ``(X_dataset, Y_dataset)`` of dicts keyed by article name. The
        lists stored under the same key are index-aligned (text, label).
    """
    X_dataset = {}
    Y_dataset = {}
    for path, dirs, files in os.walk(PATH):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for filename in sorted(files):
            fullpath = os.path.join(path, filename)
            if "both" in fullpath:
                continue
            with open(fullpath, 'r', encoding="utf8") as file:
                X_dataset, Y_dataset = add_file_to_dataset(fullpath, X_dataset, Y_dataset, file.read())

    return X_dataset, Y_dataset

In [6]:
def add_file_to_dataset(fullpath, x_dataset, y_dataset, file):
    """Append one document and its label to the per-article datasets.

    Args:
        fullpath: Path of the source file. The article key is extracted from
            it with ``extract_article``; the label is 0 when the path contains
            "non-violation" and 1 otherwise.
        x_dataset: Dict mapping article name to a list of preprocessed texts.
            Mutated in place.
        y_dataset: Dict mapping article name to a list of integer labels.
            Mutated in place.
        file: Raw text content of the document.

    Returns:
        The same ``(x_dataset, y_dataset)`` objects that were passed in.
    """
    article = extract_article(fullpath)
    text = preprocess(file)
    label = 0 if "non-violation" in fullpath else 1
    # Append in place rather than rebuilding the whole list on every call.
    x_dataset.setdefault(article, []).append(text)
    y_dataset.setdefault(article, []).append(label)
    return x_dataset, y_dataset

In [7]:
def extract_article(path):
    """Return the first ``Article<number>`` token found in ``path``.

    Args:
        path: File path expected to contain a component such as "Article10".

    Returns:
        The matched text, e.g. "Article10".

    Raises:
        ValueError: If ``path`` contains no ``Article<number>`` token.
    """
    pattern = r"(Article\d+)"
    result = re.search(pattern, path)
    if result is None:
        # Fail with a message naming the offending path instead of an
        # AttributeError on None.
        raise ValueError("No 'Article<number>' component found in path: " + repr(path))
    return result.group(1)

### Preprocessing 

In [8]:
def preprocess(file):
    """Return the text of ``file`` reduced by ``extract_paragraphs``.

    Currently the only preprocessing step is the section extraction.
    """
    return extract_paragraphs(file)

In [9]:
def extract_paragraphs(file):
    """Extract the procedure and facts sections from a judgment text.

    The text is first stripped of the characters in the ranges \\x00-\\x08,
    \\x0b, \\x0c, \\x0e-\\x1f and \\x7f-\\xff (tab, newline and carriage return
    are kept). Two case-insensitive, dot-matches-newline searches follow:

    1. Primary pattern: an optional "PROCEDURE" block, then either
       "THE CIRCUMSTANCES OF THE CASE ... RELEVANT DOMESTIC LAW ..." or a
       facts heading ("AS TO THE FACTS", "THE FACTS" or "FACTS"), up to but
       excluding a terminating heading ("III.", "THE LAW", "PROCEEDINGS
       BEFORE THE COMMISSION" or "ALLEGED VIOLATION OF ARTICLE n").
    2. Fallback pattern, used when the primary one does not match: only the
       optional "PROCEDURE" block in front of a terminating heading.

    Args:
        file: Full text of one document.

    Returns:
        The concatenated matched sections. An empty string is returned when
        the fallback matches without a "PROCEDURE" block or when nothing
        matches; in the latter case the document is also printed.
    """
    file = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', file)
    pat = r'(PROCEDURE\s*\n.+?)?((THE CIRCUMSTANCES OF THE CASE\s*\n.+?RELEVANT DOMESTIC LAW.+?)|(\n(AS TO THE FACTS|THE FACTS|FACTS)\s*\n.+?))(\nIII\.|THE LAW\s*\n|PROCEEDINGS BEFORE THE COMMISSION\s*\n|ALLEGED VIOLATION OF ARTICLE [0-9]+|ALLEGED VIOLATION OF ARTICLE [0-9]+ OF THE CONVENTION \s*\n)'
    result = re.search(pat, file, re.S | re.IGNORECASE)
    if result is not None:
        # Group 1 (PROCEDURE) is optional; group 2 (facts) always participates.
        return (result.group(1) or "") + result.group(2)

    pat = r'(PROCEDURE\s*\n.+?)?(\nIII\.|THE LAW\s*\n|PROCEEDINGS BEFORE THE COMMISSION\s*\n|ALLEGED VIOLATION OF ARTICLE [0-9]+ OF THE CONVENTION \s*\n)'
    result = re.search(pat, file, re.S | re.IGNORECASE)
    if result is None:
        # No recognizable structure: show the document for manual inspection.
        print(file)
        return ""
    # The PROCEDURE group is optional here too, so it may be None even though
    # the terminating heading matched.
    return result.group(1) or ""

### Loading the data

In [10]:
# Root folder of the dataset. Built with os.path.join so the separator is
# correct on every platform; a literal backslash only works on Windows.
base_path = os.path.join("Datasets", "Human rights dataset")

In [11]:
# Load the training split and the extra test split (the "test_violations"
# folder) into per-article dicts of texts and labels.
(X_train_docs, Y_train_docs), (X_extra_test_docs, Y_extra_test_docs) = (
    read_dataset(base_path + split) for split in ("/train", "/test_violations")
)

In [12]:
# Show which articles are present in the training split.
X_train_docs.keys()

dict_keys(['Article10', 'Article11', 'Article12', 'Article13', 'Article14', 'Article18', 'Article2', 'Article3', 'Article4', 'Article5', 'Article6', 'Article7', 'Article8'])

In [13]:
# Show which articles are present in the extra test split.
X_extra_test_docs.keys()

dict_keys(['Article10', 'Article11', 'Article12', 'Article13', 'Article14', 'Article18', 'Article2', 'Article3', 'Article4', 'Article5', 'Article6', 'Article7', 'Article8', 'Article9'])

### Combining all the articles according to class

In [14]:
# Pool the training texts of the selected articles into one flat list.
# The article order here must stay the same as the one used for Y_train.
X_train = [
    text
    for article in (
        "Article2", "Article3", "Article5", "Article6", "Article8",
        "Article10", "Article11", "Article13", "Article14",
    )
    for text in X_train_docs[article]
]

In [15]:
#X_extra_test = X_extra_test_docs["Article2"] + X_extra_test_docs["Article3"] + X_extra_test_docs["Article5"] + X_extra_test_docs["Article6"] + X_extra_test_docs["Article8"] + X_extra_test_docs["Article10"] + X_extra_test_docs["Article11"] + X_extra_test_docs["Article13"] + X_extra_test_docs["Article14"]

In [16]:
# Pool the training labels of the selected articles into one flat list.
# The article order here must stay the same as the one used for X_train.
Y_train = [
    label
    for article in (
        "Article2", "Article3", "Article5", "Article6", "Article8",
        "Article10", "Article11", "Article13", "Article14",
    )
    for label in Y_train_docs[article]
]

In [17]:
#Y_extra_test = Y_extra_test_docs["Article2"] + Y_extra_test_docs["Article3"] + Y_extra_test_docs["Article5"] + Y_extra_test_docs["Article6"] + Y_extra_test_docs["Article8"] + Y_extra_test_docs["Article10"] + Y_extra_test_docs["Article11"] + Y_extra_test_docs["Article13"] + Y_extra_test_docs["Article14"]

In [18]:
#len(X_train), len(Y_train), len(X_extra_test), len(Y_extra_test)

### Tokenization with TfidfVectorizer and Classification with Logistic Regression

In [19]:
# spaCy English language object, created once and shared by every tokenizer call.
lemmatizer = spacy.lang.en.English()


def my_tokenizer(doc):
    """Run ``doc`` through the spaCy object and return each token's ``lemma_``."""
    return [tok.lemma_ for tok in lemmatizer(doc)]

#### Test_violations evaluation

In [20]:
# TF-IDF over lemma 2- to 4-grams, capped at 600,000 features and ignoring
# n-grams seen in fewer than 3 documents.
vect = TfidfVectorizer(ngram_range=(2,4), lowercase=True, tokenizer=my_tokenizer, max_features=600000, min_df=3)
# Keep the sparse matrix returned by fit_transform. A dense copy of about
# 3,131 x 600,000 float64 values needs roughly 15 GB, and the feature selector
# and classifier used below accept sparse input.
term_doc_matrix = vect.fit_transform(X_train)

In [21]:
# from sklearn.utils import shuffle
# X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train)

In [22]:
# Keep the 300,000 features with the highest chi-squared score against the labels.
# NOTE(review): X_train_new is only inspected for its shape; the classifier in
# the visible cells is fitted on the full term_doc_matrix.
X_train_new = SelectKBest(score_func=chi2, k=300000).fit_transform(
    X=term_doc_matrix,
    y=Y_train,
)
X_train_new.shape

(3131, 300000)

In [23]:
# Logistic regression trained on the full TF-IDF matrix (not on the
# chi-squared-selected X_train_new), so the test cells can apply vect.transform
# output to it directly.
classifier = LogisticRegression(solver='lbfgs')
classifier.fit(term_doc_matrix, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Article 2 results

In [24]:
# Vectorize the Article 2 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article2"])

In [25]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [26]:
# Report error and classification scores for the Article 2 predictions.
y_true = Y_extra_test_docs["Article2"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.25
Accuracy score = 0.7512562814070352
Recall score = 0.7512562814070352
Precision score = 1.0
F1 score = 0.8579626972740315


In [27]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article2"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 99, 299)

#### Article 3 results

In [28]:
# Vectorize the Article 3 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article3"])

In [29]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [30]:
# Report error and classification scores for the Article 3 predictions.
y_true = Y_extra_test_docs["Article3"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.18
Accuracy score = 0.8155111633372503
Recall score = 0.8155111633372503
Precision score = 1.0
F1 score = 0.8983818770226537


In [31]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article3"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 157, 694)

#### Article 5 results

In [32]:
# Vectorize the Article 5 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article5"])

In [33]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [34]:
# Report error and classification scores for the Article 5 predictions.
y_true = Y_extra_test_docs["Article5"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.21
Accuracy score = 0.7906976744186046
Recall score = 0.7906976744186046
Precision score = 1.0
F1 score = 0.8831168831168831


In [35]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article5"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 234, 884)

#### Article 6 results

In [36]:
# Vectorize the Article 6 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article6"])

In [37]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [38]:
# Report error and classification scores for the Article 6 predictions.
y_true = Y_extra_test_docs["Article6"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.19
Accuracy score = 0.8123167155425219
Recall score = 0.8123167155425219
Precision score = 1.0
F1 score = 0.8964401294498382


In [39]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article6"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 768, 3324)

#### Article 8 results

In [40]:
# Vectorize the Article 8 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article8"])

In [41]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [42]:
# Report error and classification scores for the Article 8 predictions.
y_true = Y_extra_test_docs["Article8"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.44
Accuracy score = 0.5625
Recall score = 0.5625
Precision score = 1.0
F1 score = 0.72


In [43]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article8"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 217, 279)

#### Article 10 results

In [44]:
# Vectorize the Article 10 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article10"])

In [45]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [46]:
# Report error and classification scores for the Article 10 predictions.
y_true = Y_extra_test_docs["Article10"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.54
Accuracy score = 0.4603174603174603
Recall score = 0.4603174603174603
Precision score = 1.0
F1 score = 0.6304347826086957


In [47]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article10"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 136, 116)

#### Article 11 results

In [48]:
# Vectorize the Article 11 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article11"])

In [49]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [50]:
# Report error and classification scores for the Article 11 predictions.
y_true = Y_extra_test_docs["Article11"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.3
Accuracy score = 0.6966292134831461
Recall score = 0.6966292134831461
Precision score = 1.0
F1 score = 0.8211920529801325


In [51]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article11"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 27, 62)

#### Article 13 results

In [52]:
# Vectorize the Article 13 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article13"])

In [53]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [54]:
# Report error and classification scores for the Article 13 predictions.
y_true = Y_extra_test_docs["Article13"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.16
Accuracy score = 0.8358490566037736
Recall score = 0.8358490566037736
Precision score = 1.0
F1 score = 0.9105858170606371


In [55]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article13"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 174, 886)

#### Article 14 results

In [56]:
# Vectorize the Article 14 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article14"])

In [57]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [58]:
# Report error and classification scores for the Article 14 predictions.
y_true = Y_extra_test_docs["Article14"]
print("Logistic regression performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
for caption, metric_fn in (
    ("Accuracy score =", sm.accuracy_score),
    ("Recall score =", sm.recall_score),
    ("Precision score =", sm.precision_score),
    ("F1 score =", sm.f1_score),
):
    print(caption, metric_fn(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.86
Accuracy score = 0.13636363636363635
Recall score = 0.13636363636363635
Precision score = 1.0
F1 score = 0.24000000000000002


In [59]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article14"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 38, 6)

### Tokenization with TfidfVectorizer and Classification with Linear SVC

In [19]:
# TF-IDF over word 2- to 4-grams using the vectorizer's default tokenizer,
# capped at 600,000 features and ignoring n-grams seen in fewer than 3 documents.
vect = TfidfVectorizer(ngram_range=(2,4), lowercase=True, max_features=600000, min_df=3)
# Keep the sparse matrix returned by fit_transform. A dense copy of about
# 3,131 x 600,000 float64 values needs roughly 15 GB, and shuffle, the feature
# selector and the classifier used below all accept sparse input.
term_doc_matrix = vect.fit_transform(X_train)

In [20]:
# vocabulary_ maps each retained n-gram to its column index, so its length is
# the feature count. It is used in place of get_feature_names(), which is
# deprecated and removed in newer scikit-learn releases.
print("The number of features: " + str(len(vect.vocabulary_)))

The number of features: 600000


In [21]:
from sklearn.utils import shuffle
# Apply one common permutation to texts, features and labels so they stay
# aligned. The fixed random_state makes the order, and therefore the fitted
# model, reproducible across runs.
X_train, term_doc_matrix, Y_train = shuffle(X_train, term_doc_matrix, Y_train, random_state=0)

In [22]:
# Keep the 300,000 features with the highest chi-squared score against the labels.
# NOTE(review): X_train_new is only inspected for its shape; the classifier in
# the visible cells is fitted on the full term_doc_matrix.
X_train_new = SelectKBest(score_func=chi2, k=300000).fit_transform(
    X=term_doc_matrix,
    y=Y_train,
)
X_train_new.shape

(3131, 300000)

In [23]:
# Linear SVM (C=0.1, at most 1500 iterations) trained on the full TF-IDF
# matrix (not on the chi-squared-selected X_train_new), so the test cells can
# apply vect.transform output to it directly.
classifier = svm.LinearSVC(C=0.1, max_iter=1500)
classifier.fit(term_doc_matrix, Y_train)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1500,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

#### Article 2 results

In [24]:
# Vectorize the Article 2 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article2"])

In [25]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [26]:
# Report error and classification scores for the Article 2 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article2"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.26
Accuracy score = 0.7412060301507538
Recall score = 0.7412060301507538
Precision score = 1.0
F1 score = 0.8513708513708513


In [27]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article2"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 103, 295)

#### Article 3 results

In [28]:
# Vectorize the Article 3 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article3"])

In [29]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [30]:
# Report error and classification scores for the Article 3 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article3"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.19
Accuracy score = 0.8143360752056404
Recall score = 0.8143360752056404
Precision score = 1.0
F1 score = 0.8976683937823835


In [31]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article3"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 158, 693)

#### Article 5 results

In [32]:
# Vectorize the Article 5 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article5"])

In [33]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [34]:
# Report error and classification scores for the Article 5 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article5"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.21
Accuracy score = 0.7924865831842576
Recall score = 0.7924865831842576
Precision score = 1.0
F1 score = 0.8842315369261476


In [35]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article5"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 232, 886)

#### Article 6 results

In [36]:
# Vectorize the Article 6 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article6"])

In [37]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [38]:
# Report error and classification scores for the Article 6 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article6"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.19
Accuracy score = 0.8054740957966764
Recall score = 0.8054740957966764
Precision score = 1.0
F1 score = 0.8922577152138603


In [39]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article6"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 796, 3296)

#### Article 8 results

In [40]:
# Vectorize the Article 8 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article8"])

In [41]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [42]:
# Report error and classification scores for the Article 8 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article8"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.44
Accuracy score = 0.5564516129032258
Recall score = 0.5564516129032258
Precision score = 1.0
F1 score = 0.7150259067357513


In [43]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article8"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 220, 276)

#### Article 10 results

In [44]:
# Vectorize the Article 10 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article10"])

In [45]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [46]:
# Report error and classification scores for the Article 10 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article10"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.56
Accuracy score = 0.4365079365079365
Recall score = 0.4365079365079365
Precision score = 1.0
F1 score = 0.6077348066298343


In [47]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article10"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 142, 110)

#### Article 11 results

In [48]:
# Vectorize the Article 11 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article11"])

In [49]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [50]:
# Report error and classification scores for the Article 11 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article11"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.3
Accuracy score = 0.6966292134831461
Recall score = 0.6966292134831461
Precision score = 1.0
F1 score = 0.8211920529801325


In [51]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article11"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 27, 62)

#### Article 13 results

In [52]:
# Vectorize the Article 13 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article13"])

In [53]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [54]:
# Report error and classification scores for the Article 13 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article13"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.17
Accuracy score = 0.8264150943396227
Recall score = 0.8264150943396227
Precision score = 1.0
F1 score = 0.9049586776859504


In [55]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article13"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 184, 876)

#### Article 14 results

In [56]:
# Vectorize the Article 14 test documents with the vocabulary fitted on the
# training data. The result is kept sparse: predict() accepts sparse input and
# a dense n_docs x n_features copy is needlessly large.
test_term_doc = vect.transform(X_extra_test_docs["Article14"])

In [57]:
# Predict a label (0 = non-violation, 1 = violation, as assigned in
# add_file_to_dataset) for every row of the test matrix built in the previous cell.
y_test_pred = classifier.predict(test_term_doc)

In [58]:
# Report error and classification scores for the Article 14 predictions.
# The header names the Linear SVC model that produced y_test_pred in this
# section (it previously said "Logistic regression").
y_true = Y_extra_test_docs["Article14"]
print("Linear SVC performance:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_true, y_test_pred), 2))
print("Accuracy score =", sm.accuracy_score(y_true, y_test_pred))
print("Recall score =", sm.recall_score(y_true, y_test_pred))
print("Precision score =", sm.precision_score(y_true, y_test_pred))
print("F1 score =", sm.f1_score(y_true, y_test_pred))

Logistic regression performance:
Mean absolute error = 0.86
Accuracy score = 0.13636363636363635
Recall score = 0.13636363636363635
Precision score = 1.0
F1 score = 0.24000000000000002


In [59]:
from sklearn.metrics import confusion_matrix
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the true
# and predicted labels; otherwise ravel() yields a single value and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(Y_extra_test_docs["Article14"], y_test_pred, labels=[0, 1]).ravel()
(tn, fp, fn, tp)

(0, 0, 38, 6)