In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from fastai import *
from fastai.text import *
from fastai.utils.mem import gpu_mem_get_free_no_cache
from sklearn.model_selection import train_test_split

In [0]:
import numpy as np
import re
import os
import copy
import torch

To read our dataset, we use os.walk to walk through a sub-tree of directories and files and load all of our training data and labels. We avoid the folder 'both' as the files inside are labelled both as violation and non-violation.
Our data set will be loaded into dictionaries, the keys corresponding to articles and the values will be a list of cases (X - our training set) or labels (Y).

In [0]:
def read_dataset(PATH):
    X_dataset = {}
    Y_dataset = {}
    for path, dirs, files in os.walk(PATH):
        for filename in files:
            fullpath = os.path.join(path, filename)
            if "both" not in fullpath:
                with open(fullpath, 'r', encoding="utf8") as file:
                    X_dataset, Y_dataset = add_file_to_dataset(fullpath, X_dataset, Y_dataset, file.read())

    return X_dataset, Y_dataset       

In [0]:
def add_file_to_dataset(fullpath, x_dataset, y_dataset, file):
    article = extract_article(fullpath)
    file = preprocess(file)
    if article not in x_dataset.keys() :
        x_dataset[article] = []
        y_dataset[article] = []
    x_dataset[article] = x_dataset[article] + [file]
    label = 0 if "non-violation" in fullpath else 1
    y_dataset[article] = y_dataset[article] + [label]
    return x_dataset, y_dataset  

We use regex to extract the number of the Article from the fullpath and insert the file into the list under that specific Article.

In [0]:
def extract_article(path): 
    pattern = r"(Article\d+)"
    result = re.search(pattern, path)
    article = result.group(1)
    return article

### Preprocessing 

In [0]:
def preprocess(file): 
    file = extract_paragraphs(file)
    return file

In [0]:
def extract_paragraphs(file): 
    file = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', file)
    pat = r'(PROCEDURE\s*\n.+?)?((THE CIRCUMSTANCES OF THE CASE\s*\n.+?RELEVANT DOMESTIC LAW.+?)|(\n(AS TO THE FACTS|THE FACTS)\s*\n.+?))(\nIII\.|THE LAW\s*\n|PROCEEDINGS BEFORE THE COMMISSION\s*\n|ALLEGED VIOLATION OF ARTICLE [0-9]+ OF THE CONVENTION \s*\n)'
    result = re.search(pat, file, re.S |  re.IGNORECASE)
    content = ""
    if result.group(1) is not None:
        content += result.group(1)
    content += result.group(2)
    return content

### Loading the data

In [0]:
base_path = "/content/drive/My Drive/Colab Notebooks/Datasets/Human rights dataset"

In [0]:
X_train_docs, Y_train_docs = read_dataset(base_path + "/train")
X_test_docs, Y_test = read_dataset(base_path + "\\test20")
X_extra_test_docs, Y_extra_test = read_dataset(base_path + "\\test_violations")

In [0]:
X_train_docs.keys()

dict_keys(['Article11', 'Article10', 'Article12', 'Article13', 'Article5', 'Article3', 'Article4', 'Article18', 'Article6', 'Article7', 'Article14', 'Article2', 'Article8'])

In [11]:
X_test_docs.keys()

dict_keys([])

In [12]:
X_extra_test_docs.keys()

dict_keys([])

### Combining all the articles according to class

In [0]:
X_train = X_train_docs["Article2"] + X_train_docs["Article3"] + X_train_docs["Article5"] + X_train_docs["Article6"] + X_train_docs["Article8"] + X_train_docs["Article10"] + X_train_docs["Article11"] + X_train_docs["Article13"] + X_train_docs["Article14"]

In [0]:
X_train = X_train + X_test_docs["Article2"] + X_test_docs["Article3"] + X_test_docs["Article5"] + X_test_docs["Article6"] + X_test_docs["Article8"] + X_test_docs["Article10"] + X_test_docs["Article11"] + X_test_docs["Article13"] + X_test_docs["Article14"]

In [0]:
X_train = X_train + X_extra_test_docs["Article2"] + X_extra_test_docs["Article3"] + X_extra_test_docs["Article5"] + X_extra_test_docs["Article6"] + X_extra_test_docs["Article8"] + X_extra_test_docs["Article10"] + X_extra_test_docs["Article11"] + X_extra_test_docs["Article13"] + X_extra_test_docs["Article14"]

In [0]:
total_words = 0
average_words = 0
i = 0
for doc in X_train:
    i += 1
    total_words += len(doc.split())
average_words = total_words/i

In [0]:
total_words, average_words