In [12]:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import os
import json


def get_item(name):
    # link
    path = "output/" + name + ".html"
    
    # file
    file = open(path, "r", encoding="utf-8")

    # read file
    html = BeautifulSoup(file.read())
    
    # the content of the page
    content = html.find("div", {"id": "content"})

    # we want the Erwägung
    erw = content.find('h1', text="Erwägungen")

    data = {}

    # we take the paragraphs
    paragraphs = []
    
    while erw:
        if erw.find('strong') is not None:
            paragraph = {}
            
            # we want the original text
            paragraph["original"] = str(erw)
            
            # the index of the paragraph
            index = erw.find('strong').extract().text
            paragraph["index"] = index
            
            # we find the links
            links = erw.findChildren("a", {"class": "LexLink"})
            hrefs = []
            for link in links:
                hrefs.append(link["href"][1:])
                link.extract()
            paragraph["links"] = hrefs
            
            # we remove to get text
            thinsp = erw.findChildren("span", {"class": "thinsp"})
            for t in thinsp:
                t.extract()
            notes = erw.findChildren("span", {"class": "note"})
            for note in notes:
                note.extract()
            pagebk = erw.findChildren("span", {"class": "pagebreak"})
            for bk in pagebk:
                bk.extract()
            
            # we save only the text
            paragraph["content"] = erw.text
            
            # check that we have a big enough text
            if len(paragraph["content"]) > 10:
                paragraphs.append(paragraph)
                
    
        # next paragraph
        erw = erw.findNext("p")
    
    # save the paragraphs and return
    data["paragraphs"] = paragraphs
    return data


path = "entscheide.txt"
with open(path, "r") as file:
    names = file.read().splitlines()
    
output = {}
for name in names:
    output[name] = get_item(name)


with open('json_data.json', 'w', encoding="utf-8") as outfile:
    content = json.dumps(output, indent=4, ensure_ascii=False)
    outfile.write(content)

In [30]:
import ssdeep
import json
with open('/home/ilya/openlegal/json_data.json', 'r') as inputfile:
    data = json.loads(inputfile.read())


In [41]:
def hash_all_paragraphs(data):
    #count the hashes done so we can report something
    hash_counter = 0
    
    for decision in data:
        for paragraph in decision["paragraphs"]:
            content = paragraph['content']
            paragraph["hash"] = ssdeep.hash(content)
            
    print(hash_counter)


In [114]:
def compare_all_paragraphs(data):

     
    number_of_paragraphs = 0
    #for decision in data:
    #    for paragraph in decision:
    #        number_of_paragraphs = number_of_paragraphs+1
    
    cmpmatrix = {}
    hashlink = {}
    
    #compare_score_matrix = [[0 for _ in range(number_of_paragraphs)] for _ in range(number_of_paragraphs)]
    for decision1 in data:
        for decision2 in data:
            if decision1 == decision2:
                break
            else:
                for paragraph1 in decision1["paragraphs"]:
                    x = paragraph1["hash"]
                    for paragraph2 in decision2["paragraphs"]:
                        y = paragraph2["hash"]
                        compare = ssdeep.compare(x,y)
                        
                        if compare:
                            #print("\n")
                            #print(compare)
                            #print(paragraph1["content"])
                            #print(paragraph2["content"])
                            xint = int.from_bytes(bytes(x,'ascii'),byteorder="big") # big/small, this is arbitrary but MUST be constant
                            yint = int.from_bytes(bytes(y,'ascii'),byteorder="big") # big/small, this is arbitrary but MUST be constant
                            if xint < yint:
                                try:
                                    cmpmatrix[x][y] = compare ## ordre correct!
                                except KeyError:
                                    cmpmatrix[x] = {y: compare}
                            else:
                                try:
                                    cmpmatrix[y][x] = compare ## ordre correct!
                                except KeyError:
                                    cmpmatrix[y] = {x: compare}
                            try:
                                hashlink[x].append( ( decision1['name'], paragraph1['index']) )
                            except KeyError:
                                hashlink[x] = [( decision1['name'], paragraph1['index']) ]
                            try:
                                hashlink[y].append( ( decision2['name'], paragraph2['index']) )
                            except KeyError:
                                hashlink[y] = [( decision2['name'], paragraph2['index']) ]

                        
    print(cmpmatrix)
                


In [52]:
hash_all_paragraphs(data)

0


In [38]:
with open('/home/ilya/openlegal/data_with_hash.json', 'w') as outfile:
    outfile.write(json.dumps(data, indent=4))

In [115]:
compare_all_paragraphs(data)

{'12:AEqZMFdJ2ZIQvxqpAHaIy7JPaR8VeevKo9wWnaIiBQ1FBXVM+AVN:Al0mxvYI+7JCaVHKo9wWnTz/BXVM/VN': {'12:AEqZMFbaqZIQvxqpAHaIy7JPaR8TevKo9wWnaIiBAokYXE30L/FmA1GE61ckoMO:Al0JxvYI+7JCa+Ko9wWnTENXEkSE66MO': 68, '12:AEqZMFdJqZIQvxqpAHaIy7JPaR8VeevKo9wWnaIiBQ1FBXVM+AVN:Al0CxvYI+7JCaVHKo9wWnTz/BXVM/VN': 99}, '24:Al0CxvYI+7JCaVHKo9wWnTz/BXVM/ivwAR2rPn+Fa2jRQ:ACC5Yz7JrVq2tnhVgivwq2bn+Fa2jRQ': {'12:AEqZMFbaqZIQvxqpAHaIy7JPaR8TevKo9wWnaIiBAokYXE30L/FmA1GE61ckoMO:Al0JxvYI+7JCa+Ko9wWnTENXEkSE66MO': 55, '12:AEqZMFdJ2ZIQvxqpAHaIy7JPaR8VeevKo9wWnaIiBQ1FBXVM+AVN:Al0mxvYI+7JCaVHKo9wWnTz/BXVM/VN': 75, '12:AEqZMFdJqZIQvxqpAHaIy7JPaR8VeevKo9wWnaIiBQ1FBXVM+AVN:Al0CxvYI+7JCaVHKo9wWnTz/BXVM/VN': 77}, '24:aLDcMqmU136BVG2AnEmTEZZ3FyDNNgvnN7KKKCUrZkBTsy:aLDSmU13L2jZGDN2fu9iBp': {'12:4Emtm13qPIRpqtBCWHmvC1lNWn7AmznM65T4eG6sE1euhzzlNKjApgwXslhxFg9I:4EmU136BVGHnEmrMUTqFyDNNgWEPMcLH': 47}, '6:NMdcAF2igrclfDav8VJOTjzyiV7+E/MYU8ncIO574EKtvxP9F1jY26IQAFXs:NCF2iY6fDakUiiV6B8y57RcrFd+IvXs': {'3:NMGKcAF28Rd2o8KFydPRXRVzAF8eg+xbR