In [54]:
import os
import json

from typing import List

import numpy as np

from scipy.sparse.csr import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [55]:
class CharType: 
    char = 0
    num = 1
    space = 2
    other = 3

def tokenize(text: str, lower: bool=True, ignore_punc: bool=True) -> List[str]:
    curr = CharType.char
    tokens = []
    curr_word_len = 0
    in_tag = False
    
    t_len = len(text)
    
    for i, t in enumerate(text): 
        prev = curr 
        if t.isdigit():
            curr = CharType.num
        elif t == ' ':
            curr = CharType.space
        elif t.isalpha():
            curr = CharType.char
        else:
            curr = CharType.other
            
        change = False 
        
        if prev != curr:
            change = True 
        elif curr == CharType.other and i > 0 and text[i-1] != t:
            change = True 
        

        if change:
            start = i-curr_word_len
            if start < 0:
                start = 0 

            if curr_word_len != 0:
                if not in_tag:
                    if prev != CharType.other:
                        tokens.append(text[start:i])
                    elif not ignore_punc:
                        tokens.append(text[start:i])

                curr_word_len = 0

        if i == t_len:
            if curr != CharType.space:
                if not_in_tag:
                    if curr != CharType.other: 
                        tokens.append(text[i-curr_word_len:i+1])
                    elif not ignore_punc:
                        tokens.append(text[i-curr_word_len:i+1])

                curr_word_len = 0

        if curr != CharType.space and not in_tag:
            curr_word_len += 1
            
        if curr == CharType.other:
            if t == '<':
                in_tag = True
            elif t == '>':
                in_tag = False
          
    if lower:
        for i, t in enumerate(tokens):
            tokens[i] = t.lower()

    return tokens
        

In [77]:
# Generate list of repealed files

repealed_leg = set()
lookups = {}

with open('/home/danlocke/legislation/comm_acts.json') as f:
    data = json.load(f)
    for item in data:
        lookups[item['id'].lower()] = item
        if item['repealed'] == 'Y':
            repealed_leg.add(item['id'].lower())

In [78]:
path = '/home/danlocke/legislation/comm_parsed'

class Encoder: 
    
    def __init__(self): 
        self._lookup = {}
        self._vocab= []
        self._items = 0
        
    def add(self, tokens: List[str]):
        for t in tokens:
            if t not in self._lookup: 
                self._items += 1
                self._lookup[t] = self._items
                self._vocab.append(t)
                
    def encode(self, tokens: List[str]) -> List[int]:
        ret = [0] * len(tokens)
        for i, t in enumerate(tokens):
            ret[i] = self._lookup.get(t, 0)
            
        return ret
        
encoder = Encoder()

class DataHolder:
    def __init__(self):
        self._lookup = {}
        self._lt_lookup ={}
        self._lt = []
        self._vals = []
        self._titles = []
        self._texts = []
        self._long_title_vecs = None
        self._lt_emb = None
        self._title_vecs = None
        self._text_vecs = None
        
    def add_long_title(self, act: str, lt: str):
        self._lt_lookup[act] = len(self._lt)
        self._lt.append(lt)
        
    def add(self, text: str, title: str, name: str):
        if name in self._lookup:
#             print("-"*40)
#             print("{0} already exists".format(name))
#             ind = self._lookup[name]
#             print(self._titles[ind])
#             print(self._texts[ind])
#             print('*'*10)
#             print(title)
#             print(text)
            return 
            
        self._lookup[name] = len(self._vals)
        self._vals.append(name)
        self._titles.append(title)
        self._texts.append(text)
        
    def add_vecs(self, title_vecs: csr_matrix, text_vecs: csr_matrix):
        self._title_vecs = title_vecs
        self._text_vecs = text_vecs
        
    def add_lt_vec(self, vec: csr_matrix):
        self._long_title_vecs = vec
        
    def texts(self) -> List[str]:
        return self._texts
    
    def titles(self) -> List[str]:
        return self._titles
    
    def long_titles(self) -> List[str]:
        return self._lt

    def long_title_vecs(self) -> csr_matrix:
        return self._long_title_vecs
    
    def title_vecs(self) -> csr_matrix:
        return self._title_vecs
    
    def text_vecs(self) -> csr_matrix:
        return self._text_vecs
    
    def print_row(self, row: int): 
        print(self._vals[row])
        print(self._titles[row])
        print(self._texts[row])
        
    def get_matching_rows_substr(self, sub: str) -> List[int]:
        return [self._lookup[x] for x in self._vals if sub in x]
    
    def num_rows(self) -> int:
        return self._titles.shape[0]
    
    def top_titles(self, k: int):
        count_titles = {}
        for title in self._titles:
            if title not in count_titles:
                count_titles[title] = 0
            count_titles[title] += 1

        n = 0
        for k, v in sorted(count_titles.items(), key=lambda item: item[1], reverse=True):
            print(k, v)
            n += 1
            if n > k:
                break

In [100]:
repealedDataHolder = DataHolder()
currentDataHolder = DataHolder()

default_prefix = 's'
default_sch_prefix = 's'

for root, d, files in os.walk(path):
    for f_name in files:
        if f_name.endswith('.json'):
            act_name = f_name[:-5].lower()
            if act_name[:2] == "sl":
                continue
            
            prefix = lookups[act_name].get('prefix', default_prefix)
            sch_prefix = lookups[act_name].get('schedule_prefix', default_sch_prefix)
            
            print(act_name, prefix, sch_prefix)
                
            with open(os.path.join(root, f_name)) as f:
                data = json.load(f)
                
                if 'body' not in data:
                    continue
                
                enc_sch = False 
                enc_sch_name = ''
                for i, item in enumerate(data['body']):
                    section = item['tag'].replace('<b>', '').replace('</b>', '')
                    

                    if i == 0 or i == 1:
                        joined = ' '.join(tokenize(item['tag']) + tokenize(item['type']))
#                         print(joined)
                        if 'an act' in joined:
                            if act_name in repealed_leg:
                                repealedDataHolder.add_long_title(act_name, joined)
                            else: 
                                currentDataHolder.add_long_title(act_name, joined)
                        continue
                    
                    if len(section) == 0:
                        continue
                        
                    if section[0].isdigit():
                        if enc_sch:
                            section = '{0} {1} {2}'. format(enc_sch_name, sch_prefix, section)
                        else:
                            section = '{0} {1}'. format(prefix, section)
                    elif "Schedule" in section:
                        enc_sch = True
                        enc_sch_name = section.replace("Schedule", "sch")
                        continue
                    else:
                        continue
                    
                    title = ' '.join(tokenize(item['type']))
#                     if title in exclude_some:
#                         continue
                   
                    text = ' '.join(tokenize(item['text']))
                    name = '{0}_{1}'.format(act_name, section)

                    if act_name in repealed_leg: 
                        repealedDataHolder.add(text, title, name)
                    else:
                        currentDataHolder.add(text, title, name)
#                     encoder.add(tokenize(item['text']))
#                     encoder.add(tokenize(item['type']))
                                            

c2020c00130 s s
c2013q00005xn01 s s
f2010c00457 reg cl
cl184 s s
c2020c00120 s s
c2010c00519 s s
c2018c00342 s s
f2017c00182 reg cl
c2019c00103 s s
c2020c00079 s s
c2020c00084 s s
c2019c00028 s s
c2020c00137 s s


In [101]:
stopwords = ['the', 'of', 'to', 'in', 'for', 'that', 'and','on', 'is',
             'be', 'by', 'a', 'an', 'was', 'it', 'as', 'this', 'which', 'with', 'have', 'at', 'been', 'there',
             'no', 'or', 'from', 'has', 'any', 'i', 'would', 'were', 'had', 'are', 'if', 'also','before', 'but', 'his', 'other',
             'those', 'so', 'he', 'did', 'its', 'her', 'she', 'hers']

vectorizer = TfidfVectorizer(min_df=1, stop_words=stopwords)                                                                                                                                                                                                   
vectorizer.fit(repealedDataHolder.titles() + currentDataHolder.titles() + repealedDataHolder.texts() + currentDataHolder.texts() + repealedDataHolder.long_titles() + currentDataHolder.long_titles())                                                                                                                                                                                                                       

TfidfVectorizer(stop_words=['the', 'of', 'to', 'in', 'for', 'that', 'and', 'on',
                            'is', 'be', 'by', 'a', 'an', 'was', 'it', 'as',
                            'this', 'which', 'with', 'have', 'at', 'been',
                            'there', 'no', 'or', 'from', 'has', 'any', 'i',
                            'would', ...])

In [102]:
def load_vectors(path: str):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        f.readline()
        for line in f:
            values = line.rstrip().rsplit(' ')
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings[word] = coefs

    return embeddings

embeddings = load_vectors('/home/danlocke/fastText/filtered-100d.vec')


def embed(tokens: List[str]) -> np.array:
    e = [embeddings[x] for x in tokens if x in embeddings]
    if len(e) == 0:
        return np.zeros((100,))
    return np.sum(e, axis=0)

In [103]:
repealedDataHolder.add_vecs(vectorizer.transform(repealedDataHolder.titles()), vectorizer.transform(repealedDataHolder.texts()))
currentDataHolder.add_vecs(vectorizer.transform(currentDataHolder.titles()), vectorizer.transform(currentDataHolder.texts()))

currentDataHolder.add_lt_vec(vectorizer.transform(currentDataHolder.long_titles()))
repealedDataHolder.add_lt_vec(vectorizer.transform(repealedDataHolder.long_titles()))

currentDataHolder._lt_emb = np.stack([embed(tokenize(x)) for x in currentDataHolder._lt], axis=0)
repealedDataHolder._lt_emb = np.stack([embed(tokenize(x)) for x in repealedDataHolder._lt], axis=0)

In [104]:
# identity objects of each act so we can do this as a matching criteria ..
def get_objects(holder: DataHolder, _all: bool=False) -> List[int]:
    if _all:
        inds = [i for i in range(len(holder._titles))]
    else:
        inds = [i for i, x in enumerate(holder._titles) if x == "objects" or x == "purpose" or x == "object"]
    acts = {holder._vals[x].split('_')[0]: x for x in inds}
    return acts


In [105]:
# some typical legislative headings that we aren't really interested in 
exclude_some = {
    'preliminary',
    'short title',
    'commencement',
    'definitions',
    'standard definitions',
    'dictionary',
    'application of act',
    'object',
    'interpretation',
    'act binds all persons',
    'key definitions',
    'regulation making power',
    'application of division',
    'approved forms', 
    'approval of forms',
    'interpretation',
    'act binds the crown',
}

def get_links(a, b, exclude=exclude_some, match_lt: bool = False, match_obj: bool = False):
    
    same = a == b
    a_objs = get_objects(a)
    if same:
        b_objs = a_objs
    else:
        b_objs = get_objects(b)

    links = []
    for row in range(a._title_vecs.shape[0]):
        if a._texts[row] == '' or a._titles[row] == '' or a._titles[row] in exclude_some:
            continue
        
        title_sims = cosine_similarity(a._title_vecs[row:row+1], b.title_vecs()).flatten()
        text_sims = cosine_similarity(a._text_vecs[row:row+1], b.text_vecs()).flatten()
            
        inds = []
        seen = set()
        
        sorted_text = text_sims.argsort()[:-10:-1]
        for ind in sorted_text:
            if same and ind == row:
                continue
            if text_sims[ind] < 0.95: 
                break 
                
            inds.append(ind)
            seen.add(ind)
        
        sorted_title = title_sims.argsort()[:-10:-1]
        for ind in sorted_title:
            if same and ind == row:
                continue
            if ind in seen:
                continue
            if title_sims[ind] < 0.9:
                break
            if b._texts[ind] == '':
                continue
            if 'definition' in a._titles[row] or 'application' in a._titles[row] or 'objects' in a._titles[row] or 'purpose' in a._titles[row]:
                if text_sims[ind] < 0.9: 
                    continue
            elif text_sims[ind] < 0.6: 
                    continue
                
            inds.append(ind)

        if len(inds) == 0:
            continue

#         print('-'*100)
#         currentDataHolder.print_row(row)
        act_name = a._vals[row].split('_')[0]
        lt_row = a._lt_lookup[act_name] if act_name in a._lt_lookup else None
        object_row = a_objs[act_name] if act_name in a_objs else None
        
        # if lt_row is not None:
        # print('\nlong title: {0}'.format(currentDataHolder._lt[lt_row]))

    #     if object_row is not None:
    #         print(currentDataHolder._texts[object_row])

        for ind in inds:
#             print('*'*5)
#             repealedDataHolder.print_row(ind)
#             print(title_sims[ind], text_sims[ind])# len(currentDataHolder._text_vecs.getrow(row).nonzero()[0]), len(repealedDataHolder.text_vecs().getrow(ind).nonzero()[0]))
            lt_sim = -1.0
            lt_emb_sim = -1.0
            obj_sim = -1.0
            
            match_act = b._vals[ind].split('_')[0]
            if match_act in b._lt_lookup and lt_row is not None:
                match_lt_row = b._lt_lookup[match_act]
                lt_sim = cosine_similarity(a._long_title_vecs[lt_row:lt_row+1], b._long_title_vecs[match_lt_row:match_lt_row+1])[0][0]
                lt_emb_sim = cosine_similarity(a._lt_emb[lt_row:lt_row+1], b._lt_emb[match_lt_row:match_lt_row+1])[0][0]
                if match_lt and lt_emb_sim < 0.9:
                    continue
#                 print(cosine_similarity(currentDataHolder._long_title_vecs[lt_row:lt_row+1], repealedDataHolder._long_title_vecs[match_lt_row:match_lt_row+1])[0], 
#                      cosine_similarity(currentDataHolder._lt_emb[lt_row:lt_row+1], repealedDataHolder._lt_emb[match_lt_row:match_lt_row+1])[0])
#                 print('matched long title: {0}'.format(repealedDataHolder._lt[match_lt_row]))
            
                if match_act in b_objs and object_row is not None:
                    m_obj_ind = b_objs[match_act]
                    obj_sim = cosine_similarity(a._text_vecs[object_row:object_row+1], b._text_vecs[m_obj_ind:m_obj_ind+1])[0][0]
#                 print(pairwise_distances(currentDataHolder._text_vecs[object_row:object_row+1], repealedDataHolder._text_vecs[m_obj_ind:m_obj_ind+1], 'cosine'))
#             print()
            links.append({'from': a._vals[row], 'to': b._vals[ind], 'title_sim': float(title_sims[ind]), 'text_sim': float(text_sims[ind]), 
                          'lt_sim': float(lt_sim), 'lt_emb_sim': float(lt_emb_sim), 'obj_sim': float(obj_sim)})
            
    return links



In [106]:
lt_links = get_links(currentDataHolder, repealedDataHolder, match_lt=False)

In [107]:
lt_curr_links = get_links(currentDataHolder, currentDataHolder, match_lt=False)
# lt_rep_link = get_links(repealedDataHolder, repealedDataHolder, match_lt=True)

In [108]:
act_cnts = {}
for link in lt_links: 
    _to = link['to'].split('_')[0]
    _from = link['from'].split('_')[0]
    if _from not in act_cnts:
        act_cnts[_from] = {}

    act_cnts[_from][_to] = act_cnts[_from].get(_to, 0) + 1

In [109]:
def get_len(a, b, substr):
    l = len(a.get_matching_rows_substr(substr))
    if l == 0: 
        return len(b.get_matching_rows_substr(substr))
    return l

for k, v in act_cnts.items():
    f_len = get_len(currentDataHolder, repealedDataHolder, k)
        
    for k2, v2 in v.items():
        t_len = get_len(currentDataHolder, repealedDataHolder, k2)
        small = f_len if f_len < t_len else t_len
        print(k, k2, f_len, t_len, v2, float(v2) / float(small))

c2020c00130 cl184 682 2213 6 0.008797653958944282
c2020c00130 c2010c00519 682 1122 1 0.001466275659824047
c2020c00120 c2010c00519 1025 1122 2 0.001951219512195122
c2018c00342 c2010c00519 338 1122 4 0.011834319526627219
f2017c00182 f2010c00457 45 43 13 0.3023255813953488
c2019c00103 c2010c00519 546 1122 5 0.009157509157509158
c2020c00079 c2010c00519 1489 1122 1074 0.9572192513368984
c2020c00079 cl184 1489 2213 4 0.002686366689053056
c2020c00084 c2010c00519 459 1122 52 0.11328976034858387
c2020c00084 cl184 459 2213 9 0.0196078431372549
c2019c00028 c2010c00519 83 1122 1 0.012048192771084338
c2019c00028 cl184 83 2213 12 0.14457831325301204
c2020c00137 cl184 3357 2213 1192 0.5386353366470854
c2020c00137 c2010c00519 3357 1122 12 0.0106951871657754


In [110]:
for i in lt_links:
    if 'f2017' in i['from']:
        print(i)

{'from': 'f2017c00182_reg 4', 'to': 'f2010c00457_reg 3', 'title_sim': 1.0000000000000002, 'text_sim': 0.796114083287532, 'lt_sim': -1.0, 'lt_emb_sim': -1.0, 'obj_sim': -1.0}
{'from': 'f2017c00182_sch 1 cl 1', 'to': 'f2010c00457_sch cl 1', 'title_sim': 1.0, 'text_sim': 1.0000000000000002, 'lt_sim': -1.0, 'lt_emb_sim': -1.0, 'obj_sim': -1.0}
{'from': 'f2017c00182_sch 1 cl 2', 'to': 'f2010c00457_sch cl 2', 'title_sim': 1.0000000000000002, 'text_sim': 1.0, 'lt_sim': -1.0, 'lt_emb_sim': -1.0, 'obj_sim': -1.0}
{'from': 'f2017c00182_sch 1 cl 5', 'to': 'f2010c00457_sch cl 4', 'title_sim': 1.0, 'text_sim': 0.8384635246839994, 'lt_sim': -1.0, 'lt_emb_sim': -1.0, 'obj_sim': -1.0}
{'from': 'f2017c00182_sch 1 cl 8', 'to': 'f2010c00457_sch cl 6', 'title_sim': 1.0, 'text_sim': 0.6159874693323342, 'lt_sim': -1.0, 'lt_emb_sim': -1.0, 'obj_sim': -1.0}
{'from': 'f2017c00182_sch 1 cl 10', 'to': 'f2010c00457_sch cl 11', 'title_sim': 0.5759880567878783, 'text_sim': 0.9624264010642745, 'lt_sim': -1.0, 'lt_em

In [111]:
with open('comm_links.json', 'w') as f:
    json.dump(lt_links, f, indent=4)
    
with open('comm_curr_links.json', 'w') as f:
    json.dump(lt_curr_links, f, indent=4)

In [74]:
# file_names = ['lt_links', 'lt_curr_links', 'lt_rep_links']
# datas = [lt_links, lt_curr_links, lt_rep_link]
# for i, f_name in enumerate(file_names):
#     with open(f_name+'.json', 'w') as f:
#         json.dump(datas[i], f, indent=4)

In [10]:
with open('/home/danlocke/go/src/github.com/dan-locke/phd-data/case-topics.json', 'rb') as f:
    data = json.load(f)

In [42]:
for topic in data['topics']:
    print(topic['topic'])
#     tokens = tokenize(topic['topic'])
#     vec = vectorizer.transform([' '.join(tokens)])
#     sims = cosine_similarity(vec[0:1], currentDataHolder._text_vecs)[0]
#     sim_inds = sims.argsort()[:-10:-1]
#     for ind in sim_inds:
# #         if sims[ind] < 0.5: 
# #             continue
#         currentDataHolder.print_row(ind)
#         print(sims[ind])
#     print('-'*30)

What is the effect of reinstating a company that was in liquidation as regards money that may be recovered?
Is the variation of the date for settlement required to be in writing?
Maintenance and champerty and the requisite degree of control
agency fees and effective cause of sale of a boat
Should damages be reduced according to a sum that represents betterment
Consideration of a clause that provides that the contract is subject to the buyer being satisfied, in its absolute discretion, with the due diligence by a date
Maintenance and champerty
proportionality as a basis for striking out a defamation claim
In trade or commerce
Whether membership of an organisation is in trade or commerce
Organisations owing members a duty of care
That proprietary relief is only available after rescission 
Exemplary damages and retaining the benefit of any brokerage being a basis for their assessment
Google search results and defamation
Postponing the giving of particulars of a claim until after discovery

In [None]:
# tokens = tokenize('Is the variation of the date for settlement required to be in writing?')
# vec = vectorizer.transform([' '.join(tokens)])
# rows = currentDataHolder.get_matching_rows_substr('act-1974-076')
# sims = cosine_similarity(vec, currentDataHolder._text_vecs[rows[0]:rows[-1]])[0]
# sim_inds = sims.argsort()[:-10:-1]
# for ind in sim_inds:
#     currentDataHolder.print_row(rows[0]+ind)
#     print(sims[ind])