## Point-in-Time

In [None]:
from LegalDefAgent.src.existdb import existdb_handler
import polars as pl
from tqdm import tqdm


def get_definition_timeline(definition):
    """Return the dated versions of a definition, oldest first.

    The lookup uses ``definition['metadata']`` and proceeds in two steps:

    1. ``existdb_handler.find_consolidated`` is queried. When it yields
       entries, they are de-duplicated on their whitespace-stripped
       ``'definition'`` text (keeping the entry with the earliest ``'date'``
       for each distinct text) and sorted by date.
    2. Otherwise ``existdb_handler.extract_definition_from_exist`` is queried
       and its entries are returned in the order received.

    In both cases each returned entry's ``'date'`` is replaced, in place, by
    its ``'%Y-%m-%d'`` string form.

    Args:
        definition: Mapping with a ``'metadata'`` key that is passed through
            to the eXist handler (schema not visible here — see the caller).

    Returns:
        A list of entry dicts, or ``None`` when neither lookup yields anything.
    """
    metadata = definition['metadata']

    consolidated = existdb_handler.find_consolidated(metadata)
    if not consolidated:
        # Nothing consolidated: fall back to the plain extraction.
        versions = existdb_handler.extract_definition_from_exist(metadata)
        if not versions:
            return None
        for version in versions:
            version['date'] = version['date'].strftime('%Y-%m-%d')
        return versions

    # Map each distinct definition text to the earliest entry carrying it.
    first_seen = {}
    for version in consolidated:
        text = version['definition'].strip()
        known = first_seen.get(text)
        if known is not None and not (version['date'] < known['date']):
            # An entry at least as old is already stored for this text.
            continue
        first_seen[text] = version

    timeline = list(first_seen.values())
    timeline.sort(key=lambda version: version['date'])

    # Dates are only stringified after sorting so ordering uses the date objects.
    for version in timeline:
        version['date'] = version['date'].strftime('%Y-%m-%d')

    return timeline

# Load the definitions corpus and collect, for every definition, its timeline
# of versions. Only definitions with more than one version are kept.
df = pl.read_csv('../data/definitions_corpus/definitions.csv')

results = []

for d in tqdm(df.iter_rows(), total=len(df)):
    # Rows are plain tuples; the fields below are read by column position.
    label = d[3]
    dataset = d[4]
    doc_id = d[5]
    frbr_work = d[6]
    frbr_expression = d[7]

    # Named `record` rather than `dict`: rebinding `dict` would shadow the
    # builtin for every cell executed afterwards in the same kernel.
    record = {
        "metadata": {
            "dataset": dataset,
            "definendum_label": label,
            "frbr_work": frbr_work,
            "frbr_expression": frbr_expression,
            "doc_id": doc_id
        }
    }

    tl = get_definition_timeline(record)

    # A timeline with a single entry means the definition never changed.
    if tl and len(tl) > 1:
        record['tl'] = tl
        results.append(record)

In [None]:
# Display the first collected record (its metadata plus its 'tl' timeline).
results[0]

In [None]:
import pickle

# Persist the collected timelines to disk as a pickle file.
with open('results.pkl', 'wb') as pickle_file:
    pickle.dump(results, pickle_file)

In [None]:
# Print every collected record whose definendum label is '#marketManipulation'.
for res in results:
    if res['metadata']['definendum_label'] != '#marketManipulation':
        continue
    print(res)

In [None]:
# `re` is imported here because this cell uses it before the later cell that
# previously held the only `import re`.
import re

# Skips lazily to the first double quote (tolerating an optional second quote
# and whitespace) and captures everything up to the next double quote.
definendum_pattern = re.compile(r'^.*?"\s?"?\s*([^"]*)\s*')

# Flatten the timelines into one row per (definition, version).
records = []

for el in results:
    metadata = el['metadata']
    for entry in el['tl']:
        definition = entry['definition']

        # The hard-coded terms are checked first so that these definitions
        # never depend on the quote-based pattern matching.
        if definition.startswith('person'):
            definendum = 'person'
        elif definition.startswith('automatic exchange'):
            definendum = 'automatic exchange'
        else:
            match = definendum_pattern.search(definition)
            if match is None:
                # Fail with the offending text instead of an opaque
                # "'NoneType' object has no attribute 'group'".
                raise ValueError(
                    f"Cannot extract a quoted definendum from definition: {definition!r}"
                )
            definendum = match.group(1).strip()

        records.append({
            'Term': definendum,
            'Date': entry['date'],
            'Definition': entry['definition'],
            'Dataset': metadata['dataset'],
            'Label': metadata['definendum_label'],
            # Everything before the first '.' of the document id.
            'CELEX': metadata['doc_id'].split('.')[0],
            'FRBR_Work': metadata['frbr_work'],
            'FRBR_Expression': metadata['frbr_expression']
        })

df = pl.DataFrame(records)

# Sorting returns a new frame, so the result must be re-assigned. The former
# bare `df.sort('Term')` call discarded its result and has been dropped.
df = df.sort(['Term', 'Date'])

#pivot_df = df.pivot(
    #values='Definition',
    #index='Term',
    #columns='Date'
#)


df.to_pandas().to_excel('definitions_with_modifications.xlsx', header=True)

df

In [None]:
# Load a previously exported sheet to compare against the freshly built frame.
# NOTE(review): this path differs from the one the export cell writes to —
# confirm it points at the intended reference file.
olddf = pl.read_excel('./datasets/point-in-time/definitions_with_modifications.xlsx')

# Anti join: rows of `df` with no (Term, Date, Definition) match in `olddf`.
df.join(
    olddf,
    how='anti',
    on=['Term', 'Date', 'Definition'],
)

In [None]:
# Number of rows per term in the old sheet and in the freshly built frame.
old_counts = olddf.group_by('Term').agg(pl.len())
new_counts = df.group_by('Term').agg(pl.len())

# Terms whose row count differs between the two. After the left join a term
# missing from `df` has a null `len_right`; a plain `!=` would evaluate to null
# there and `filter` would silently drop the row, so `ne_missing` is used to
# treat null as a comparable value and keep those terms in the output.
old_counts.join(new_counts, on='Term', how='left').filter(
    pl.col('len').ne_missing(pl.col('len_right'))
)

In [None]:
# Old approach: rebuild the timelines by parsing the exported CSV file


import re

# Maps each definendum to a {date string: definition text} dict, rebuilt by
# scraping the textual dict representations stored in the CSV.
result = {}

# NOTE(review): both patterns stop at the first single quote, so a value that
# itself contains an apostrophe is captured only up to that point.
date_pattern = r"'date':\s*'([^']+)'"
definition_pattern = r"'definition':\s*'([^']+)'"

with open("../notebooks/definition_timeline.csv", "r") as f:
    content = f.read()

# The first line is skipped as the header row.
for line in content.split('\n')[1:]:
    # A trailing newline leaves an empty last element; without this guard the
    # search below returns None and `.group(1)` raises AttributeError.
    if not line.strip():
        continue

    # The definendum is the first span wrapped in doubled double quotes.
    definendum = re.search(r'""\s*([^"]*)\s*""', line).group(1).strip()
    timeline = result.setdefault(definendum, {})

    # Each '","'-separated field is expected to hold one dated version.
    for el in line.split('","'):
        date_match = re.search(date_pattern, el).group(1)
        definition_match = re.search(definition_pattern, el).group(1)
        timeline[date_match] = definition_match

result

---
## Multi-legislation

In [None]:
import polars as pl

# Columns that become lists after the aggregation below and are exploded back
# to one value per row.
cols = [
    'id',
    'definition_text',
    'def_n',
    'document_id',
    'frbr_work',
    'frbr_expression',
]

# Keep only labels that occur in more than one (label, dataset) group, i.e.
# labels defined in at least two datasets, then restore one row per definition.
# `defs` is not defined in this part of the notebook — presumably the
# definitions corpus loaded elsewhere; verify before running.
juris_df = (
    defs
    .group_by(['label', 'dataset'])
    .agg(pl.all())
    .sort('label')
    .filter(pl.col('label').is_duplicated())
    .explode(cols)
)
# Previously chained alternative, left disabled:
#   .group_by('label').agg(pl.all()).sort('label')
# NOTE(review): the export cell that follows indexes `dataset` as a list and
# the other columns as lists of lists, which looks like the shape produced by
# that disabled variant rather than by the exploded frame — TODO confirm.

juris_df

In [None]:
from LegalDefAgent.src.utils import camelcase_to_spaces
from pprint import pprint
import json

# Build one flat entry per (label, dataset) pair for every label that has a
# definition in the 'EurLex' dataset, then print the entries and save them as JSON.
# Assumes `dataset` is a list column and the per-definition columns hold one
# list per dataset — TODO confirm against the cell that builds `juris_df`.
z = []

eurlex_rows = juris_df.filter(pl.col('dataset').list.contains('EurLex')).to_dicts()

for el in eurlex_rows:
    term = camelcase_to_spaces(el['label'])
    for i, dataset_name in enumerate(el['dataset']):
        ids = el['id'][i]
        texts = el['definition_text'][i]
        numbers = el['def_n'][i]
        documents = el['document_id'][i]

        # Only the first value of each per-dataset list is exported; an empty
        # or missing list becomes None.
        z.append({
            "label": el['label'],
            "term": term,
            "dataset": dataset_name,
            "id": ids[0] if ids else None,
            "definition_text": texts[0] if texts else None,
            "def_n": numbers[0] if numbers else None,
            # Document id truncated at its first '.'.
            "document_id": documents[0].split('.')[0] if documents else None,
        })

pprint(z)


with open('./datasets/legislation_definitions.json', 'w') as out_file:
    json.dump(z, out_file, indent=4)