In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Step 1: Split contract text into individual articles

Using the regex method or elliott's method

In [10]:
import os
import pipeline
import plutil

from smart_open import s3
from tqdm import tqdm

In [11]:
# Old: Load from disk
#canadian_pt_path = "/home/research/corpora/contracts/canadian/txt"
#len(os.listdir(canadian_pt_path)) # 44,589 contracts *total*

In [12]:
bucket_name = 'cuecon-textlab'
bucket_prefix = 'home/research/corpora/contracts/canadian/txt/'

In [13]:
#s3_contents = plutil.get_s3_contents(tl_bucket, bucket_prefix)
#len(s3_contents)

In [14]:
#for key, content in s3.iter_bucket(bucket, prefix=prefix, accept_key=lambda key: '/201' in key, workers=1, key_limit=3):



In [30]:
#pl = pipeline.Pipeline("canadian", canadian_pt_path, lang_list=["eng"],
#                       sample_N=100, splitter="elliott")
s3_options = {
    'bucket_name': 'cuecon-textlab',
    'bucket_prefix': 'home/research/corpora/contracts/canadian/txt/'
}
pl = pipeline.Pipeline("canadian", mode="s3", mode_options=s3_options,
                       output_path="./output/", lang_list=["eng"],
                       splitter="elliott", verbose=True)

Full output path: ./output/20220825
Loading filenames from cuecon-textlab/home/research/corpora/contracts/canadian/txt/...


44590it [00:20, 2200.56it/s]


Total files: 44589; files after language filter (['eng']): 35931
35931 filenames loaded (0000102a_eng.txt ... 1498101a_eng.txt)


In [31]:
len(pl.excluded_fnames)

8658

In [32]:
pl.excluded_fnames[0], pl.excluded_fnames[-1]

('1074906c_fra.txt', '1265602c_fra.txt')

In [33]:
# done!
#pl.split_contracts()

In [34]:
#import json
#fpath = "../canadian_output/01_artsplit_elliott_json/0000102a.json"
#with open(fpath, 'r') as f:
#    data = json.load(f)

In [35]:
# Get the paths to the folders where the pkl and json files should be saved
pkl_path = os.path.join(pl.get_output_path(), f"01_artsplit_{pl.splitter}_pkl")
print(pkl_path)
json_path = os.path.join(pl.get_output_path(), f"01_artsplit_{pl.splitter}_json")

./output/20220825/01_artsplit_elliott_pkl


In [36]:
import detect_sections_elliott as dse

In [37]:
def split_contract(contract_id, contract_lang, contract_text):
    arts, headers = dse.detect_sections(contract_text)
    # Convert to the dict format for compatibility with regex splitter
    # It looks like len(headers) is almost always greater than (often
    # like double or triple) len(arts). So for now I'm ignoring headers
    art_list = []
    for i in range(len(arts)):
        cur_art = arts[i]
        cur_art_data = {'header': None, 'text': cur_art, 'section_num': i,
                        'contract_id': contract_id, 'lang': contract_lang}
        art_list.append(cur_art_data)
    return art_list

In [38]:
save_json = False

In [42]:
accept_rule = lambda fname: fname.endswith('_eng.txt')
# Include key_limit=16 for debugging
bucket_iter = s3.iter_bucket(bucket_name, prefix=bucket_prefix,
                             accept_key=accept_rule, workers=16, key_limit=100)
for fpath, content in tqdm(bucket_iter, total=pl.get_num_docs()):
    fname = os.path.basename(fpath)
    # First we get the info from the filename
    fname_data = plutil.parse_fname(fname)
    contract_prefix = fname_data['prefix']
    contract_id = fname_data['id']
    contract_lang = fname_data['lang']
    # Check if already processed
    pkl_fpath = os.path.join(pkl_path, f"{contract_prefix}.pkl")
    if os.path.isfile(pkl_fpath):
        continue
    # Now we process the content
    #print(fname, len(content))
    contract_text = content.decode('utf-8')
    art_list = split_contract(contract_id, contract_lang, contract_text)
    # And save the article list as .pkl (for internal use) and .json
    # (for human reading)
    
    plutil.safe_to_pickle(art_list, pkl_fpath)
    #print(f"Saved to {pkl_fpath}")
    if save_json:
        json_fpath = os.path.join(json_path, f"{contract_prefix}.json")
        plutil.safe_to_json(art_list, json_fpath)

  0%|▎                                                                                                                  | 100/35931 [00:10<1:00:08,  9.93it/s]


## Step 2: Parse the articles using spaCy

In [None]:
# Python imports
import functools
import glob
import json
import logging
import os

# 3rd party imports
import joblib

# Local imports
import pipeline
import main02_spacy_parse

In [None]:
# Set up logging
logger = logging.getLogger()
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

In [None]:
# And set it to work with spacy's use of multiprocessing
import multiprocessing_logging
multiprocessing_logging.install_mp_handler()

Load the actual spaCy NLP object (`nlp_eng`), and extend it to include neuralcoref annotations

In [None]:
import spacy
import neuralcoref

In [None]:
print("Loading spaCy core model")
nlp_eng = spacy.load('en_core_web_md', disable=["ner"])
print("Loading spaCy coref model. May take a while...")
neuralcoref.add_to_pipe(nlp_eng);

Annoying but necessary additional step: adding "contract_id" and "art_num" attributes to spacy's Doc class, so that we can serialize and deserialize without headaches [x__x]

See https://spacy.io/usage/processing-pipelines#custom-components-attributes

In [None]:
# The force=True is just so that we can change (e.g.) the names or default values and overwrite the extensions
# (otherwise this would always cause an Exception)
spacy.tokens.Doc.set_extension("contract_id", default=None, force=True)
spacy.tokens.Doc.set_extension("article_num", default=None, force=True)
spacy.tokens.Doc.set_extension("coref_list", default=[], force=True)

Aaand yet another necessary workaround...

See https://github.com/huggingface/neuralcoref/issues/82#issuecomment-569431503

[update: holding off on this one actually, since I need the coref data... ugh]

In [None]:
cr_test_doc = nlp_eng(u'My sister has a dog. She loves him.')

In [None]:
mentions = [
    {
        "start": mention.start_char,
        "end": mention.end_char,
        "text": mention.text,
        "resolved": cluster.main.text,
    }
    for cluster in cr_test_doc._.coref_clusters
    for mention in cluster.mentions
]
#clusters = list(
#    list(span.text for span in cluster)
#    for cluster in cr_test_doc._.coref_clusters
#)
#resolved = cr_test_doc._.coref_resolved
#response = {}
#response["mentions"] = mentions
#response["clusters"] = clusters
#response["resolved"] = resolved
mentions

K I guess we'll use this representation to avoid the serialization errors :|

In [None]:
# All the neuralcoref attributes for a doc, for future reference:
#cr_test_doc._.has_coref
#cr_test_doc._.coref_resolved
#cr_test_doc._.coref_clusters
#cr_test_doc._.coref_scores

In [None]:
#for cluster in cr_test_doc._.coref_clusters:
#    print(f"===== #{cluster.i}")
#    print(cluster)
#    print(f"main: '{cluster.main}'")
#    print(cluster.mentions)
#    for mention in cluster.mentions:
#        print(mention)
#        print(mention.start)
#        print(mention.end)

In [None]:
def stream_art_data(test_N=None):
    """
    test_N: If set to an int, the function will only yield article data for the first `test_N` contracts.
            Otherwise, if set to None, article data for all contracts is yielded.
    """
    art_data_fpaths = glob.glob("../canadian_output/01_artsplit_elliott_json/*.json")
    # Loop over contracts
    for fnum, fpath in enumerate(art_data_fpaths):
        if test_N is not None and fnum >= test_N:
            # We've already yielded the first `test_N` contracts, so terminate
            break
        with open(fpath, 'r') as f:
            all_articles = json.load(f)
        # Now loop over the articles
        for cur_article in all_articles:
            # We want to yield tuples of (string, {contract_id, article_num})
            art_str = cur_article['text']
            art_data = {'contract_id':cur_article['contract_id'],
                        'article_num':cur_article['section_num']}
            yield (art_str, art_data)

In [None]:
#art_data_fpaths = glob.glob("../canadian_output/01_artsplit_elliott_json/*.json")
#first_fpath = art_data_fpaths[0]
#with open(first_fpath, 'r') as f:
#    data = json.load(f)

In [None]:
def remove_unserializable_results(doc):
    doc.user_data = {}
    for x in dir(doc._):
        if x in ['get', 'set', 'has']: continue
        setattr(doc._, x, None)
    for token in doc:
        for x in dir(token._):
            if x in ['get', 'set', 'has']: continue
            setattr(token._, x, None)
    return doc

In [None]:
def get_coref_data(doc_obj):
    mentions = [
        {
            "start": mention.start_char,
            "end": mention.end_char,
            "text": mention.text,
            "resolved": cluster.main.text,
        }
        for cluster in doc_obj._.coref_clusters
        for mention in cluster.mentions
    ]
    return mentions

In [None]:
def transform_texts(nlp, batch_id, batch_tuples, output_dir):
    # Using spacy's "DocBin" functionality: see https://spacy.io/usage/saving-loading#docs
    batch_bin = spacy.tokens.DocBin(store_user_data=True)
    #print(nlp.pipe_names)
    output_fpath = os.path.join(output_dir, f"{batch_id}.bin")
    if os.path.isfile(output_fpath):  # return None in case same batch is called again
        return None
    print("Processing batch", batch_id)
    for art_doc, art_meta in nlp.pipe(batch_tuples, as_tuples=True):
        # This is the weird part where we now have to change contract_id and art_num
        # from being metadata to being attributes of the spacy Doc objects themselves
        contract_id = art_meta["contract_id"]
        article_num = art_meta["article_num"]
        art_doc._.contract_id = contract_id
        art_doc._.article_num = article_num
        # And now we don't need the meta object anymore, since it's encoded in the Doc itself
        # But next we need to get a serializable representation of the detected corefs
        art_doc._.coref_list = get_coref_data(art_doc)
        # Ok now we can get rid of the original coref attributes that break the data
        art_doc = remove_unserializable_results(art_doc)
        batch_bin.add(art_doc)
    # Now we can use spacy's serialization methods [joblib basically fails at serializing
    # spacy Docs for various reasons]
    # [see https://spacy.io/usage/saving-loading#docs]
    batch_bytes = batch_bin.to_bytes()
    # And save the bytes object to file
    with open(output_fpath, "wb") as f:
        f.write(batch_bytes)
    print("Saved {} texts to {}.bin".format(len(batch_tuples), batch_id))

In [None]:
# Trying to use multiprocessing like in
# https://spacy.io/usage/examples#multi-processing
#output_dir = "./mp_test"
output_dir = "./mp_full"
#art_tuple_stream = stream_art_data(test_N=50)
art_tuple_stream = stream_art_data()

print("Processing texts...")
batch_size = 1000
#batch_size = 200
n_jobs = 16
art_partitions = spacy.util.minibatch(art_tuple_stream, size=batch_size)
executor = joblib.Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
do = joblib.delayed(functools.partial(transform_texts, nlp_eng))
tasks = (do(i, batch_tuples, output_dir) for i, batch_tuples in enumerate(art_partitions))
executor(tasks);

Test that it worked

In [None]:
bin_fpath = "./mp_test/6.bin"
with open(bin_fpath, "rb") as f:
    loaded_bytes = f.read()
loaded_bin = spacy.tokens.DocBin().from_bytes(loaded_bytes)

In [None]:
doc_iter = loaded_bin.get_docs(nlp_eng.vocab)
doc_list = list(doc_iter)

In [None]:
doc_list[5]

In [None]:
processed_arts = []

In [None]:
#for art_nlp, art_meta in nlp_eng.pipe(stream_art_data(test_N=1), as_tuples=True):
#    logger.info(f"Finished processing: {art_meta}")
#    processed_arts.append((art_nlp, art_meta))

In [None]:
len(processed_arts)

In [None]:
type(processed_arts[0][0])

In [None]:
statement_list = main02_parse_articles.parallel_parse(pl, nlp_eng, stream_art_data)