In [69]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
import os

from smart_open import s3
from tqdm import tqdm

import pipeline
import plutil
from plutil import DocMeta, ArtMeta

## Step 0: Set up pipeline

In [128]:
pl = pipeline.Pipeline("canadian", config_fname="canadian_jj.conf", mode="s3",
                       lang_list=["eng"], splitter="elliott",
                       verbose=True)

Looking for config file ./configs/canadian_jj.conf
Full output path: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/
Looking for config file ./configs/canadian_jj.conf


### Step 0b: Load all the filenames, so we can exclude the non-eng ones

In [72]:
pl.init_plaintext_list()

2022-12-29 15:03:24,836 : INFO : Found credentials in shared credentials file: ~/.aws/credentials


Loading filenames from cuecon-textlab/home/research/corpora/contracts/canadian/txt/...


44590it [00:13, 3315.33it/s]

Total files: 44589; files after language filter (['eng']): 35931
35931 filenames loaded (0000102a_eng.txt ... 1498101a_eng.txt)





In [73]:
len(pl.excluded_fnames)

8658

In [74]:
pl.excluded_fnames[0], pl.excluded_fnames[-1]

('0670408c_fra.txt', '0030904c_fra.txt')

## Step 1: Split contract text into individual articles

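Articles can be split using either the regex method or Elliott's method; the method is selected by the `splitter` argument passed to `Pipeline` in Step 0 (here: `"elliott"`).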

In [75]:
# Get the paths to the folders where the pkl and json files should be saved
artsplit_pkl_path = pl.get_artsplit_output_path("pkl")
print(artsplit_pkl_path)

C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/01_artsplit_elliott_pkl


In [14]:
import detect_sections_elliott as dse

In [15]:
def split_contract(contract_text, contract_meta):
    """
    Split one contract's plaintext into its articles.

    :param contract_text: full plaintext of the contract
    :param contract_meta: object exposing `contract_id` and `lang` attributes
    :return: list of (article_text, ArtMeta) tuples, one per detected article,
        numbered from 0 in detection order (the same tuple format is used for
        compatibility with the regex splitter)
    """
    # detect_sections() also returns the headers, but there are almost always
    # more headers than articles (often double or triple), so for now the
    # headers are ignored
    detected_arts, _headers = dse.detect_sections(contract_text)
    return [
        (art_text, ArtMeta(contract_meta.contract_id, contract_meta.lang, art_num))
        for art_num, art_text in enumerate(detected_arts)
    ]

In [16]:
save_json = False

In [19]:
# Only the English plaintext files get split into articles
accept_rule = lambda fname: fname.endswith('_eng.txt')
# Cap on the number of S3 keys to fetch. Keep it small while debugging; set it
# to None to process the entire corpus
key_limit = 20
# The progress-bar total has to respect the cap, otherwise tqdm reports
# something like 20/35931 along with a meaningless ETA
num_docs = pl.get_num_docs()
num_expected = num_docs if key_limit is None else min(key_limit, num_docs)
# The pkl output folder is the same for every contract, so look it up once
pkl_output_path = pl.get_artsplit_output_path("pkl")
bucket_iter = s3.iter_bucket(pl.s3_bucket_name, prefix=pl.s3_bucket_prefix,
                             accept_key=accept_rule, workers=16, key_limit=key_limit)
for fpath, content in tqdm(bucket_iter, total=num_expected):
    fname = os.path.basename(fpath)
    # First we get the info from the filename
    fname_data = plutil.parse_fname(fname)
    contract_prefix = fname_data['prefix']
    contract_meta = DocMeta(fname_data['id'], fname_data['lang'])
    # Now we process the content (raw bytes from S3, decoded as UTF-8)
    contract_text = content.decode('utf-8')
    art_list = split_contract(contract_text, contract_meta)
    # And save the article list as .pkl (for internal use) and, optionally,
    # as .json (for human reading)
    pkl_fpath = os.path.join(pkl_output_path, f"{contract_prefix}.pkl")
    plutil.safe_to_pickle(art_list, pkl_fpath)
    if save_json:
        json_fpath = os.path.join(pl.get_artsplit_output_path("json"), f"{contract_prefix}.json")
        plutil.safe_to_json(art_list, json_fpath)

  0%|          | 20/35931 [00:04<2:00:51,  4.95it/s]


## Step 2: Parse the articles using spaCy, resolve coreference

In [76]:
#!pip install multiprocessing_logging

In [77]:
# Python imports
import functools
import glob
import json
import logging
import os

# 3rd party imports
import joblib

# Local imports
import pipeline
import plutil

In [78]:
# Set up logging
logger = logging.getLogger()
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

In [79]:
# And set it to work with spacy's use of multiprocessing
import multiprocessing_logging
multiprocessing_logging.install_mp_handler()

In [80]:
nlp_eng = pl.get_spacy_model()

Loading spaCy core model


In [81]:
def stream_art_lists(test_N=None):
    """
    Lazily load the article-split contracts stored as .pkl files in
    `artsplit_pkl_path`.

    :param test_N: if not None, stop after yielding this many contracts
        (useful for quick test runs); None streams every .pkl file found
    :return: generator of (contract_articles, contract_meta) tuples, where
        contract_articles is the object loaded from the contract's .pkl file
        and contract_meta is a DocMeta built from the id and lang parsed out
        of the filename
    """
    pkl_pattern = os.path.join(artsplit_pkl_path, "*.pkl")
    num_yielded = 0
    for pkl_fpath in glob.glob(pkl_pattern):
        # Stop as soon as the first `test_N` contracts have been yielded
        if test_N is not None and num_yielded >= test_N:
            break
        # The contract info is encoded in the filename itself
        parsed_fname = plutil.parse_fname(os.path.basename(pkl_fpath))
        meta = DocMeta(parsed_fname['id'], parsed_fname['lang'])
        yield joblib.load(pkl_fpath), meta
        num_yielded += 1


In [82]:
#art_data_fpaths = glob.glob("../canadian_output/01_artsplit_elliott_json/*.json")
#first_fpath = art_data_fpaths[0]
#with open(first_fpath, 'r') as f:
#    data = json.load(f)

In [83]:
def remove_unserializable_results(doc):
    """
    Blank out the custom extension data on a parsed document so it can be
    serialized.

    `doc.user_data` is replaced by an empty dict, and every name listed by
    `dir()` on the `._` object of the document and of each of its tokens is
    set to None, except for the 'get' / 'set' / 'has' accessors.

    :param doc: parsed document; modified in place
    :return: the same `doc` object
    """
    accessor_names = ['get', 'set', 'has']

    def _blank_extensions(underscore):
        # Set every non-accessor name on this `._` object to None
        for ext_name in dir(underscore):
            if ext_name not in accessor_names:
                setattr(underscore, ext_name, None)

    doc.user_data = {}
    _blank_extensions(doc._)
    for tok in doc:
        _blank_extensions(tok._)
    return doc

In [84]:
def get_coref_data(doc_obj):
    """
    Flatten the coreference clusters of a parsed document into a list of
    plain dicts.

    :param doc_obj: parsed document exposing `._.coref_clusters`; each cluster
        must provide `mentions` and `main`
    :return: list with one dict per mention, holding the mention's character
        offsets ("start", "end"), its text ("text") and the text of its
        cluster's main mention ("resolved")
    """
    mentions = []
    for cluster in doc_obj._.coref_clusters:
        for mention in cluster.mentions:
            mention_data = {
                "start": mention.start_char,
                "end": mention.end_char,
                "text": mention.text,
                "resolved": cluster.main.text,
            }
            mentions.append(mention_data)
    return mentions

In [85]:
def transform_texts(nlp, batch_id, batch_tuples, output_dir):
    """
    Parse one batch of articles with spaCy and dump the resulting list of Doc
    objects to `<output_dir>/<batch_id>.pkl`.

    :param nlp: spaCy pipeline; must support `pipe(..., as_tuples=True)`
    :param batch_id: identifier of the batch, used as the output filename stem
    :param batch_tuples: iterable (list or generator) of
        (article_text, article_meta) tuples, where article_meta supports
        ["contract_id"] and ["article_num"] lookups
    :param output_dir: folder the batch .pkl file is written to
    :return: None, both when the batch is processed and when it is skipped
        because its output file already exists
    """
    batch_results = []
    output_fpath = os.path.join(output_dir, f"{batch_id}.pkl")
    if os.path.isfile(output_fpath):  # return None in case same batch is called again
        return None
    print("Processing batch", batch_id)
    for art_doc, art_meta in nlp.pipe(batch_tuples, as_tuples=True):
        # This is the weird part where we now have to change contract_id and art_num
        # from being metadata to being attributes of the spacy Doc objects themselves
        art_doc._.contract_id = art_meta["contract_id"]
        art_doc._.article_num = art_meta["article_num"]
        # And now we don't need the meta object anymore, since it's encoded in the Doc itself
        # But next we need to get a serializable representation of the detected corefs
        art_doc._.coref_list = get_coref_data(art_doc)
        # Ok now we can get rid of the original coref attributes that break the data
        art_doc = remove_unserializable_results(art_doc)
        batch_results.append(art_doc)
    # And save the list of parsed docs to file
    joblib.dump(batch_results, output_fpath)
    # Count the results rather than `batch_tuples`: the input may be a
    # generator, which has no len() and is exhausted by now anyway
    print(f"Saved {len(batch_results)} texts to {output_fpath}")

In [86]:
def spacy_parse_article(nlp, art_str):
    """
    Run the pipeline `nlp` over the text of a single article.

    :param nlp: callable pipeline (e.g. a loaded spaCy model)
    :param art_str: plaintext of the article
    :return: whatever `nlp(art_str)` returns
    """
    # The coref post-processing steps (get_coref_data() followed by
    # remove_unserializable_results()) are currently disabled here, so the
    # parse is returned exactly as the pipeline produced it
    return nlp(art_str)

In [87]:
# Parse every article of every contract with spaCy, writing one .pkl per
# contract; contracts whose output file already exists are skipped
spacy_output_path = pl.get_spacy_output_path()
contract_iter = tqdm(stream_art_lists())
for art_list, contract_meta in contract_iter:
    contract_id = contract_meta.get_contract_id()
    output_fpath = os.path.join(spacy_output_path, contract_meta.gen_fname("pkl"))
    # Check if it's already been processed
    if os.path.isfile(output_fpath):
        pl.vprint(f"Parse already exists: {output_fpath}")
        continue
    contract_iter.set_description(f"{contract_id} ({len(art_list)} articles)")
    # Pair each article's parse with the metadata it arrived with
    contract_art_list = [
        (spacy_parse_article(nlp_eng, art_text), art_meta)
        for art_text, art_meta in art_list
    ]
    # Save the contract docs
    plutil.safe_to_pickle(contract_art_list, output_fpath)

20it [00:00, 3396.06it/s]

Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000102a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000103a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000104a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000105a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000106a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000201a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl\0000202a_eng.pkl
Parse already exists: C:/Dropbox/Labor_Contracts_Canada/./analysis/ou




## Step 3: Extract the spaCy dependency parse data (to csv format)

In [88]:
from collections import Counter
import glob

import joblib
import pandas as pd

import process_spacy as pspacy
import plutil
from plutil import DocMeta, ArtMeta

In [89]:
nlp_eng = pl.get_spacy_model()

### Step 3.1: First extract the statement data (sdata)

In [90]:
def get_num_contract_parses(verbose=False):
    """
    Count the .pkl files found in `spacy_output_path`.

    :param verbose: if True, also print the count and the folder searched
    :return: number of matching files
    """
    num_fpaths = len(glob.glob(os.path.join(spacy_output_path, "*.pkl")))
    if verbose:
        print(f"Found {num_fpaths} files in spacy output path: {spacy_output_path}")
    return num_fpaths

def stream_contract_parses():
    """
    Lazily load the per-contract spaCy parses saved in `spacy_output_path`.

    :return: generator of (art_parse_list, contract_meta) tuples, where
        art_parse_list is the object loaded from the contract's .pkl file (a
        list with one element per article of the contract) and contract_meta
        is a DocMeta built from the id and lang parsed out of the filename
    """
    pkl_pattern = os.path.join(spacy_output_path, "*.pkl")
    for pkl_fpath in glob.glob(pkl_pattern):
        parsed_fname = plutil.parse_fname(os.path.basename(pkl_fpath))
        meta = DocMeta(parsed_fname['id'], parsed_fname['lang'])
        yield joblib.load(pkl_fpath), meta

def save_pdata_df(pdata_df, chunk_num=None):
    """
    Pickle a pdata DataFrame to `<pdata path>/<corpus name>_pdata[_<chunk_num>].pkl`.

    :param pdata_df: the object to save
    :param chunk_num: optional chunk number appended to the filename as
        `_<chunk_num>`; None (the default) saves without a suffix
    :return: None
    """
    # Compare against None rather than relying on truthiness: chunk 0 is a
    # valid chunk number and must get its own "_0" suffix instead of silently
    # overwriting the unsuffixed file
    suffix = "" if chunk_num is None else f"_{chunk_num}"
    # NOTE(review): the rest of the notebook uses pl.get_pdata_output_path();
    # confirm that get_pdata_path() is the intended folder here
    pdata_fpath = os.path.join(pl.get_pdata_path(), f"{pl.get_corpus_name()}_pdata{suffix}.pkl")
    plutil.safe_to_pickle(pdata_df, pdata_fpath)

In [91]:
# Extract the statements from every parsed contract and save them as one
# .pkl file per contract in the sdata output folder
num_parses = get_num_contract_parses(verbose=True)
for cur_contract_parses, cur_contract_meta in tqdm(stream_contract_parses(), total=num_parses):
    cur_contract_id = cur_contract_meta.contract_id
    # A list of statements across the entire contract
    contract_statements = []
    # Each element pairs one article's parse with that article's metadata
    for cur_art_parse, cur_art_meta in cur_contract_parses:
        contract_statements.extend(
            pspacy.get_statements(cur_art_parse, cur_contract_id, cur_art_meta.art_num)
        )
    # Now serialize
    output_fname = cur_contract_meta.gen_fname("pkl")
    output_fpath = os.path.join(pl.get_sdata_output_path(), output_fname)
    plutil.safe_to_pickle(contract_statements, output_fpath)

Found 20 files in spacy output path: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/02_spacy_pkl


100%|██████████| 20/20 [00:55<00:00,  2.78s/it]


In [92]:
# Check an example sdata file
sdata_fpath = glob.glob(os.path.join(pl.get_sdata_output_path(), "*"))[0]
sdata = plutil.safe_load_pickle(sdata_fpath)

In [93]:
type(sdata)

list

In [94]:
len(sdata)

978

In [95]:
type(sdata[0])

dict

In [96]:
len(sdata[0])

21

In [97]:
sdata[0]

{'orig_subject': 'It',
 'orig_slem': 'it',
 'in_coref': False,
 'subject': 'It',
 'slem': 'it',
 'coref_replaced': False,
 'modal': None,
 'neg': '',
 'verb': 'be',
 'passive': 0,
 'md': 0,
 'subject_branch': ['it',
  'the',
  'rate',
  'of',
  'pay',
  'and',
  'all',
  'other',
  'item',
  'that',
  'both',
  'party',
  'have',
  'agree',
  'to',
  'through',
  'the',
  'process',
  'of',
  'collective',
  'bargaining'],
 'subject_tags': ['PRP',
  'DT',
  'NNS',
  'IN',
  'NN',
  'CC',
  'DT',
  'JJ',
  'NNS',
  'WDT',
  'DT',
  'NNS',
  'VBP',
  'VBN',
  'TO',
  'IN',
  'DT',
  'NN',
  'IN',
  'JJ',
  'NN'],
 'object_branches': [['the', 'general', 'purpose', 'of', 'this', 'agreement'],
  ['to',
   'set',
   'forth',
   'the',
   'working',
   'condition',
   'living',
   'condition',
   'within',
   'the',
   'power',
   'and/or',
   'ability',
   'of',
   'the',
   'employers',
   'to',
   'control',
   'the',
   'hour',
   'of',
   'work']],
 'object_tags': [['DT', 'JJ', 'NN', 'IN

### Step 3.2: Extract the parse data (pdata) for each statement (from its sdata)

In [98]:
from collections import Counter

import joblib
import pandas as pd

import plutil

In [99]:
pl.vprint("Starting extract_pdata()")
pdata_path = pl.get_pdata_output_path()
subject_counts = Counter()
subnoun_counts = Counter()
modal_counts = Counter()

Starting extract_pdata()


In [100]:
def stream_sdata():
    """
    Lazily load the per-contract statement data (sdata) .pkl files from the
    folder returned by `pl.get_sdata_output_path()`.

    :return: generator of (contract_sdata, contract_meta) tuples, where
        contract_sdata is the object loaded from the file (a list with one
        element per statement in the contract) and contract_meta is a DocMeta
        built from the id and lang parsed out of the filename
    """
    pkl_pattern = os.path.join(pl.get_sdata_output_path(), "*.pkl")
    for pkl_fpath in glob.glob(pkl_pattern):
        parsed_fname = plutil.parse_fname(os.path.basename(pkl_fpath))
        meta = DocMeta(parsed_fname['id'], parsed_fname['lang'])
        yield joblib.load(pkl_fpath), meta

In [101]:
# Loop over the contracts' statement data, accumulating the corpus-wide counts
# and saving one DataFrame per contract (as .pkl) where each row is a statement.
# NOTE: subject_counts / subnoun_counts / modal_counts are created in an
# earlier cell, so re-running only this cell adds to the existing counts.
# The number to process is the number of sdata files actually on disk (not the
# size of the whole corpus), so that the progress bar total is accurate
num_to_process = len(glob.glob(os.path.join(pl.get_sdata_output_path(), "*.pkl")))
# Fields copied verbatim from each statement's sdata into its pdata row, in
# the column order of the resulting DataFrame (contract_id comes first)
statement_keys = [
    'article_num', 'sentence_num', 'statement_num', 'full_sentence',
    'full_statement', 'subject', 'passive', 'subject_tags', 'subject_branch',
    'object_tags', 'verb', 'modal', 'md', 'neg', 'object_branches'
]
sdata_iter = tqdm(stream_sdata(), total=num_to_process)
for doc_sdata, doc_meta in sdata_iter:
    contract_id = doc_meta.contract_id
    contract_lang = doc_meta.lang
    sdata_iter.set_description(contract_id)
    doc_pdata = []
    # Loop over each statement, getting the subject/subject_branch/subject_tag
    for statement_data in doc_sdata:
        subject = statement_data['subject']
        statement_dict = {'contract_id': contract_id}
        statement_dict.update((key, statement_data[key]) for key in statement_keys)
        doc_pdata.append(statement_dict)
        subject_counts[subject] += 1
        if statement_data['md'] == 1:
            modal_counts[subject] += 1
        # Count the nouns (tags starting with 'N') in the subject branch,
        # other than the subject itself
        for branch_word, branch_tag in zip(statement_data['subject_branch'],
                                           statement_data['subject_tags']):
            if branch_tag.startswith('N') and branch_word != subject:
                subnoun_counts[branch_word] += 1
    # Save the contract pdata
    pdata_df = pd.DataFrame(doc_pdata)
    pdata_fpath = os.path.join(pdata_path, f"{contract_id}_{contract_lang}.pkl")
    plutil.safe_to_pickle(pdata_df, pdata_fpath)
# NOTE(review): subnoun_counts is accumulated above but, unlike the other two
# counters, is never written to disk -- confirm whether that is intended
sub_counts_fpath = os.path.join(pl.get_output_path(), f"{pl.get_corpus_name()}_subject_counts.pkl")
pd.to_pickle(subject_counts, sub_counts_fpath)
modal_counts_fpath = os.path.join(pl.get_output_path(), f"{pl.get_corpus_name()}_modal_counts.pkl")
pd.to_pickle(modal_counts, modal_counts_fpath)

print("Most common subjects:")
print(subject_counts.most_common(50))

0003405a:   0%|          | 20/35931 [00:02<1:07:48,  8.83it/s]

Most common subjects:
[('employee', 1289), ('who', 969), ('he', 961), ('Company', 752), ('i', 636), ('it', 394), ('It', 380), ('employees', 377), ('Employer', 273), ('Employees', 209), ('they', 183), ('time', 174), ('which', 173), ('that', 132), ('there', 128), ('l', 109), ('hours', 105), ('seniority', 102), ('work', 97), ('Union', 91), ('grievance', 88), ('rate', 83), ('party', 68), ('Plan', 68), ('tools', 67), ('days', 66), ('shift', 65), ('period', 64), ('amount', 63), ('matter', 61), ('decision', 55), ('schedule', 54), ('provisions', 52), ('I', 50), ('leave', 49), ('operations', 49), ('Seniority', 49), ('notice', 48), ('arrangements', 47), ('day', 47), ('week', 46), ('t', 46), ('There', 46), ('s', 44), ('holidays', 44), ('change', 42), ('payment', 42), ('list', 41), ('Time', 38), ('rates', 38)]





## Step 4: Compute authority measures

In [102]:
import glob

import inflect
import unidecode

import plutil

In [110]:
# Set to "contract" for contract-by-contract parsing. Otherwise, set to "batch"
# to process the statements in batches (runs faster)
batch_mode = "contract"
#batch_mode = "batch"

In [111]:
# Two-way lookup between subject categories and their integer codes: the
# label -> code entries come first, followed by the reversed code -> label ones
subject_dict = {
    label: code
    for code, label in enumerate(['other', 'worker', 'union', 'firm', 'manager'])
}
subject_dict.update({code: label for label, code in subject_dict.items()})

snpdata_header = [
    "contract_id",
    "section_num",
    "sentence_num",
    "statement_num",
    "md",
    "strict_modal",
    "neg",
    "passive",
    "verb",
    "object_branches",
    "full_sentence",
]

In [112]:
def stream_pdata():
    """
    Lazily load the per-contract parse data (pdata) .pkl files from the folder
    returned by `pl.get_pdata_output_path()`.

    :return: generator of (pdata_df, contract_meta) tuples, where pdata_df is
        the object loaded from the file and contract_meta is a DocMeta built
        from the id and lang parsed out of the filename
    """
    pkl_pattern = os.path.join(pl.get_pdata_output_path(), "*.pkl")
    for pkl_fpath in glob.glob(pkl_pattern):
        # The contract info comes from the filename itself
        parsed_fname = plutil.parse_fname(os.path.basename(pkl_fpath))
        meta = DocMeta(parsed_fname['id'], parsed_fname['lang'])
        yield plutil.safe_load_pickle(pkl_fpath), meta

In [113]:
def check_neg(statement_row):
    """Return whether the statement's 'neg' field is exactly 'not'."""
    neg_marker = statement_row['neg']
    return neg_marker == 'not'

def check_strict_modal(statement_row):
    """
    Return True if the statement has a modal ('md' is truthy) and that modal
    is one of shall / must / will; False otherwise.
    """
    if not statement_row['md']:
        # No modal at all, so it cannot be a strict one
        return False
    return statement_row['modal'] in ['shall','must','will']

In [114]:
def compute_statement_auth(contract_meta, contract_pdata_df, use_tqdm=False):
    """
    Compute the statement-level authority measures for one contract and save
    them as a pickled DataFrame in the sauth output folder.

    :param contract_meta: object exposing `contract_id` and `lang`; both are
        used to build the output filename
    :param contract_pdata_df: DataFrame with one row per statement, containing
        at least the columns listed in `vars_to_keep` below
    :param use_tqdm: if True, show a 12-step progress bar while computing
    :return: None; the result is written to
        `<pl.get_sauth_output_path()>/<contract_id>_<lang>.pkl`
    """
    vars_to_keep = [
        "contract_id","article_num","sentence_num","statement_num",
        "subject","modal","md","neg","verb","passive","full_sentence"
    ]
    # Work on a copy restricted to the needed columns. It gets referenced A LOT
    # below, hence the short name
    df = contract_pdata_df[vars_to_keep].copy()
    # We can save memory by converting some of the ints to bools
    df["md"] = df["md"].astype('bool')
    df["passive"] = df["passive"].astype('bool')
    df["subject"] = df["subject"].str.lower()
    df["subnorm"] = df["subject"].apply(pl.normalize_subject)
    # Strict modal: the statement has a modal and it is shall/must/will.
    # Computed column-wise, which gives the same result as a row-by-row
    # apply but avoids building a Series object for every row
    df["strict_modal"] = (df["md"] & df["modal"].isin(['shall','must','will'])).astype('bool')
    df["neg"] = (df['neg'] == 'not').astype('bool')
    with tqdm(total=12, disable=not use_tqdm) as pbar:
        df['count'] = 1
        ##### (1/12) Permissive modals: any modal that is not a strict modal
        df['permissive_modal'] = (df['md'] & ~df['strict_modal']).astype('bool')
        pbar.update(1)
        ##### (2/12) Obligation verbs (passive only)
        df_passive = df['passive']
        df['obligation_verb'] = (df_passive &
                                 df['verb'].isin(['require', 'expect', 'compel', 'oblige', 'obligate'])).astype('bool')
        pbar.update(1)
        ##### (3/12) Constraint verbs (passive only)
        df['constraint_verb'] = (df_passive &
                                 df['verb'].isin(['prohibit', 'forbid', 'ban', 'bar', 'restrict', 'proscribe'])).astype('bool')
        pbar.update(1)
        ##### (4/12) Permission verbs are: be allowed, be permitted, and be authorized
        df['permission_verb'] =  (df_passive &
                                  df['verb'].isin(['allow', 'permit', 'authorize'])).astype('bool')
        pbar.update(1)
        ##### (5/12) Entitlement verbs (active only)
        df_notpassive = ~df_passive
        df['entitlement_verb'] =  (df_notpassive &
                                   df['verb'].isin(['have', 'receive','retain'])).astype('bool')
        pbar.update(1)
        ##### (6/12) Promise verbs (active only)
        df['promise_verb'] = (df_notpassive &
                              df['verb'].isin(['agree','promise','commit','recognize',
                                               'consent','assent','affirm','assure',
                                               'guarantee','insure','ensure','stipulate',
                                               'undertake','pledge'])).astype('bool')
        pbar.update(1)
        ##### (7/12) Special verbs: any of the five verb classes above
        df['special_verb'] = (df['obligation_verb'] | df['constraint_verb'] | df['permission_verb'] | df['entitlement_verb'] | df['promise_verb']).astype('bool')
        pbar.update(1)
        ##### (8/12) Active verbs: not passive and not a special verb
        df['active_verb'] = (df_notpassive & ~df['special_verb']).astype('bool')
        pbar.update(1)
        ##### (9/12) Obligation measure
        df_neg = df['neg']
        df_notneg = ~df_neg
        df['obligation'] = ((df_notneg & df['strict_modal'] & df['active_verb']) |     #positive, strict modal, action verb
                            (df_notneg & df['strict_modal'] & df['obligation_verb']) | #positive, strict modal, obligation verb
                            (df_notneg & ~df['md'] & df['obligation_verb'])).astype('bool') #positive, non-modal, obligation verb
        pbar.update(1)
        ##### (10/12) Constraint measure
        df['constraint'] = ((df_neg & df['md'] & ~df['obligation_verb']) | # negative, any modal, any verb except obligation verb
                            (df_notneg & df['strict_modal'] & df['constraint_verb'])).astype('bool') # positive, strict modal, constraint verb
        pbar.update(1)
        ##### (11/12) Permission measure
        df['permission'] = ((df_notneg & ( (df['permissive_modal'] & df['active_verb']) | # positive, permissive modal, action verb
                                           df['permission_verb'])) |                      # positive, permission verb
                            (df_neg & df['constraint_verb'])).astype('bool')              # negative, constraint verb
        pbar.update(1)
        ##### (12/12) Entitlement measure
        df['entitlement'] = ((df_notneg & df['entitlement_verb']) |                # positive, entitlement verb
                             (df_notneg & df['strict_modal'] & df['passive']) |    # positive, strict modal, passive
                             (df_neg & df['obligation_verb'])).astype('bool')      # negative, obligation verb
        pbar.update(1)
    # Save the measures for this contract
    sauth_path = pl.get_sauth_output_path()
    sauth_fpath = os.path.join(sauth_path, f"{contract_meta.contract_id}_{contract_meta.lang}.pkl")
    plutil.safe_to_pickle(df, sauth_fpath)

In [115]:
pdata_fpaths = glob.glob(os.path.join(pl.get_pdata_output_path(), "*.pkl"))
num_to_process = len(pdata_fpaths)

In [116]:
# Compute and save the authority measures contract by contract
pdata_iter = tqdm(stream_pdata(), total=num_to_process)
for cur_pdata_df, contract_meta in pdata_iter:
    contract_id = contract_meta.contract_id
    pdata_iter.set_description(contract_id)
    # Contracts without any extracted statements have nothing to process
    if len(cur_pdata_df) > 0:
        compute_statement_auth(contract_meta, cur_pdata_df)

0003405a: 100%|██████████| 20/20 [00:01<00:00, 15.17it/s]


### Step 4.2: Combine the individual batches to get one file per contract

(Not needed if you're already doing contract-by-contract parsing!)

In [117]:
if batch_mode == "batch":
    auth_path = pl.get_statement_auth_path()
    fpath_data = plutil.sort_by_suffix(auth_path)
    new_fname = plutil.remove_suffix(auth_path)
    # Load every chunk first and concatenate once: calling pd.concat inside
    # the loop re-copies the accumulated frame on each iteration
    chunk_dfs = [pd.read_pickle(fpath) for fnum, fpath in fpath_data]
    # pd.concat raises on an empty list, so fall back to an empty frame
    auth_df = pd.concat(chunk_dfs) if chunk_dfs else pd.DataFrame()
    # Once combined, save a version without a numeric suffix
    plutil.safe_to_pickle(auth_df, os.path.join(pl.get_output_path(), new_fname))

## Step 5: Sum authority measures to contract level

In [118]:
def stream_sauth():
    """
    Lazily load the per-contract statement-level authority (sauth) .pkl files
    from the folder returned by `pl.get_sauth_output_path()`.

    :return: generator of (sauth_df, contract_meta) tuples, where sauth_df is
        the object loaded from the file and contract_meta is a DocMeta built
        from the id and lang parsed out of the filename
    """
    pkl_pattern = os.path.join(pl.get_sauth_output_path(), "*.pkl")
    for pkl_fpath in glob.glob(pkl_pattern):
        # The contract info comes from the filename itself
        parsed_fname = plutil.parse_fname(os.path.basename(pkl_fpath))
        meta = DocMeta(parsed_fname['id'], parsed_fname['lang'])
        yield plutil.safe_load_pickle(pkl_fpath), meta

In [119]:
def sum_auth_df(auth_df):
    """
    Helper function which sums up a *single* (chunk) dataframe containing
    auth measures, so that they can be combined into one final summed df by sum_auth()

    :param auth_df: statement-level DataFrame with "contract_id" and "subnorm"
        columns plus one column per measure in `pl.AUTH_MEASURES`
    :return: DataFrame with one row per contract_id, containing the summed
        numeric columns plus one `<measure>_<subnorm>` column for every
        measure and every subnorm other than "other"
    """
    unique_ids = ["contract_id","subnorm"]
    # numeric_only=True keeps the result independent of the pandas version:
    # text columns are left out instead of being concatenated or raising
    grp_df = auth_df.groupby(unique_ids).sum(numeric_only=True).reset_index()
    def conditional_measure(x,subnorm,measure):
        # This function returns the actual measure if the subnorm of the row
        # is equal to the subnorm argument, and 0.0 otherwise. This makes it
        # so that when we "sum" up to just contract_id we get just the measure
        # for the subnorm of interest
        return x[measure] if x["subnorm"]==subnorm else 0.0
    # Build a filtered copy: calling .remove() on pl.subnorm_list itself would
    # permanently drop "other" from the list shared with the rest of the pipeline
    subnorms_to_process = [cur_subnorm for cur_subnorm in pl.subnorm_list
                           if cur_subnorm != "other"]
    for cur_measure in pl.AUTH_MEASURES:
        pl.iprint("Making cols for " + cur_measure)
        for cur_subnorm in subnorms_to_process:
            print("Making cols for " + cur_measure + " x " + cur_subnorm)
            new_col_name = cur_measure + "_" + cur_subnorm
            grp_df[new_col_name] = grp_df.apply(conditional_measure,
                                                axis="columns",args=(cur_subnorm,cur_measure))
    # And now just sum! For the new columns it's not actually summing anything,
    # just getting rid of all the 0.0 cells, for example permission_worker in a
    # row with subnorm firm
    sum_df = grp_df.groupby(["contract_id"]).sum(numeric_only=True).reset_index()
    # Return it so these summed chunk dfs can be combined into a final summed df
    # by sum_auth()
    return sum_df

In [120]:
sauth_dfs = []
summed_dfs = []
for cur_sauth_df, contract_meta in stream_sauth():
    sauth_dfs.append(cur_sauth_df)
    # Now we do a final sum to obtain the (non-chunked) contract-level dataset.
    # numeric_only=True keeps the result independent of the pandas version:
    # text columns (e.g. full_sentence) are left out instead of being
    # concatenated or raising
    cauth_df = cur_sauth_df.groupby(["contract_id"]).sum(numeric_only=True).reset_index()
    summed_dfs.append(cauth_df)
# First, we combine the individual summed dfs
full_df = pd.concat(summed_dfs)
# And output the combined df
pkl_fpath = pl.get_cauth_output_fpath("pkl")
plutil.safe_to_pickle(full_df, pkl_fpath)
csv_fpath = pl.get_cauth_output_fpath("csv")
full_df.to_csv(csv_fpath, index=False)

# Now we combine the *statement-level* dfs
allsauth_df = pd.concat(sauth_dfs)
# And output this combined df
allsauth_pkl_fpath = pl.get_allsauth_output_fpath("pkl")
plutil.safe_to_pickle(allsauth_df, allsauth_pkl_fpath)
allsauth_csv_fpath = pl.get_allsauth_output_fpath("csv")
# Write real CSV to the .csv path (it previously received a second pickle)
allsauth_df.to_csv(allsauth_csv_fpath, index=False)

In [121]:
full_df

Unnamed: 0,contract_id,article_num,sentence_num,statement_num,md,neg,passive,strict_modal,count,permissive_modal,...,constraint_verb,permission_verb,entitlement_verb,promise_verb,special_verb,active_verb,obligation,constraint,permission,entitlement
0,0000102a,12074,84760,696,429,57,343,380,978,49,...,1,6,53,18,97,564,170,28,29,219
0,0000103a,9017,54548,611,344,44,266,299,810,45,...,1,4,31,18,73,495,154,22,25,158
0,0000104a,4122,36906,810,398,53,294,343,780,55,...,1,5,42,22,93,422,171,27,30,188
0,0000105a,9289,24819,798,460,59,332,396,869,64,...,1,5,50,20,99,467,191,33,34,218
0,0000106a,8353,22334,759,445,62,321,384,849,61,...,1,5,52,20,102,456,184,34,31,216
0,0000201a,3837,4659,470,222,26,149,193,397,29,...,0,2,27,4,41,217,93,15,14,102
0,0000202a,3915,5104,367,229,30,154,201,417,28,...,0,2,28,4,42,231,95,18,14,111
0,0000203a,3550,5214,340,219,29,153,190,402,29,...,0,2,28,3,40,218,83,18,15,110
0,0000204a,3352,7065,441,240,28,167,207,443,33,...,0,1,29,5,42,242,93,16,17,118
0,0000205a,4029,5701,337,250,30,168,217,434,33,...,0,1,25,5,38,236,101,17,16,117


## Step 6: Compute corpus-wide summary stats

In [122]:
import os
import re

import plutil

In [123]:
# Conditionals to count; edit this list to change which ones are included.
# Underscores stand for the spaces of multi-word conditionals
cond_list = [
    "if",
    "in_case",
    "where",
    "were",
    "had",
    "could",
    "unless",
    "should",
    "as_long_as",
    "so_long_as",
    "provided_that",
    "otherwise",
    "supposing",
]

# LIWC categories to count, each mapped to its word-list file inside the LIWC
# folder; edit this mapping to change which categories are included
_liwc_fnames = {
    "number": "024-number.txt",
    "quant": "025-quant.txt",
    "certainty": "055-certain.txt",
    "tentative": "054-tentat.txt",
}
liwc_fpath_dict = {
    liwc_cat: os.path.join(pl.get_liwc_path(), liwc_fname)
    for liwc_cat, liwc_fname in _liwc_fnames.items()
}

In [124]:
# There are actually two different possible word counts: the first is just
# the number of tokens in the word-tokenized plaintext. This, to me, is not
# what we want, since (for example) if there are a bunch of spaces between
# each letter this method will produce some huge number of tokens, even
# though each token is actually just one letter. So I think what we want to
# do instead is count the number of words in the full_sentence values for
# each contract.
auth_df = plutil.safe_load_pickle(pl.get_allsauth_output_fpath("pkl"))
# First, while it's at the statement level, compute counts
vars_to_keep = ["contract_id","article_num","sentence_num","statement_num",
                "full_sentence","subnorm"]
wc_df = auth_df[vars_to_keep].copy()
wc_df["statement_count"] = 1
# Subnorm-specific counts. Build a filtered copy of the list instead of
# calling .remove() on it, so that a list owned by `pl` is never mutated
main_subnorms = [cur_subnorm for cur_subnorm in pl.get_subnorm_list()
                 if cur_subnorm != "other"]
for cur_subnorm in main_subnorms:
    count_var = cur_subnorm + "_count"
    # 1 if the statement's subject belongs to this subnorm, 0 otherwise
    wc_df[count_var] = (wc_df["subnorm"] == cur_subnorm).astype(int)
# Aggregate to sentence level (full_sentence is redundant for different
# statements within the same sentence)
wc_groups = wc_df.groupby(['contract_id','article_num','sentence_num'])
pl.vprint("Summing to sentence level")
# NOTE: the count columns are listed explicitly here, so this assumes that
# main_subnorms contains firm, manager, union and worker
agg_dict = {'full_sentence':'first','statement_count':'sum','firm_count':'sum',
            'manager_count':'sum','union_count':'sum','worker_count':'sum'}
sent_df = wc_groups.agg(agg_dict)
# slow :(
#main_subnorms = pl.subnorm_list
#main_subnorms.remove("other")
#main_subnorms = ["firm"]
#for cur_subnorm in main_subnorms:
#    cur_count_var = cur_subnorm + "_count"
#    pl.iprint("Computing " + str(cur_count_var))
#    sent_df[cur_count_var] = auth_groups["subnorm"].agg(lambda x:(x==cur_subnorm).sum())

# Count the words. This regex method is faster than any other, according to
# some stackoverflow post I'm too lazy to find again
wordcount = re.compile(r'\w+')
def num_tokens(sent_str):
    """Return the number of word tokens in sent_str.

    A token is a maximal run of regex word characters (``\\w+``), so
    punctuation and whitespace only act as separators.

    Non-string input (NaN, None, ...) counts as zero words, matching how
    num_matches() treats missing text, instead of raising a TypeError.
    """
    if not isinstance(sent_str, str):
        return 0
    return len(wordcount.findall(sent_str))
# Word count per sentence: run num_tokens over every full_sentence value
pl.vprint("Computing num_words")
full_sentences = sent_df["full_sentence"]
sent_df["num_words"] = full_sentences.map(num_tokens)
pl.vprint("Finished computing num_words")

def count_conditionals(sent_str):
    """Return the total number of conditional-phrase occurrences in sent_str.

    Each entry of cond_list is turned into a phrase by replacing "_" with a
    space, and its occurrences are counted with str.count(), i.e.
    case-sensitive, non-overlapping substring matches. The per-phrase counts
    are summed into a single number.

    Non-string input (NaN, None, ...) counts as zero, matching how
    num_matches() treats missing text.

    NOTE(review): because this is plain substring matching, a short phrase
    would also be counted when it appears inside a longer word -- confirm
    that is acceptable for the entries in cond_list.
    """
    if not isinstance(sent_str, str):
        return 0
    # Use this line instead if you want a *vector* of counts, i.e., how many
    # times each separate conditional appears in the string:
    #count_vec = [sent_str.count(cur_cond.replace("_"," ")) for cur_cond in cond_list]
    # Otherwise, we just sum all these individual counts up to get one final
    # conditional count
    return sum(sent_str.count(cur_cond.replace("_", " ")) for cur_cond in cond_list)
# Conditional count per sentence: run count_conditionals over every
# full_sentence value
conditional_counts = sent_df["full_sentence"].map(count_conditionals)
sent_df["conditional_count"] = conditional_counts
pl.vprint("Finished counting conditionals")

# LIWC counts
def num_matches(reg_str, test_str):
    """Return how many non-overlapping matches of reg_str occur in test_str.

    reg_str is handed straight to re.findall(), so it can be a pattern string
    or a compiled pattern.

    Missing text (NaN, None, or anything else that is not a str) has zero
    matches by definition. Checking the type, rather than comparing
    str(test_str) to "nan", means a sentence whose text really is "nan" is
    still searched, and None no longer raises a TypeError.
    """
    if not isinstance(test_str, str):
        return 0
    return len(re.findall(reg_str, test_str))
# Loop over each LIWC category listed in liwc_fpath_dict
for cur_liwc_cat, cur_liwc_fpath in liwc_fpath_dict.items():
    pl.vprint("Counting LIWC category: " + str(cur_liwc_cat))
    # Load the current category's word list and turn it into one regex
    cur_liwc_list = plutil.stopwords_from_file(cur_liwc_fpath)
    cur_regex = plutil.list_to_regex(cur_liwc_list)
    col_name = "liwc_" + str(cur_liwc_cat) + "_count"
    # Only full_sentence is needed, so apply over that single column rather
    # than row-by-row over the whole frame (axis=1). Same values, much less
    # overhead, and well-defined when sent_df is empty.
    sent_df[col_name] = sent_df["full_sentence"].apply(
        lambda sent_str: num_matches(cur_regex, sent_str))

#print(sent_df.head())
#print(sent_df.columns)
pl.vprint("Finished LIWC counts")

# And aggregate to contract level (via article level).
# numeric_only=True is spelled out on purpose: sent_df still carries the
# full_sentence text column, and the default for numeric_only in groupby sums
# differs across pandas versions. With numeric_only=False the sentence strings
# would be concatenated into the summary table instead of being dropped.
article_df = sent_df.groupby(["contract_id", "article_num"]).sum(numeric_only=True)
contract_df = (
    article_df
    .groupby(["contract_id"])
    .sum(numeric_only=True)
    .reset_index()  # contract_id back to a regular column for the exports
)

# Write the same contract-level table in three formats: csv, pkl and dta
output_csv_fpath = pl.get_sumstats_fpath(ext="csv")
pl.vprint("Saving sumstats to " + output_csv_fpath)
output_pkl_fpath = pl.get_sumstats_fpath(ext="pkl")
plutil.safe_to_csv(contract_df, output_csv_fpath, index=False)
plutil.safe_to_pickle(contract_df, output_pkl_fpath)
output_dta_fpath = pl.get_sumstats_fpath(ext="dta")
plutil.safe_to_stata(contract_df, output_dta_fpath)

Summing to sentence level
Computing num_words
Finished computing num_words
Finished counting conditionals
Counting LIWC category: number
Counting LIWC category: quant
Counting LIWC category: certainty
Counting LIWC category: tentative
Finished LIWC counts
Saving sumstats to C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/06_sumstats.csv


## Step 7: Merge the auth data with all other contract metadata

In [129]:
# Inspect the subnorm list left over from the word-count cell above, where
# "other" is removed before the per-subnorm counts are computed
main_subnorms

['firm', 'union', 'manager', 'worker']

In [136]:
pl.vprint("merge_text_meta()")
# Merges the contract-level auth sums with the contract metadata in
# <corpus_name>_meta.csv
authsums_pkl_fpath = pl.get_cauth_output_fpath(ext="pkl")
pl.vprint("Loading authsums file " + authsums_pkl_fpath)
authsums_df = plutil.safe_load_pickle(authsums_pkl_fpath)

meta_fpath = pl.get_meta_fpath()
pl.vprint("Loading meta file " + str(meta_fpath))
meta_df = plutil.safe_load_csv(meta_fpath)

# Merge on contract_id. This is an inner merge, so contracts missing from
# either side are dropped; report the row counts so that loss is visible.
merged_df = authsums_df.merge(meta_df, how='inner', on=['contract_id'])
pl.vprint("Merged rows: " + str(len(merged_df))
          + " (authsums rows: " + str(len(authsums_df))
          + ", meta rows: " + str(len(meta_df)) + ")")
# contract_id is meant to be a unique key; duplicates on either side would
# multiply rows in the merge, so flag them instead of letting them through
# silently.
if not merged_df["contract_id"].is_unique:
    pl.vprint("WARNING: contract_id is not unique in the merged data")

output_csv_fpath = pl.get_merged_meta_fpath(ext="csv")
output_pkl_fpath = pl.get_merged_meta_fpath(ext="pkl")
pl.vprint("Saving " + output_csv_fpath)
pl.vprint("Saving " + output_pkl_fpath)
# index=False (as for the sumstats csv): after the merge the index is just a
# row counter, and writing it adds a meaningless unnamed first column.
plutil.safe_to_csv(merged_df, output_csv_fpath, index=False)
plutil.safe_to_pickle(merged_df, output_pkl_fpath)

merge_text_meta()
Loading authsums file C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/05_summed_auth.pkl
Loading meta file C:/Dropbox/Labor_Contracts_Canada/./data_text/canadian_meta.csv
Saving C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/07_merged_meta.pkl


  return pd.read_csv(csv_fpath)


In [137]:
# Preview only the first rows of the merged data; displaying the whole frame
# is unnecessary for a sanity check
merged_df.head(10)

Unnamed: 0,contract_id,article_num,sentence_num,statement_num,md,neg,passive,strict_modal,count,permissive_modal,...,wage_4th_inc,wage_5th_inc,wage_6th_inc,wage_7th_inc,wage_8th_inc,wage_9th_inc,wage_10th_inc,language,union_code,union_desc_eng
0,0000102a,12074,84760,696,429,57,343,380,978,49,...,5.76159,,,,,,,eng,,
1,0000103a,9017,54548,611,344,44,266,299,810,45,...,,,,,,,,eng,,
2,0000104a,4122,36906,810,398,53,294,343,780,55,...,,,,,,,,eng,,
3,0000105a,9289,24819,798,460,59,332,396,869,64,...,1.994885,2.006018,,,,,,eng,,
4,0000106a,8353,22334,759,445,62,321,384,849,61,...,2.018349,1.978417,,,,,,eng,,
5,0000201a,3837,4659,470,222,26,149,193,397,29,...,,,,,,,,eng,,
6,0000202a,3915,5104,367,229,30,154,201,417,28,...,,,,,,,,eng,,
7,0000203a,3550,5214,340,219,29,153,190,402,29,...,,,,,,,,eng,,
8,0000204a,3352,7065,441,240,28,167,207,443,33,...,,,,,,,,eng,,
9,0000205a,4029,5701,337,250,30,168,217,434,33,...,,,,,,,,eng,,
