In [None]:
# Put the parent directory first on the import path so the project-local
# `src` package imported further down can be resolved.
# NOTE(review): assumes the notebook is executed from a direct subdirectory
# of the project root — confirm.
import sys
sys.path.insert(0, '../')

In [None]:
# standard library
import pickle
# third party
import pandas as pd
from spacy.util import load_model
# local
from src.config import PARAM, PATH_MODEL, PATH_DATA_P, PATH_RESULTS
from src.spacy_helpers import serialize_batch, fetch_docs
from src.doc_analysis import basic_stats, attribute_counter, most_common
# Batch identifiers taken from the project configuration; every processing
# step below iterates over this collection.
batches = PARAM.lexisnexis.batches

### Serialize LexisNexis documents

In [None]:
# Load the spaCy pipeline stored at PATH_MODEL. Its vocab is reused later
# when the serialized documents are read back with fetch_docs.
nlp = load_model(PATH_MODEL)

In [None]:
# Run the serialization helper once per configured batch.
# NOTE(review): presumably this parses the batch with `nlp` and writes the
# resulting docs to disk where fetch_docs finds them — confirm in
# src.spacy_helpers.
for batch in batches:
    serialize_batch(nlp, batch)

### Store some general stats

In [None]:
# Build one DataFrame of per-document statistics across all batches and
# persist it to PATH_RESULTS / 'df_nlp_stats.pkl'.
#
# The per-batch frames are collected in a list and concatenated once at the
# end: DataFrame.append (used previously) was deprecated in pandas 1.4 and
# removed in pandas 2.0, and calling it inside the loop re-copied the whole
# accumulated frame on every iteration.
stats_frames = []
for batch in batches:
    # One record per document, exactly as returned by basic_stats.
    records = [
        basic_stats(doc)
        for doc in fetch_docs(PATH_DATA_P / batch, nlp.vocab)
    ]
    if not records:
        # A batch without documents contributes no rows or columns.
        continue

    df_doc = pd.DataFrame(records)
    # Normalise the column labels to lower case.
    df_doc.columns = [col.lower() for col in df_doc.columns]
    stats_frames.append(df_doc)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# batch produced any documents (matching the previous end result).
df_stats = pd.concat(stats_frames, sort=False) if stats_frames else pd.DataFrame()
df_stats.to_pickle(PATH_RESULTS / 'df_nlp_stats.pkl')

### Store entity and token counts

In [None]:
# Aggregate, per batch, the attribute counts returned by attribute_counter
# for every document: once with default arguments ("totals") and once with
# unique=True ("unique"). Non-empty failure results from the default call
# are collected in all_fails, one entry per document.
all_fails = []
batches_totals = {}
batches_unique = {}
for batch in batches:
    running_totals, running_unique = {}, {}
    for doc in fetch_docs(PATH_DATA_P / batch, nlp.vocab):
        doc_totals, doc_fails = attribute_counter(doc)
        # Failures reported by the unique=True call are deliberately ignored.
        doc_unique, _ = attribute_counter(doc, unique=True)
        if doc_fails:
            all_fails.append(doc_fails)

        # Fold this document's counts into the batch-level accumulators:
        # the first occurrence of a key stores the value as-is, later
        # occurrences are combined with `+`.
        for running, current in (
            (running_totals, doc_totals),
            (running_unique, doc_unique),
        ):
            for key in current:
                if key in running:
                    running[key] = running[key] + current[key]
                else:
                    running[key] = current[key]

    batches_totals[batch] = running_totals
    batches_unique[batch] = running_unique

In [None]:
# Count the individual failure items across all per-document failure
# collections gathered in all_fails, then display the total.
n_fails = sum(1 for doc_fails in all_fails for _ in doc_fails)
n_fails

In [None]:
# Write each result object to its own pickle file in PATH_RESULTS; the
# dictionary key doubles as the file name (with a .pkl extension).
d = {
    'dct_total_tokens_and_entities': batches_totals,
    'dct_unique_tokens_and_entities': batches_unique,
    'lst_unrecognized_tokens': all_fails,
}
for stem, payload in d.items():
    target = PATH_RESULTS / f"{stem}.pkl"
    with open(target, 'wb') as out_file:
        pickle.dump(payload, out_file)