In [None]:
import pandas as pd
from spacy.util import load_model

from src.config import PARAM, PATH_MODEL, PATH_DATA_P, PATH_RESULTS
from src.spacy_helpers import fetch_docs
from src.doc_analysis import attribute_counter, most_common

# Batch identifiers from the project config. Each one is used below as a path
# component under PATH_DATA_P and as a column label in the count tables.
batches = PARAM.lexisnexis.batches

In [None]:
# Load the spaCy pipeline stored at PATH_MODEL. In the cells visible here only
# its vocab is used (handed to fetch_docs).
nlp = load_model(PATH_MODEL)

In [None]:
# Walk every document of every batch and accumulate, per batch, two dicts of
# per-attribute counts: the plain counts and the counts obtained with
# unique=True. Non-empty failure reports from the plain pass go to all_fails.
all_fails = []
batches_totals = {}
batches_unique = {}
for batch in batches:
    batch_totals = {}
    batch_unique = {}
    for doc in fetch_docs(PATH_DATA_P / batch, nlp.vocab):
        totals, fails = attribute_counter(doc)
        unique, _ = attribute_counter(doc, unique=True)
        if fails:
            all_fails.append(fails)
        # Same merge rule for both accumulators: the first time a key is seen
        # the document's value is stored as-is, afterwards values are combined
        # with the non-mutating `+` so stored objects are never modified.
        for accumulated, counts in ((batch_totals, totals), (batch_unique, unique)):
            for key in counts:
                accumulated[key] = (
                    accumulated[key] + counts[key] if key in accumulated else counts[key]
                )
    batches_totals[batch] = batch_totals
    batches_unique[batch] = batch_unique

In [None]:
# Build one table with a column per batch: each batch's nested counts are
# turned into a frame, stacked into a single column named after the batch, and
# outer-joined on the index so entries missing from a batch remain as NaN.
df_all = pd.DataFrame()
for batch in batches:
    df = (
        pd.DataFrame.from_dict(batches_totals[batch], orient='index')
        .stack()
        .to_frame()
        .rename(columns={0: batch})
    )
    # The first non-empty frame seeds the table; later ones are merged in with
    # the new batch on the left-hand side.
    df_all = (
        df
        if df_all.empty
        else df.merge(df_all, how='outer', left_index=True, right_index=True)
    )

In [None]:
# Persist the merged per-batch count table as a pickle file under PATH_RESULTS.
df_all.to_pickle(PATH_RESULTS / 'df_attribute_counts.pkl')

In [None]:
# Total number of individual failures: every item of every collected failure
# report counts once.
n_fails = sum(1 for report in all_fails for _ in report)
n_fails

In [None]:
# Attribute names that the following cells tabulate and rank; each is used as a
# key into the per-batch count dicts. Presumably these are the geographic
# entity types (going by the name) -- confirm against attribute_counter.
geo_ents = ['countries', 'places', 'places_uk', 'places_nl', 'places_fr']

In [None]:
# One row per (attribute, batch) pair holding the number of entries stored for
# that attribute in that batch, then pivoted so attributes become columns and
# batches become rows.
data = [
    [geo_ent, batch, len(batches_totals[batch][geo_ent])]
    for geo_ent in geo_ents
    for batch in batches_totals
]
(
    pd.DataFrame(data, columns=['geo_entity', 'source', 'count'])
    .set_index(['geo_entity', 'source'])
    .unstack(0)
)

In [None]:
# most_common tables (n=12) for each attribute in geo_ents, computed from the
# total counts and stacked under an outer 'entity_type' index level.
#
# The per-attribute frames are collected first and concatenated once.
# DataFrame.append, used here previously, was deprecated in pandas 1.4 and
# removed in 2.0, and it re-copied the growing frame on every iteration.
frames = [most_common(batches_totals, geo_ent, n=12) for geo_ent in geo_ents]
df = pd.concat(frames, keys=geo_ents, names=['entity_type'])
# Select the columns whose second level is 'count', cast them to the nullable
# Int64 dtype and write them back in place (presumably so counts render as
# integers despite missing values -- confirm).
df.update(df.xs('count', level=1, axis=1, drop_level=False).astype('Int64'))
df

In [None]:
# most_common tables (n=12) for each attribute in geo_ents, computed from the
# counts gathered with unique=True and stacked under an outer 'entity_type'
# index level.
#
# The per-attribute frames are collected first and concatenated once.
# DataFrame.append, used here previously, was deprecated in pandas 1.4 and
# removed in 2.0, and it re-copied the growing frame on every iteration.
frames = [most_common(batches_unique, geo_ent, n=12) for geo_ent in geo_ents]
df = pd.concat(frames, keys=geo_ents, names=['entity_type'])
# Select the columns whose second level is 'count', cast them to the nullable
# Int64 dtype and write them back in place (presumably so counts render as
# integers despite missing values -- confirm).
df.update(df.xs('count', level=1, axis=1, drop_level=False).astype('Int64'))
df

In [None]:
# Display most_common's summary of the 'lemma' attribute over the total counts
# (n=15 is presumably the number of top entries -- confirm in src.doc_analysis).
most_common(batches_totals, 'lemma', n=15)

In [None]:
# Same summary for 'lemma', but over the counts gathered with unique=True
# (n=15 is presumably the number of top entries -- confirm in src.doc_analysis).
most_common(batches_unique, 'lemma', n=15)