In [None]:
import pandas as pd
from spacy.util import load_model

from src.config import PARAM, PATH_MODEL, PATH_DATA_I, PATH_DATA_P
from src.pd_helpers import normalize_cols, aggregate
from src.spacy_helpers import fetch_docs
from src.doc_analysis import basic_stats

# Batch identifiers from the project configuration; each one is used further
# down to build the path of a pickled DataFrame and of a document directory.
batches = PARAM.batches

In [None]:
# Load the spaCy pipeline stored at PATH_MODEL. Its vocab (nlp.vocab) is what
# gets handed to fetch_docs when the stored documents are read back.
nlp = load_model(PATH_MODEL)

In [None]:
# Build `df_all`: for every batch, combine the pickled metadata frame with
# per-document statistics and stack the per-batch results.

# Each entry is (column-name prefix, reference column, invert flag) and is
# forwarded to normalize_cols for every column starting with the prefix.
# The list does not depend on the batch, so it is defined once up front.
# NOTE(review): judging by the column names used in later cells, invert=True
# appears to produce '<reference>/<column>' ratio columns -- confirm against
# src.pd_helpers.normalize_cols.
normalizations = [
    ('n_words',     'n_tokens',    True),
    ('n_entities',  'n_tokens',    True),
    ('n_sentences', 'n_tokens',    True),
    ('pos',         'n_tokens',    True),
    ('ent',         'n_tokens',    True),
    ('uniq_ent',    'n_entities',  True),
]

# Per-batch frames are collected here and concatenated once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside the loop re-copied the accumulated frame every iteration.
batch_frames = []
for batch in batches:
    # NOTE(review): unpickling can execute arbitrary code -- only load
    # pickle files that come from a trusted source.
    df = pd.read_pickle(PATH_DATA_I / f'{batch}_.pkl')

    # One basic_stats record per document of the batch.
    pre_df = [basic_stats(doc) for doc in fetch_docs(PATH_DATA_P / batch, nlp.vocab)]

    df_doc = pd.DataFrame(pre_df)
    df_doc.columns = [col.lower() for col in df_doc.columns]
    # Tokens that are not stop words.
    df_doc['n_words'] = df_doc['n_tokens'] - df_doc['n_stopwords']

    for prefix, reference, invert in normalizations:
        # The prefix match is re-evaluated each round because normalize_cols
        # returns a new frame whose columns may have changed.
        cols = [col for col in df_doc.columns if col.startswith(prefix)]
        df_doc = normalize_cols(df_doc, cols, reference, invert=invert)

    # Index-on-index join.
    # NOTE(review): assumes the index of `df` lines up with the index of the
    # statistics frame built from fetch_docs' iteration order -- TODO confirm.
    df_ = df.merge(df_doc, left_index=True, right_index=True)
    batch_frames.append(df_)

# pd.concat raises on an empty list, so keep the original "no batches ->
# empty DataFrame" outcome explicitly.
df_all = pd.concat(batch_frames, sort=False) if batch_frames else pd.DataFrame()

In [None]:
# Raw per-document counts first, then the n_tokens-based ratio columns;
# the selection is passed to aggregate together with the 'source' column.
cols = ['n_sentences', 'n_tokens', 'n_stopwords', 'n_words', 'n_entities']
cols += ['n_tokens/n_sentences', 'n_tokens/n_words', 'n_tokens/n_entities']
aggregate(df_all, cols, 'source')

In [None]:
# 'n_tokens' followed by the alphabetically sorted columns whose names
# contain both 'n_tokens/' and 'pos_'.
pos_ratio_cols = sorted(
    col for col in df_all.columns
    if 'n_tokens/' in col and 'pos_' in col
)
cols = ['n_tokens', *pos_ratio_cols]
aggregate(df_all, cols, 'source')

In [None]:
# 'n_entities' followed by the alphabetically sorted columns whose names
# contain both 'n_entities/' and 'ent_'.
ent_share_cols = sorted(
    col for col in df_all.columns
    if 'n_entities/' in col and 'ent_' in col
)
cols = ['n_entities', *ent_share_cols]
aggregate(df_all, cols, 'source')

In [None]:
# 'n_tokens' and 'n_entities' followed by the alphabetically sorted columns
# whose names contain both 'n_tokens/' and 'ent_'.
ent_token_cols = sorted(
    col for col in df_all.columns
    if 'n_tokens/' in col and 'ent_' in col
)
cols = ['n_tokens', 'n_entities', *ent_token_cols]
aggregate(df_all, cols, 'source')

In [None]:
# Per (source, section) means, rounded to one decimal, of the token and
# entity counts plus every column whose name contains both criteria below.
crit1 = 'n_entities/'
crit2 = 'place'
matching_cols = sorted(col for col in df_all.columns if crit1 in col and crit2 in col)
cols = ['n_tokens', 'n_entities', *matching_cols]

by_source_section = df_all.groupby(['source', 'section'])
df_avg_ent = by_source_section[cols].agg(['mean']).round(1)
# agg(['mean']) yields (column, 'mean') tuples; keep only the column part.
df_avg_ent.columns = [f"avg:{name}" for name, _agg in df_avg_ent.columns.values]
# Adds the group size as a column that is itself named 'section'.
df_avg_ent = df_avg_ent.join(by_source_section['section'].count())

# Keep groups with at least 10 rows, order by the first matching column,
# then regroup the rows by source.
(
    df_avg_ent
    .query('section >= 10')
    .sort_values(f"avg:{cols[2]}")
    .sort_index(level='source', sort_remaining=False)
)

In [None]:
# Per (source, section) means, rounded to one decimal, of the token and
# entity counts plus every column whose name contains both criteria below.
crit1 = 'n_tokens/'
crit2 = 'place'
matching_cols = sorted(col for col in df_all.columns if crit1 in col and crit2 in col)
cols = ['n_tokens', 'n_entities', *matching_cols]

by_source_section = df_all.groupby(['source', 'section'])
df_avg_ent = by_source_section[cols].agg(['mean']).round(1)
# agg(['mean']) yields (column, 'mean') tuples; keep only the column part.
df_avg_ent.columns = [f"avg:{name}" for name, _agg in df_avg_ent.columns.values]
# Adds the group size as a column that is itself named 'section'.
df_avg_ent = df_avg_ent.join(by_source_section['section'].count())

# Keep groups with at least 10 rows, order by the first matching column,
# then regroup the rows by source.
(
    df_avg_ent
    .query('section >= 10')
    .sort_values(f"avg:{cols[2]}")
    .sort_index(level='source', sort_remaining=False)
)