In [1]:
# Switch the working directory to the parent folder (presumably the project
# root, so that the `src.*` imports in the next cell resolve — confirm).
# NOTE(review): not idempotent — re-running this cell moves up one more
# directory each time.
%cd ..

C:\Users\lcvri\projects_lc\lexisnexis_place_extraction


In [2]:
# third party
import pandas as pd
# local
from src.config_ import PATHS
from src.pd_helpers import normalize_cols, aggregate
from src.doc_analysis import load_lexisnexis_data

In [7]:
# Load the articles and attach the pre-computed NLP statistics to them.
articles = load_lexisnexis_data()
# NOTE: unpickling can execute arbitrary code — only read trusted files.
stats = pd.read_pickle(PATHS.results / 'df_nlp_stats.pkl').set_index('id')
# Index-on-index merge; with pandas' default (inner) only rows whose index
# value occurs in both frames are kept.
df = articles.merge(stats, left_index=True, right_index=True)

In [8]:
# Each entry is (column-name prefix, reference column, invert flag).
# Every column whose name starts with the prefix is handed to
# `normalize_cols` together with the reference column and the flag.
# Judging by later column names such as 'n_tokens/n_words' this presumably
# adds "<reference>/<column>" ratio columns — confirm in src.pd_helpers.
normalizations = [
    ('n_words', 'n_tokens', True),
    ('n_entities', 'n_tokens', True),
    ('n_sentences', 'n_tokens', True),
    ('pos', 'n_tokens', True),
    ('ent', 'n_tokens', True),
    ('unique_ent', 'n_entities', True),
]

for prefix, reference, invert in normalizations:
    # Re-read df.columns on every pass: earlier passes may have added columns.
    cols = [name for name in df.columns if name.startswith(prefix)]
    df = normalize_cols(df, cols, reference, invert=invert)

In [9]:
# Per-source summary of the raw size counts and the tokens-per-unit ratios.
count_cols = ['n_sentences', 'n_tokens', 'n_stopwords', 'n_words', 'n_entities']
ratio_cols = ['n_tokens/n_sentences', 'n_tokens/n_words', 'n_tokens/n_entities']
cols = count_cols + ratio_cols
aggregate(df, cols, 'source')

source,De Telegraaf,Leeuwarder Courant,Trouw,de Volkskrant,std
source,487.0,301.0,497.0,584.0,119.09
"(n_sentences, mean)",29.7,34.1,45.3,52.3,10.32
"(n_tokens, mean)",489.9,585.2,772.5,862.7,170.33
"(n_stopwords, mean)",202.1,249.4,337.4,375.1,79.29
"(n_words, mean)",287.8,335.9,435.0,487.6,91.06
"(n_entities, mean)",6.6,7.6,10.6,10.3,1.98
"(n_tokens/n_sentences, mean)",18.0,18.5,18.3,17.7,0.35
"(n_tokens/n_words, mean)",1.7,1.7,1.8,1.8,0.06
"(n_tokens/n_entities, mean)",137.0,133.6,144.4,146.6,6.12


In [10]:
# Per-source summary of the token count and of every token/part-of-speech
# ratio column (names containing both 'n_tokens/' and 'pos_'), alphabetically.
pos_ratio_cols = sorted(
    col for col in df.columns if 'n_tokens/' in col and 'pos_' in col
)
cols = ['n_tokens', *pos_ratio_cols]
aggregate(df, cols, 'source')

source,De Telegraaf,Leeuwarder Courant,Trouw,de Volkskrant,std
source,487.0,301.0,497.0,584.0,119.09
"(n_tokens, mean)",489.9,585.2,772.5,862.7,170.33
"(n_tokens/pos_adj, mean)",16.3,16.9,15.8,16.3,0.45
"(n_tokens/pos_adp, mean)",9.7,9.5,9.5,9.5,0.1
"(n_tokens/pos_adv, mean)",17.1,17.2,16.6,17.1,0.27
"(n_tokens/pos_conj, mean)",26.9,24.3,23.5,23.1,1.71
"(n_tokens/pos_det, mean)",10.9,10.6,10.9,10.4,0.24
"(n_tokens/pos_noun, mean)",4.5,4.5,4.6,4.5,0.05
"(n_tokens/pos_num, mean)",68.9,68.1,64.7,69.9,2.26
"(n_tokens/pos_pron, mean)",23.8,20.6,17.7,17.3,3.02


In [7]:
# `df_all` is not created anywhere in this notebook (the execution counts
# restart at this cell, so it presumably survived from an earlier kernel
# session) and raises NameError after Restart & Run All. Derive it from the
# normalized `df` instead, without mutating `df` in place.
# NOTE(review): the per-source counts and means displayed for `df_all` are
# identical to those displayed for `df`, so the two are assumed to be the
# same table — confirm.
cols = [col for col in df.columns if col.startswith('unique_ent')]
# Row-wise sum over all `unique_ent*` columns; missing values are skipped.
df_all = df.assign(n_entities_unique=df[cols].sum(axis=1, skipna=True))

In [8]:
# Per-source summary of the entity totals and of every column whose name
# contains both 'n_entities/' and 'ent_', in alphabetical order.
unique_ratio_cols = sorted(
    col for col in df_all.columns if 'n_entities/' in col and 'ent_' in col
)
cols = ['n_entities', 'n_entities_unique', *unique_ratio_cols]
aggregate(df_all, cols, 'source')

source,De Telegraaf,Leeuwarder Courant,Trouw,de Volkskrant,std
source,487.0,301.0,497.0,584.0,119.09
"(n_entities, mean)",6.6,7.6,10.6,10.3,1.98
"(n_entities_unique, mean)",3.4,3.7,5.0,5.0,0.85
"(n_entities/unique_ent_countries, mean)",3.5,3.4,3.7,3.9,0.22
"(n_entities/unique_ent_places, mean)",6.3,7.0,6.6,7.1,0.37
"(n_entities/unique_ent_places_fr, mean)",9.5,6.0,13.0,21.5,6.65
"(n_entities/unique_ent_places_nl, mean)",6.5,8.3,14.4,10.6,3.41
"(n_entities/unique_ent_places_uk, mean)",7.1,8.3,10.4,9.4,1.42


In [9]:
# Per-source summary of token and entity counts plus every column whose name
# contains both 'n_tokens/' and 'ent_', in alphabetical order.
token_ent_cols = sorted(
    col for col in df_all.columns if 'n_tokens/' in col and 'ent_' in col
)
cols = ['n_tokens', 'n_entities', *token_ent_cols]
aggregate(df_all, cols, 'source')

source,De Telegraaf,Leeuwarder Courant,Trouw,de Volkskrant,std
source,487.0,301.0,497.0,584.0,119.09
"(n_tokens, mean)",489.9,585.2,772.5,862.7,170.33
"(n_entities, mean)",6.6,7.6,10.6,10.3,1.98
"(n_tokens/ent_countries, mean)",208.0,228.6,236.3,271.3,26.36
"(n_tokens/ent_places, mean)",336.0,385.3,398.2,467.1,54.08
"(n_tokens/ent_places_fr, mean)",520.0,543.0,604.5,2037.5,741.69
"(n_tokens/ent_places_nl, mean)",379.7,411.1,647.9,627.2,140.65
"(n_tokens/ent_places_uk, mean)",353.1,340.9,421.2,482.9,65.86


In [10]:
MIN_ARTICLES = 10  # sections with fewer articles are hidden from the display


def section_averages(frame, crit1, crit2):
    """Mean statistics and article count per (source, section).

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs the columns `source`, `section`, `n_tokens` and `n_entities`.
    crit1, crit2 : str
        A column is included (in alphabetical order, after `n_tokens` and
        `n_entities`) when its name contains both substrings.

    Returns
    -------
    pandas.DataFrame
        Indexed by (source, section). Holds one `avg:<column>` column per
        selected column (group mean rounded to one decimal) followed by
        `section`, the number of rows in the group.

    Raises
    ------
    ValueError
        If no column name contains both `crit1` and `crit2`.
    """
    matched = sorted(col for col in frame.columns if crit1 in col and crit2 in col)
    if not matched:
        raise ValueError(f"no column contains both {crit1!r} and {crit2!r}")
    # Group once and reuse the grouping for both the means and the counts.
    grouped = frame.groupby(['source', 'section'])
    averages = (
        grouped[['n_tokens', 'n_entities', *matched]]
        .mean()
        .round(1)
        .add_prefix('avg:')
    )
    # The count series keeps the name `section`, so the joined column shares
    # its name with the second index level.
    return averages.join(grouped['section'].count())


df_avg_ent = section_averages(df_all, crit1='n_entities/', crit2='place')
# Keep well-populated sections, order them by the first matched ratio column
# (third column of the table), then group the rows per source again while
# leaving that order intact inside each source.
(
    df_avg_ent
    .query('section >= @MIN_ARTICLES')
    .sort_values(df_avg_ent.columns[2])
    .sort_index(level='source', sort_remaining=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,avg:n_tokens,avg:n_entities,avg:n_entities/unique_ent_places,avg:n_entities/unique_ent_places_fr,avg:n_entities/unique_ent_places_nl,avg:n_entities/unique_ent_places_uk,section
source,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
De Telegraaf,buitenland,437.8,5.8,4.9,,16.8,5.8,158
De Telegraaf,journaal,674.7,9.8,7.0,,5.0,2.0,12
De Telegraaf,financieel,431.8,6.1,7.1,,5.7,7.0,222
De Telegraaf,binnenland,633.2,8.5,7.8,10.0,7.4,11.1,69
Leeuwarder Courant,in het nieuws,502.8,6.5,5.9,3.0,11.1,7.7,95
Leeuwarder Courant,economie,337.3,4.5,6.1,3.2,4.7,4.0,53
Leeuwarder Courant,advertenties,510.1,8.4,6.3,13.4,9.5,9.8,74
Leeuwarder Courant,buitenland,553.8,10.7,7.8,,19.5,12.2,14
Leeuwarder Courant,algemeen,838.3,8.3,8.3,3.1,6.0,9.9,24
Trouw,service,493.1,2.3,2.0,,2.5,2.5,14


In [11]:
# Per-(source, section) averages for the columns whose names contain both
# 'n_tokens/' and 'place'. Same steps as the previous cell, which used
# 'n_entities/' — a shared helper taking the two criteria would avoid the
# duplication.
crit1, crit2 = 'n_tokens/', 'place'
cols = ['n_tokens', 'n_entities'] + sorted(
    col for col in df_all.columns if crit1 in col and crit2 in col
)
by_section = df_all.groupby(['source', 'section'])
df_avg_ent = by_section[cols].agg(['mean']).round(1)
# Flatten the (column, 'mean') column MultiIndex to plain "avg:<column>" labels.
df_avg_ent.columns = [f"avg:{col[0]}" for col in df_avg_ent.columns.values]
# Append the number of articles per group; the new column is named `section`.
df_avg_ent = df_avg_ent.join(by_section['section'].count())
# Show only groups with at least 10 articles, ordered by the first matched
# ratio column within each source.
well_populated = df_avg_ent.query('section >= 10')
well_populated.sort_values(f"avg:{cols[2]}").sort_index(
    level='source', sort_remaining=False
)

Unnamed: 0_level_0,Unnamed: 1_level_0,avg:n_tokens,avg:n_entities,avg:n_tokens/ent_places,avg:n_tokens/ent_places_fr,avg:n_tokens/ent_places_nl,avg:n_tokens/ent_places_uk,section
source,section,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
De Telegraaf,buitenland,437.8,5.8,249.7,,421.8,349.0,158
De Telegraaf,binnenland,633.2,8.5,318.2,125.0,425.1,468.4,69
De Telegraaf,journaal,674.7,9.8,356.1,,361.1,662.0,12
De Telegraaf,financieel,431.8,6.1,390.1,,357.2,324.9,222
Leeuwarder Courant,buitenland,553.8,10.7,221.8,,687.0,436.8,14
Leeuwarder Courant,economie,337.3,4.5,281.3,154.4,257.3,226.8,53
Leeuwarder Courant,advertenties,510.1,8.4,347.4,886.6,421.6,374.0,74
Leeuwarder Courant,in het nieuws,502.8,6.5,360.5,492.5,482.1,319.7,95
Leeuwarder Courant,algemeen,838.3,8.3,643.7,362.6,422.0,699.4,24
Trouw,economie,522.9,8.7,279.3,,236.2,314.6,70
