In [None]:
# Change the working directory to the parent folder, presumably so that the
# `src` package imported below is importable (TODO confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level.
%cd ..

In [None]:
# standard library
import json
import pickle
from itertools import combinations

# third party
import pandas as pd
from rbo.rbo import rbo

# local
from src.config_ import PATHS, LEXISNEXIS
from src.doc_analysis import most_common

Load total and unique token and entity counts.

In [None]:
# Pickle file name (inside PATHS.results) for each way of counting.
files = dict(
    total='dct_total_tokens_and_entities.pkl',
    unique='dct_unique_tokens_and_entities.pkl',
)

# d maps the count type ('total' / 'unique') to the unpickled object.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
d = {}
for count_type in files:
    pickle_path = PATHS.results / files[count_type]
    with open(pickle_path, 'rb') as handle:
        d[count_type] = pickle.load(handle)

Merge alternative country labels ("alts") into their main label: delete each alt from the data and add its count to the main key.

In [None]:
# alts_countries maps a main country label to an iterable of alternative
# labels ("alts") whose counts should be folded into the main label.
with open(PATHS.parameters / 'alts_countries.json', 'r', encoding='utf8') as f:
    alts_countries = json.load(f)

# For every count type and batch, move the count of each alt onto its main
# country label and remove the alt.
# Alts that are absent from a batch are skipped. This keeps the cell safe to
# re-run (the alts are already gone the second time), avoids a KeyError on
# plain dicts, and avoids creating zero-count entries for main labels that
# never occurred (which would inflate the entity totals computed with len()).
for count_type in d:
    for batch in LEXISNEXIS.batches:
        country_counts = d[count_type][batch]['countries']
        for country, alts in alts_countries.items():
            for alt in alts:
                if alt not in country_counts:
                    continue
                # Pop first so that an alt equal to its own main label is
                # not counted twice.
                n = country_counts.pop(alt)
                country_counts[country] = country_counts.get(country, 0) + n

## Unique entities per type and newspaper

In [None]:
# Geographic entity types analysed below; each one is used as a key into the
# per-batch count data in `d`.
geo_entities = ['countries', 'places', 'places_uk', 'places_nl', 'places_fr']

In [None]:
# One row per (entity type, batch) holding the number of distinct labels,
# i.e. the number of keys stored for that entity type in that batch.
rows = [
    [ent, batch, len(d['total'][batch][ent])]
    for ent in geo_entities
    for batch in d['total']
]

# Pivot so that batches are the rows and entity types the columns.
(
    pd.DataFrame(rows, columns=['geo_entity', 'source', 'count'])
    .set_index(['geo_entity', 'source'])
    .unstack(0)
)

## Number of occurrences per entity type and label
Each occurrence is counted.

In [None]:
# Build one table with the result of most_common (n=12) for every geo entity
# type, based on the total counts. The entity type becomes the outer index
# level 'entity_type'.
# DataFrame.append was removed in pandas 2.0, so the per-entity frames are
# collected first and concatenated once.
frames = [most_common(d['total'], ent, n=12) for ent in geo_entities]
df = pd.concat(frames, keys=geo_entities, names=['entity_type'])

# Store the 'count' columns (second column level) as nullable integers.
# Presumably they hold NaN where a batch has fewer than n labels, which
# would otherwise force a float dtype (TODO confirm against most_common).
# The columns are replaced explicitly because DataFrame.update is not
# guaranteed to change the dtype of an existing column.
for col in df.columns:
    if col[1] == 'count':
        df[col] = df[col].astype('Int64')
df

## Number of unique occurrences per entity type and label
Labels are only counted once per article.

In [None]:
# Build one table with the result of most_common (n=12) for every geo entity
# type, based on the unique (once per article) counts. The entity type
# becomes the outer index level 'entity_type'.
# DataFrame.append was removed in pandas 2.0, so the per-entity frames are
# collected first and concatenated once.
frames = [most_common(d['unique'], ent, n=12) for ent in geo_entities]
df = pd.concat(frames, keys=geo_entities, names=['entity_type'])

# Store the 'count' columns (second column level) as nullable integers.
# Presumably they hold NaN where a batch has fewer than n labels, which
# would otherwise force a float dtype (TODO confirm against most_common).
# The columns are replaced explicitly because DataFrame.update is not
# guaranteed to change the dtype of an existing column.
for col in df.columns:
    if col[1] == 'count':
        df[col] = df[col].astype('Int64')
df

## Compare rankings between newspapers
First compare the newspapers within each entity type:

In [None]:
# Rank-biased overlap (p=0.9) between the label rankings of every pair of
# sources, computed separately for each geo entity type.
results = pd.DataFrame()
for ent in geo_entities:
    # Label columns of this entity type, one column per source.
    rankings = df.xs(ent).xs('label', axis=1, level=1)
    for pair in combinations(rankings.columns, 2):
        left, right = pair
        scores = pd.Series(
            data=rbo(rankings[left], rankings[right], 0.9),
            name=(ent, pair),
        )
        # The first series starts the frame; later ones are joined as columns.
        results = scores.to_frame() if results.empty else results.join(scores, how='outer')

# One row per (entity type, pair of sources).
results = results.T
results.index = pd.MultiIndex.from_tuples(results.index, names=('geo_entity', 'comparison'))

In [None]:
# Display the comparisons sorted in descending order by 'geo_entity' and then
# by 'min' (presumably the lower-bound score returned by rbo — TODO confirm).
results.sort_values(by=['geo_entity', 'min'], ascending=False)

Then compare the newspapers when the labels of all entity types are combined into a single ranking per newspaper:

In [None]:
# Rank-biased overlap (p=0.8) between every pair of sources, using a single
# ranking per source: the labels of all entity types sorted by descending count.
results = pd.DataFrame()
rankings = df.xs('label', axis=1, level=1)
for pair in combinations(rankings.columns, 2):
    first, second = pair
    first_labels = df.xs(first, axis=1).sort_values('count', ascending=False)['label']
    second_labels = df.xs(second, axis=1).sort_values('count', ascending=False)['label']
    scores = pd.Series(data=rbo(first_labels, second_labels, 0.8), name=pair)
    # The first series starts the frame; later ones are joined as columns.
    results = scores.to_frame() if results.empty else results.join(scores, how='outer')

# One row per pair of sources.
results = results.T

In [None]:
# Display the pairwise comparisons sorted by the 'min' column, highest first.
results.sort_values('min', ascending=False)

## Number of lemma occurrences
All occurrences are counted.

In [None]:
# Result of most_common for the 'lemma' counts with n=15, based on the total
# counts (presumably the 15 most frequent lemmas per source — TODO confirm).
most_common(d['total'], 'lemma', n=15)

## Number of unique lemma occurrences
Lemmas are only counted once per article.

In [None]:
# Result of most_common for the 'lemma' counts with n=15, based on the unique
# counts (presumably the 15 most frequent lemmas per source — TODO confirm).
most_common(d['unique'], 'lemma', n=15)