In [1]:
%cd ..

C:\Users\lcvri\projects_lc\lexisnexis_place_extraction


In [2]:
# standard library
import json
import pickle
from itertools import combinations

# third party
import pandas as pd
from rbo.rbo import rbo

# local
from src.config_ import PATHS, LEXISNEXIS
from src.doc_analysis import most_common

Load total and unique token and entity counts.

In [3]:
# Pickled count dictionaries in the results folder, keyed by count type.
files = dict(
    total='dct_total_tokens_and_entities.pkl',
    unique='dct_unique_tokens_and_entities.pkl',
)


def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# d[count_type] is the unpickled content of the matching file.
d = {count_type: _read_pickle(PATHS.results / name) for count_type, name in files.items()}

Merge alternative country names ("alts"): remove each alternative name from the counts and add its count to the canonical country name.

In [4]:
# Maps each canonical country name to its alternative names.
with open(PATHS.parameters / 'alts_countries.json', 'r', encoding='utf8') as f:
    alts_countries = json.load(f)

# Fold the count of every alternative name into its canonical country,
# for each count type and each newspaper batch.
for count_type in d:
    for batch in LEXISNEXIS.batches:
        countries = d[count_type][batch]['countries']
        for country, alts in alts_countries.items():
            for alt in alts:
                # pop(..., 0) tolerates alternatives that do not occur in this
                # batch and makes the cell safe to re-run; get(..., 0) covers a
                # canonical name that has no count of its own yet.
                n = countries.pop(alt, 0)
                countries[country] = countries.get(country, 0) + n

## Unique entities per type and newspaper

In [5]:
# Entity types used as keys into the per-batch count dictionaries.
geo_entities = [
    'countries',
    'places',
    'places_uk',
    'places_nl',
    'places_fr',
]

In [6]:
# One record per (entity type, source): the number of entries in that count dictionary.
entity_counts = [
    [ent, batch, len(counts_by_type[ent])]
    for ent in geo_entities
    for batch, counts_by_type in d['total'].items()
]
counts_long = pd.DataFrame(entity_counts, columns=['geo_entity', 'source', 'count'])
# Pivot to one row per source and one column per entity type.
counts_long.set_index(['geo_entity', 'source']).unstack(0)

Unnamed: 0_level_0,count,count,count,count,count
geo_entity,countries,places,places_fr,places_nl,places_uk
source,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
leeuwarder_courant,52,61,11,32,19
telegraaf,56,66,2,39,36
trouw,82,119,1,41,49
volkskrant,86,153,2,57,72


## Number of occurrences per entity type and label
Each occurrence is counted.

In [7]:
# Top labels per source for every geographic entity type, from the total counts.
# The per-entity frames are collected and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0, and appending inside
# a loop copies the growing frame on every iteration.
frames = []
for ent in geo_entities:
    df_ = most_common(d['total'], ent, n=12)
    # Add the entity type as the outer index level.
    frames.append(pd.concat([df_], keys=[ent], names=['entity_type']))
df = pd.concat(frames)
# Write the count columns back as nullable integers (Int64).
df.update(df.xs('count', level=1, axis=1, drop_level=False).astype('Int64'))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,volkskrant,volkskrant,trouw,trouw,telegraaf,telegraaf,leeuwarder_courant,leeuwarder_courant
Unnamed: 0_level_1,Unnamed: 1_level_1,label,count,label,count,label,count,label,count
entity_type,ranking,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
countries,0,Nederland,637,Verenigd Koninkrijk,502.0,Nederland,452,Verenigd Koninkrijk,310
countries,1,Verenigd Koninkrijk,436,Nederland,494.0,Verenigd Koninkrijk,360,Nederland,301
countries,2,Frankrijk,336,Verenigde Staten,362.0,Verenigde Staten,136,Verenigde Staten,89
countries,3,Duitsland,261,China,224.0,Duitsland,88,Frankrijk,64
countries,4,Verenigde Staten,239,Duitsland,193.0,Frankrijk,85,China,52
countries,5,Rusland,109,Rusland,155.0,China,63,Duitsland,51
countries,6,Polen,94,Frankrijk,153.0,Gibraltar,47,Gibraltar,44
countries,7,China,80,Turkije,102.0,Spanje,46,Ierland,37
countries,8,Ierland,68,Ierland,95.0,Ierland,45,Rusland,32
countries,9,Spanje,60,Spanje,71.0,Polen,30,Spanje,32


## Number of unique occurrences per entity type and label
Labels are only counted once per article.

In [8]:
# Top labels per source for every geographic entity type, from the unique
# (once-per-article) counts.
# The per-entity frames are collected and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0, and appending inside
# a loop copies the growing frame on every iteration.
frames = []
for ent in geo_entities:
    df_ = most_common(d['unique'], ent, n=12)
    # Add the entity type as the outer index level.
    frames.append(pd.concat([df_], keys=[ent], names=['entity_type']))
df = pd.concat(frames)
# Write the count columns back as nullable integers (Int64).
df.update(df.xs('count', level=1, axis=1, drop_level=False).astype('Int64'))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,volkskrant,volkskrant,trouw,trouw,telegraaf,telegraaf,leeuwarder_courant,leeuwarder_courant
Unnamed: 0_level_1,Unnamed: 1_level_1,label,count,label,count,label,count,label,count
entity_type,ranking,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
countries,0,Verenigd Koninkrijk,289,Verenigd Koninkrijk,291.0,Verenigd Koninkrijk,225,Verenigd Koninkrijk,166
countries,1,Nederland,210,Verenigde Staten,183.0,Nederland,160,Nederland,108
countries,2,Verenigde Staten,159,Nederland,182.0,Verenigde Staten,91,Verenigde Staten,64
countries,3,Duitsland,122,Frankrijk,90.0,Duitsland,54,Duitsland,37
countries,4,Frankrijk,114,Duitsland,88.0,Frankrijk,53,Frankrijk,35
countries,5,Polen,51,Ierland,52.0,China,32,Ierland,25
countries,6,Rusland,46,Rusland,46.0,Ierland,30,China,13
countries,7,China,42,China,45.0,Polen,20,Rusland,12
countries,8,Ierland,41,Polen,39.0,Spanje,18,Spanje,12
countries,9,Spanje,38,Spanje,38.0,Luxemburg,13,Turkije,11


## Compare rankings between newspapers
First compare the newspapers' rankings within each entity type:

In [9]:
# Rank-biased overlap (RBO) between every pair of newspapers, computed
# separately for each entity type on the most recently built ranking table `df`.
results = pd.DataFrame()
for ent in geo_entities:
    rankings = df.xs(ent).xs('label', axis=1, level=1)
    for combo in combinations(rankings.columns, 2):
        # A source with fewer labels than the requested top-n is padded with
        # NaN. Drop that padding so it is not scored as a ranked item.
        ranking1 = rankings[combo[0]].dropna().tolist()
        ranking2 = rankings[combo[1]].dropna().tolist()
        if not ranking1 or not ranking2:
            # Nothing to compare when a source has no labels of this type.
            continue
        data = rbo(ranking1, ranking2, 0.9)
        s = pd.Series(data=data, name=(ent, combo))
        if results.empty:
            results = s.to_frame()
        else:
            results = results.join(s, how='outer')
# One row per (entity type, newspaper pair), one column per RBO score.
results = results.T
results.index = pd.MultiIndex.from_tuples(results.index)
results.index.names = ('geo_entity', 'comparison')

In [10]:
# Sort descending by entity type, then by the `min` score within each type.
results.sort_values(by=['geo_entity', 'min'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,res,ext
geo_entity,comparison,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
places_uk,"(telegraaf, leeuwarder_courant)",0.660951,0.168216,0.714316
places_uk,"(volkskrant, trouw)",0.65017,0.161465,0.71243
places_uk,"(volkskrant, telegraaf)",0.594364,0.168216,0.64773
places_uk,"(volkskrant, leeuwarder_courant)",0.583086,0.174041,0.627558
places_uk,"(trouw, leeuwarder_courant)",0.545584,0.168216,0.59895
places_uk,"(trouw, telegraaf)",0.510522,0.174041,0.554993
places_nl,"(volkskrant, trouw)",0.768946,0.144495,0.848995
places_nl,"(trouw, leeuwarder_courant)",0.726568,0.161465,0.788828
places_nl,"(volkskrant, leeuwarder_courant)",0.714356,0.161465,0.776616
places_nl,"(trouw, telegraaf)",0.710766,0.161465,0.773027


Then compare the newspapers after combining the counts of all entity types into a single ranking:

In [28]:
# RBO between every pair of newspapers on a single ranking that pools the
# labels of all entity types, ordered by count.
# NOTE(review): a label that is listed under more than one entity type would
# appear more than once in the pooled ranking — confirm whether that can
# happen and whether it is intended.
results = pd.DataFrame()
sources = df.xs('label', axis=1, level=1).columns
for combo in combinations(sources, 2):
    # Rows padded with NaN (sources with fewer labels than the requested
    # top-n) are dropped so the padding is not scored as a ranked item.
    ranking1 = df.xs(combo[0], axis=1).sort_values('count', ascending=False)['label'].dropna().tolist()
    ranking2 = df.xs(combo[1], axis=1).sort_values('count', ascending=False)['label'].dropna().tolist()
    if not ranking1 or not ranking2:
        # Nothing to compare when a source has no labels at all.
        continue
    data = rbo(ranking1, ranking2, 0.8)
    s = pd.Series(data=data, name=combo)
    if results.empty:
        results = s.to_frame()
    else:
        results = results.join(s, how='outer')
# One row per newspaper pair, one column per RBO score.
results = results.T

In [29]:
# Show the newspaper pairs ordered by their `min` score, highest first.
results.sort_values('min', ascending=False)

Unnamed: 0,Unnamed: 1,min,res,ext
volkskrant,leeuwarder_courant,0.925002,2e-06,0.925003
volkskrant,telegraaf,0.91081,2e-06,0.910811
telegraaf,leeuwarder_courant,0.878022,2e-06,0.878023
trouw,leeuwarder_courant,0.824253,2e-06,0.824254
volkskrant,trouw,0.813513,2e-06,0.813514
trouw,telegraaf,0.770114,2e-06,0.770115


## Number of lemma occurrences
All occurrences are counted.

In [30]:
# Lemma ranking per newspaper from the total counts, requesting the top 15 (n=15).
most_common(d['total'], 'lemma', n=15)

Unnamed: 0_level_0,volkskrant,volkskrant,trouw,trouw,telegraaf,telegraaf,leeuwarder_courant,leeuwarder_courant
Unnamed: 0_level_1,label,count,label,count,label,count,label,count
ranking,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,gaan,1500,eu,1365,brexit,927,brexit,590
1,eu,1376,gaan,1172,gaan,865,gaan,532
2,brexit,1305,brexit,1154,eu,823,jaar,516
3,jaar,1262,jaar,1139,jaar,754,komen,484
4,komen,1242,groot,1053,komen,695,groot,482
5,groot,1235,komen,1053,brits,670,eu,470
6,europees,1165,land,1033,groot,578,may,442
7,maken,955,europees,855,europees,490,brits,401
8,zeggen,935,brits,826,goed,473,europees,391
9,land,865,zien,710,land,469,partij,354


## Number of unique lemma occurrences
Lemmas are only counted once per article.

In [31]:
# Lemma ranking per newspaper from the unique counts, requesting the top 15 (n=15).
most_common(d['unique'], 'lemma', n=15)

Unnamed: 0_level_0,volkskrant,volkskrant,trouw,trouw,telegraaf,telegraaf,leeuwarder_courant,leeuwarder_courant
Unnamed: 0_level_1,label,count,label,count,label,count,label,count
ranking,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,brexit,567,brexit,483,brexit,478,brexit,293
1,gaan,451,gaan,380,gaan,326,groot,216
2,komen,436,groot,377,komen,305,jaar,193
3,groot,435,komen,368,jaar,295,komen,193
4,jaar,400,jaar,351,groot,274,gaan,187
5,maken,390,maken,313,brits,267,maken,171
6,staan,351,land,300,maken,255,brits,162
7,zien,336,brits,294,goed,235,eu,151
8,goed,335,goed,286,eu,227,goed,144
9,land,331,zien,280,land,218,nieuw,143
