# Quality Assessment

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

## Import data

In [2]:
input_path = "data/"

decades = list(range(1840, 2000, 10))

In [3]:
mentions_fr = pd.read_pickle(input_path + "mentions_fr.pkl")
mentions_de = pd.read_pickle(input_path + 'mentions_de.pkl')
all_mentions = pd.concat([mentions_fr, mentions_de])

## Sample and Check articles per decade

In [4]:
def sample_per_dec(df, decade, n=5):
    df_dec = df[df['decade']==decade]

    if len(df_dec) < n:
        return df_dec
    else:
        return df_dec.sample(n, random_state=2023)

In [5]:
checked = pd.DataFrame()

### 1840

In [6]:
#French
fr1840 = sample_per_dec(mentions_fr, 1840)
fr1840

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
635366,org.ent.pressagency.Havas,Estafette,Q2826560,1,10,8,1216,1225,LCG-1849-06-23-a-i0009:8:1:10:1216:1225:newsag...,LCG-1849-06-23-a-i0009,LCG,1849-06-23,1849,1840,Havas,fr,CH
2331384,org.ent.pressagency.unk,". Z,",NIL,2,6,7,892,896,luxwort-1849-06-29-a-i0006:7:2:6:892:896:newsa...,luxwort-1849-06-29-a-i0006,luxwort,1849-06-29,1849,1840,unk,fr,LU
950058,org.ent.pressagency.Extel,Esl,Q1525848,1,4,13,2245,2248,LNF-1841-07-27-a-i0003:13:1:4:2245:2248:newsag...,LNF-1841-07-27-a-i0003,LNF,1841-07-27,1841,1840,Extel,fr,CH
2174786,org.ent.pressagency.Havas,Estafette,Q2826560,1,10,1,112,121,LCG-1849-05-15-a-i0009:1:1:10:112:121:newsag:b...,LCG-1849-05-15-a-i0009,LCG,1849-05-15,1849,1840,Havas,fr,CH
2603746,org.ent.pressagency.Havas,Univers,Q2826560,2,9,53,8053,8060,GAZ-1847-10-09-a-i0014:53:2:9:8053:8060:newsag...,GAZ-1847-10-09-a-i0014,GAZ,1847-10-09,1847,1840,Havas,fr,CH


In [7]:
#German
de1840 = sample_per_dec(mentions_de, 1840)
de1840

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
2120716,org.ent.pressagency.ag,ag,NIL,63,65,47,8208,8210,NZZ-1849-10-29-a-i0003:47:63:65:8208:8210:news...,NZZ-1849-10-29-a-i0003,NZZ,1849-10-29,1849,1840,ag,de,CH
2204328,org.ent.pressagency.ATS-SDA,sw,Q430109,0,2,8,1746,1748,NZZ-1849-04-26-a-i0009:8:0:2:1746:1748:newsag:...,NZZ-1849-04-26-a-i0009,NZZ,1849-04-26,1849,1840,ATS-SDA,de,CH
1649829,org.ent.pressagency.SPK-SMP,spi,Q2256560,16,19,80,7087,7090,NZZ-1841-01-08-a-i0004:80:16:19:7087:7090:news...,NZZ-1841-01-08-a-i0004,NZZ,1841-01-08,1841,1840,SPK-SMP,de,CH
311399,org.ent.pressagency.Reuters,Reue,Q130879,0,4,9,137,141,NZZ-1844-05-06-a-i0001:9:0:4:137:141:newsag:be...,NZZ-1844-05-06-a-i0001,NZZ,1844-05-06,1844,1840,Reuters,de,CH
112464,org.ent.pressagency.unk,Korrespondenz,NIL,2,15,88,7120,7133,SGZ-1840-09-01-a-i0008:88:2:15:7120:7133:newsa...,SGZ-1840-09-01-a-i0008,SGZ,1840-09-01,1840,1840,unk,de,CH


In [8]:
#manually check articles
fr1840['ground_truth'] = fr1840['article'].map({
                "EDA-1841-08-15-a-i0008": "O", 
                "LCG-1849-07-07-a-i0007": "pers.ind.articleauthor"
                })
de1840['ground_truth'] = "todo"

#append to general DataFrame
checked = pd.concat([checked, fr1840, de1840])
checked.tail(7)

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country,ground_truth
2174786,org.ent.pressagency.Havas,Estafette,Q2826560,1,10,1,112,121,LCG-1849-05-15-a-i0009:1:1:10:112:121:newsag:b...,LCG-1849-05-15-a-i0009,LCG,1849-05-15,1849,1840,Havas,fr,CH,
2603746,org.ent.pressagency.Havas,Univers,Q2826560,2,9,53,8053,8060,GAZ-1847-10-09-a-i0014:53:2:9:8053:8060:newsag...,GAZ-1847-10-09-a-i0014,GAZ,1847-10-09,1847,1840,Havas,fr,CH,
2120716,org.ent.pressagency.ag,ag,NIL,63,65,47,8208,8210,NZZ-1849-10-29-a-i0003:47:63:65:8208:8210:news...,NZZ-1849-10-29-a-i0003,NZZ,1849-10-29,1849,1840,ag,de,CH,todo
2204328,org.ent.pressagency.ATS-SDA,sw,Q430109,0,2,8,1746,1748,NZZ-1849-04-26-a-i0009:8:0:2:1746:1748:newsag:...,NZZ-1849-04-26-a-i0009,NZZ,1849-04-26,1849,1840,ATS-SDA,de,CH,todo
1649829,org.ent.pressagency.SPK-SMP,spi,Q2256560,16,19,80,7087,7090,NZZ-1841-01-08-a-i0004:80:16:19:7087:7090:news...,NZZ-1841-01-08-a-i0004,NZZ,1841-01-08,1841,1840,SPK-SMP,de,CH,todo
311399,org.ent.pressagency.Reuters,Reue,Q130879,0,4,9,137,141,NZZ-1844-05-06-a-i0001:9:0:4:137:141:newsag:be...,NZZ-1844-05-06-a-i0001,NZZ,1844-05-06,1844,1840,Reuters,de,CH,todo
112464,org.ent.pressagency.unk,Korrespondenz,NIL,2,15,88,7120,7133,SGZ-1840-09-01-a-i0008:88:2:15:7120:7133:newsa...,SGZ-1840-09-01-a-i0008,SGZ,1840-09-01,1840,1840,unk,de,CH,todo


## Check use of "unk" token

In [9]:
df_unk = all_mentions[all_mentions['agency'] == "unk"]

In [18]:
print(f"#articles classified as 'unk': {len(df_unk)} ({round(len(df_unk)/len(all_mentions)*100)}% of all classified agency mentions)")

#articles classified as 'unk': 98194 (4% of all classified agency mentions)


In [19]:
df_unk['surface'].value_counts()[:50]

. P.             7005
Fournier         5094
Bureau           4317
FN               4108
D. N. B.         3902
. T. S           2001
Telunion         1844
Korrespondenz    1639
. N. B.          1635
Cosmo            1615
C. P.            1608
az               1591
Agence           1517
Kp                970
KP                929
Belga             928
S.                890
. S.              816
APRIL             799
SR                784
Agency            755
havas             693
. T. S.           667
. T.              650
Amtlich           648
Uniteb            610
Agenzia           583
GB                519
ry                517
. B.              503
SBolff            489
Sp                488
S. Sp.            484
TASS              476
. P. S.           445
CND               432
S. P.             408
B. C. V.          375
Spx               371
sr                363
C. P. S.          355
ADN               352
ugu               350
P. T. S.          328
AWP               323
T. P.     

In [20]:
df_unk[df_unk['surface']=="FN"].head()

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
252,org.ent.pressagency.unk,FN,NIL,2,4,2,324,326,FZG-1981-11-25-a-i0028:2:2:4:324:326:newsag:be...,FZG-1981-11-25-a-i0028,FZG,1981-11-25,1981,1980,unk,de,CH
2718,org.ent.pressagency.unk,FN,NIL,2,4,1,125,127,FZG-1992-06-19-a-i0169:1:2:4:125:127:newsag:be...,FZG-1992-06-19-a-i0169,FZG,1992-06-19,1992,1990,unk,de,CH
2779,org.ent.pressagency.unk,FN,NIL,2,4,10,1065,1067,FZG-1991-08-05-a-i0056:10:2:4:1065:1067:newsag...,FZG-1991-08-05-a-i0056,FZG,1991-08-05,1991,1990,unk,de,CH
3164,org.ent.pressagency.unk,FN,NIL,2,4,31,5968,5970,FZG-1986-04-21-a-i0094:31:2:4:5968:5970:newsag...,FZG-1986-04-21-a-i0094,FZG,1986-04-21,1986,1980,unk,de,CH
3320,org.ent.pressagency.unk,FN,NIL,2,4,15,1060,1062,FZG-1988-07-21-a-i0092:15:2:4:1060:1062:newsag...,FZG-1988-07-21-a-i0092,FZG,1988-07-21,1988,1980,unk,de,CH


In [21]:
df_unk.loc[df_unk['surface']=="FN", 'newspaper'].unique()

array(['FZG'], dtype=object)

-> abbreviation for "Freiburger Nachrichten"

In [22]:
df_unk[df_unk['surface']==". P."].head()

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
1240,org.ent.pressagency.unk,. P.,NIL,65,69,0,65,69,IMP-1960-05-18-a-i0043:0:65:69:65:69:newsag:be...,IMP-1960-05-18-a-i0043,IMP,1960-05-18,1960,1960,unk,fr,CH
2015,org.ent.pressagency.unk,. P.,NIL,70,74,0,70,74,JDG-1957-08-19-a-i0076:0:70:74:70:74:newsag:be...,JDG-1957-08-19-a-i0076,JDG,1957-08-19,1957,1950,unk,fr,CH
2813,org.ent.pressagency.unk,. P.,NIL,-1,3,44,8784,8788,LLE-1974-11-21-a-i0150:44:-1:3:8784:8788:newsa...,LLE-1974-11-21-a-i0150,LLE,1974-11-21,1974,1970,unk,fr,CH
4503,org.ent.pressagency.unk,. P.,NIL,4,8,10,886,890,GDL-1942-04-21-a-i0054:10:4:8:886:890:newsag:b...,GDL-1942-04-21-a-i0054,GDL,1942-04-21,1942,1940,unk,fr,CH
4737,org.ent.pressagency.unk,. P.,NIL,4,8,7,393,397,JDG-1952-08-08-a-i0108:7:4:8:393:397:newsag:be...,JDG-1952-08-08-a-i0108,JDG,1952-08-08,1952,1950,unk,fr,CH


-> part of (C.P.), correspondance particulière (?) - appears with local news

## Check ratio of noisy mentions

In [23]:
print(f"#unique tokens (surface) classified as agency: {len(all_mentions['surface'].unique())}")
print(f"Most common ones:\n{all_mentions['surface'].value_counts()[:50]}")

#unique tokens (surface) classified as agency: 24621
Most common ones:
AFP             371179
ATS             321580
Reuter          210040
ats             197044
ag.             156559
Havas           147285
sda             107755
afp              88501
AP               86331
ap               58151
United Press     39893
A. F. P.         35686
UPI              32464
Wolff            27052
Afp              26424
ATP              24571
ag               24548
Kipa             21571
reuter           20837
dpa              19047
DPA              18048
Exchange         16279
DNB              15765
spk              15084
United Preß      14115
Ats              13375
Ap               10779
ag)               8516
. P.              7275
Preß              7061
United            6866
UP                6132
FN                5242
Fournier          5129
Agence            5081
U. P.             4810
A. T. S.          4684
Bureau            4317
si                4247
Ag                4129
upi      

In [28]:
all_mentions[all_mentions['surface']=="ATP"].head()

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
90,org.ent.pressagency.AFP,ATP,Q40464,56,59,6,1042,1045,JDG-1989-10-03-a-i0168:6:56:59:1042:1045:newsa...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
91,org.ent.pressagency.AFP,ATP,Q40464,77,80,13,2090,2093,JDG-1989-10-03-a-i0168:13:77:80:2090:2093:news...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
92,org.ent.pressagency.AFP,ATP,Q40464,77,80,13,2090,2093,JDG-1989-10-03-a-i0168:13:77:80:2090:2093:news...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
93,org.ent.pressagency.AFP,ATP,Q40464,130,133,27,3666,3669,JDG-1989-10-03-a-i0168:27:130:133:3666:3669:ne...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
141,org.ent.pressagency.AFP,ATP,Q40464,22,25,1,164,167,LLE-1996-06-29-a-i0310:1:22:25:164:167:newsag:...,LLE-1996-06-29-a-i0310,LLE,1996-06-29,1996,1990,AFP,fr,CH


### Rough ratio of noisy mentions

In [39]:
#check noisy mentions manually
all_mentions['surface'].value_counts()[:50].index

Index(['AFP', 'ATS', 'Reuter', 'ats', 'ag.', 'Havas', 'sda', 'afp', 'AP', 'ap',
       'United Press', 'A. F. P.', 'UPI', 'Wolff', 'Afp', 'ATP', 'ag', 'Kipa',
       'reuter', 'dpa', 'DPA', 'Exchange', 'DNB', 'spk', 'United Preß', 'Ats',
       'Ap', 'ag)', '. P.', 'Preß', 'United', 'UP', 'FN', 'Fournier', 'Agence',
       'U. P.', 'A. T. S.', 'Bureau', 'si', 'Ag', 'upi', 'D. N. B.', 'SDA',
       'reuters', 'AfS', 'Haoas', 'Agentur', 'ddp', 'sp', 'Ilavas'],
      dtype='object')

In [31]:
not_noisy_surface = ['AFP', 'ATS', 'Reuter', 'ats', 'ag.', 'Havas', 'sda', 'afp', 'AP', 'ap',
       'United Press', 'A. F. P.', 'UPI', 'Wolff', 'Afp', 'ag', 'Kipa',
       'reuter', 'dpa', 'DPA', 'Exchange', 'DNB', 'spk', 'United Preß', 'Ats',
       'Ap', 'ag)', 'Preß', 'United', 'UP', 'FN', 'Fournier', 'Agence',
       'U. P.', 'A. T. S.', 'Bureau', 'si', 'Ag', 'upi', 'D. N. B.', 'SDA',
       'reuters', 'AfS', 'Agentur', 'ddp', 'sp', 
 'Stefani',
 'Tass',
 'Europapreß',
 'ag(',
 'Sp',
 'télégraphique',
 'up',
 'az',
 'Agence télégraphique suisse',
 'suivre',
 'Telunion',
 'at',
 'Korrespondenz',
 'DP',
 'Cosmo',
 'C. P.',
 'Belga',
 'télégraphique suisse',
 'Press',
 'svp',
 'havas',
 'APA',
 'A. F. P',
 'Ansa',
 'Ag.',
 'Taß',
 'Agency',
 'Ofi',
 'REUTER',
 'A. T. S',
 'Amtlich',
 'Interinf',
 'ADN',
 'Agenzia',
 'Extel',
 'D. P. A.',
 'TASS',
 'Nouvelle',
 'Associated',
 'Telepress',
 'apa',
 'Keystone',
 'ANSA',
 'KIPA',
 'P. T. S.',
 'Dpa',
 'Agence télégraphique',
 ]

In [40]:
print(f"Most frequent noisy/unknown mentions:\n{[surface for surface in all_mentions['surface'].value_counts()[:200].index if surface not in not_noisy_surface]},\nFrequency of last checked item: {all_mentions['surface'].value_counts()[200]}")

Most frequent noisy/unknown mentions:
['ATP', '. P.', 'Haoas', 'Ilavas', 'Hauas', '. T. S', 'DNV', 'llavas', 'ag,', 'apx', '. N. B.', 'agr.', '. P', 'fdp', 'GB', 'resp', 'Reuler', 'Resp', 'af', 'Rey', 'dds', 'Kp', 'sdi', 'ju', 'KP', 'Beuter', 'APS', 'AT', 'S.', 'pk', 'APRIL', '. S.', 'pw', 'Bavas', 'AFS', 'Europapreh', '. T.', 'SR', 'A. S.', 'APP', 'Saas', 'A. T.', 'bp', 'Hanas', '. T. S.', 'Hava', 'TS', 'ASL', 'Uniteb', 'sr', 'DRB', 'A TS', 'AF', '. B.', 'ry', 'Renier', 'U. P', 'Rex', 'ATE', 'spg', 'span', 'SBolff', 'S. Sp.', 'APEI', 'CND', 'sw', 'At', '. P. S.', 'APIC', 'United Preh', 'DR', 'APG', 'S. P.', 'APF', 'Haras', 'tp', 'stv', 'Spx', 'KPI', 'pp', 'API', 'A. B.', 'DNN', 'DK', 'B. C. V.', 'SNB', 'Uniled Press', 'ATÇ', 'AWP', 'A.', 'C. P. S.', 'IDNB', 'ugu', 'AJt', 'Europapretz', 'Reu', 'APD', 'A. R.', 'A. T', 'T. P.', 'DIE', 'P.', 'Ha', 'Bp', 'Af', 'alp', 'vas'],
Frequency of last checked item: 296


In [47]:
df_noisy_surface = all_mentions[~all_mentions['surface'].isin(not_noisy_surface)]

print(f"Rough ratio of noisy mentions: {round(len(df_noisy_surface) / len(all_mentions) *100, 2)}%")
print(f"Rough ratio of noisy mentions, unk excluded: {round(len(df_noisy_surface[df_noisy_surface['agency'] != 'unk']) / len(all_mentions) *100, 2)}%")
print(f"Rough ratio of noisy mentions, FR: {round(len(df_noisy_surface[df_noisy_surface['language'] != 'fr']) / len(mentions_fr) *100, 2)}%")
print(f"Rough ratio of noisy mentions, DE: {round(len(df_noisy_surface[df_noisy_surface['language'] != 'de']) / len(mentions_de) *100, 2)}%")

Rough ratio of noisy mentions: 9.34%
Rough ratio of noisy mentions, unk excluded: 6.77%
Rough ratio of noisy mentions, FR: 7.23%
Rough ratio of noisy mentions, DE: 13.05%


-> more or less the same as in annotated set, German a bit more