# Quality Assessment

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

## Import data

In [2]:
input_path = "data/"

decades = list(range(1840, 2000, 10))

In [3]:
mentions_fr = pd.read_pickle(input_path + "mentions_fr.pkl")
mentions_de = pd.read_pickle(input_path + 'mentions_de.pkl')
all_mentions = pd.concat([mentions_fr, mentions_de])

## Sample and Check articles per decade

In [4]:
def sample_per_dec(df, decade, n=5):
    df_dec = df[df['decade']==decade]

    if len(df_dec) < n:
        return df_dec
    else:
        return df_dec.sample(n, random_state=2023)

In [5]:
checked = pd.DataFrame()

### 1840

In [6]:
#French
fr1840 = sample_per_dec(mentions_fr, 1840)
fr1840

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
4204937,org.ent.pressagency.Havas,Nouv,Q2826560,1,5,50,4427,4431,LNF-1847-09-03-a-i0003:50:1:5:4427:4431:newsag...,LNF-1847-09-03-a-i0003,LNF,1847-09-03,1847,1840,Havas,fr,CH
3879776,org.ent.pressagency.Havas,Id,Q2826560,1,3,21,2243,2245,LNF-1847-09-28-a-i0001:21:1:3:2243:2245:newsag...,LNF-1847-09-28-a-i0001,LNF,1847-09-28,1847,1840,Havas,fr,CH
3819985,org.ent.pressagency.Havas,Hclvétie,Q2826560,1,9,38,5797,5805,LCG-1849-09-22-a-i0003:38:1:9:5797:5805:newsag...,LCG-1849-09-22-a-i0003,LCG,1849-09-22,1849,1840,Havas,fr,CH
3571526,org.ent.pressagency.unk,. S.,NIL,1,5,22,3407,3411,GDL-1847-06-29-a-i0007:22:1:5:3407:3411:newsag...,GDL-1847-06-29-a-i0007,GDL,1847-06-29,1847,1840,unk,fr,CH
3702482,org.ent.pressagency.unk,.- E.,NIL,-1,4,11,897,902,LSR-1847-05-08-a-i0009:11:-1:4:897:902:newsag:...,LSR-1847-05-08-a-i0009,LSR,1847-05-08,1847,1840,unk,fr,CH


In [7]:
#German
de1840 = sample_per_dec(mentions_de, 1840)
de1840

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
556557,org.ent.pressagency.Havas,Desaaas,Q2826560,0,7,53,2441,2448,WHD-1845-02-07-a-i0006:53:0:7:2441:2448:newsag...,WHD-1845-02-07-a-i0006,WHD,1845-02-07,1845,1840,Havas,de,CH
3322985,org.ent.pressagency.ag,ag,NIL,1,3,4,98,100,SGZ-1840-04-22-a-i0009:4:1:3:98:100:newsag:ber...,SGZ-1840-04-22-a-i0009,SGZ,1840-04-22,1840,1840,ag,de,CH
3434689,org.ent.pressagency.SPK-SMP,spot,Q2256560,17,21,32,3005,3009,EZR-1849-03-23-a-i0004:32:17:21:3005:3009:news...,EZR-1849-03-23-a-i0004,EZR,1849-03-23,1849,1840,SPK-SMP,de,CH
106213,org.ent.pressagency.Reuters,Reyscher,Q130879,0,8,8,3621,3629,NZZ-1849-05-20-a-i0003:8:0:8:3621:3629:newsag:...,NZZ-1849-05-20-a-i0003,NZZ,1849-05-20,1849,1840,Reuters,de,CH
310283,org.ent.pressagency.Reuters,Reuß,Q130879,37,41,2,68,72,MGS-1843-03-22-a-i0003:2:37:41:68:72:newsag:be...,MGS-1843-03-22-a-i0003,MGS,1843-03-22,1843,1840,Reuters,de,CH


In [8]:
#manually check articles
fr1840['ground_truth'] = fr1840['article'].map({
                "EDA-1841-08-15-a-i0008": "O", 
                "LCG-1849-07-07-a-i0007": "pers.ind.articleauthor"
                })
de1840['ground_truth'] = "todo"

#append to general DataFrame
checked = pd.concat([checked, fr1840, de1840])
checked.tail(7)

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country,ground_truth
2174786,org.ent.pressagency.Havas,Estafette,Q2826560,1,10,1,112,121,LCG-1849-05-15-a-i0009:1:1:10:112:121:newsag:b...,LCG-1849-05-15-a-i0009,LCG,1849-05-15,1849,1840,Havas,fr,CH,
2603746,org.ent.pressagency.Havas,Univers,Q2826560,2,9,53,8053,8060,GAZ-1847-10-09-a-i0014:53:2:9:8053:8060:newsag...,GAZ-1847-10-09-a-i0014,GAZ,1847-10-09,1847,1840,Havas,fr,CH,
2120716,org.ent.pressagency.ag,ag,NIL,63,65,47,8208,8210,NZZ-1849-10-29-a-i0003:47:63:65:8208:8210:news...,NZZ-1849-10-29-a-i0003,NZZ,1849-10-29,1849,1840,ag,de,CH,todo
2204328,org.ent.pressagency.ATS-SDA,sw,Q430109,0,2,8,1746,1748,NZZ-1849-04-26-a-i0009:8:0:2:1746:1748:newsag:...,NZZ-1849-04-26-a-i0009,NZZ,1849-04-26,1849,1840,ATS-SDA,de,CH,todo
1649829,org.ent.pressagency.SPK-SMP,spi,Q2256560,16,19,80,7087,7090,NZZ-1841-01-08-a-i0004:80:16:19:7087:7090:news...,NZZ-1841-01-08-a-i0004,NZZ,1841-01-08,1841,1840,SPK-SMP,de,CH,todo
311399,org.ent.pressagency.Reuters,Reue,Q130879,0,4,9,137,141,NZZ-1844-05-06-a-i0001:9:0:4:137:141:newsag:be...,NZZ-1844-05-06-a-i0001,NZZ,1844-05-06,1844,1840,Reuters,de,CH,todo
112464,org.ent.pressagency.unk,Korrespondenz,NIL,2,15,88,7120,7133,SGZ-1840-09-01-a-i0008:88:2:15:7120:7133:newsa...,SGZ-1840-09-01-a-i0008,SGZ,1840-09-01,1840,1840,unk,de,CH,todo


## Check use of "unk" token

In [8]:
df_unk = all_mentions[all_mentions['agency'] == "unk"]

In [9]:
print(f"#articles classified as 'unk': {len(df_unk)} ({round(len(df_unk)/len(all_mentions)*100)}% of all classified agency mentions)")

#articles classified as 'unk': 177922 (4% of all classified agency mentions)


In [10]:
df_unk['surface'].value_counts()[:50]

. P.             12656
Fournier          9027
Bureau            7708
FN                7316
D. N. B.          7163
. T. S            3478
Telunion          3332
Korrespondenz     3053
. N. B.           2981
Cosmo             2927
C. P.             2871
az                2869
Agence            2779
Kp                1737
KP                1660
Belga             1659
S.                1642
. S.              1474
SR                1434
APRIL             1387
Agency            1384
havas             1338
. T. S.           1259
Amtlich           1198
. T.              1173
Uniteb            1120
Agenzia           1054
. B.               951
GB                 920
ry                 902
S. Sp.             881
SBolff             849
Sp                 844
TASS               812
. P. S.            787
B. C. V.           747
S. P.              714
CND                688
sr                 687
Spx                684
ugu                648
ADN                645
P. T. S.           616
KIPA       

In [11]:
df_unk[df_unk['surface']=="FN"].head()

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
252,org.ent.pressagency.unk,FN,NIL,2,4,2,324,326,FZG-1981-11-25-a-i0028:2:2:4:324:326:newsag:be...,FZG-1981-11-25-a-i0028,FZG,1981-11-25,1981,1980,unk,de,CH
2718,org.ent.pressagency.unk,FN,NIL,2,4,1,125,127,FZG-1992-06-19-a-i0169:1:2:4:125:127:newsag:be...,FZG-1992-06-19-a-i0169,FZG,1992-06-19,1992,1990,unk,de,CH
2779,org.ent.pressagency.unk,FN,NIL,2,4,10,1065,1067,FZG-1991-08-05-a-i0056:10:2:4:1065:1067:newsag...,FZG-1991-08-05-a-i0056,FZG,1991-08-05,1991,1990,unk,de,CH
3164,org.ent.pressagency.unk,FN,NIL,2,4,31,5968,5970,FZG-1986-04-21-a-i0094:31:2:4:5968:5970:newsag...,FZG-1986-04-21-a-i0094,FZG,1986-04-21,1986,1980,unk,de,CH
3320,org.ent.pressagency.unk,FN,NIL,2,4,15,1060,1062,FZG-1988-07-21-a-i0092:15:2:4:1060:1062:newsag...,FZG-1988-07-21-a-i0092,FZG,1988-07-21,1988,1980,unk,de,CH


In [12]:
df_unk.loc[df_unk['surface']=="FN", 'newspaper'].unique()

array(['FZG'], dtype=object)

-> abbreviation for "Freiburger Nachrichten"

In [13]:
df_unk[df_unk['surface']==". P."].head()

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
1240,org.ent.pressagency.unk,. P.,NIL,65,69,0,65,69,IMP-1960-05-18-a-i0043:0:65:69:65:69:newsag:be...,IMP-1960-05-18-a-i0043,IMP,1960-05-18,1960,1960,unk,fr,CH
2015,org.ent.pressagency.unk,. P.,NIL,70,74,0,70,74,JDG-1957-08-19-a-i0076:0:70:74:70:74:newsag:be...,JDG-1957-08-19-a-i0076,JDG,1957-08-19,1957,1950,unk,fr,CH
2813,org.ent.pressagency.unk,. P.,NIL,-1,3,44,8784,8788,LLE-1974-11-21-a-i0150:44:-1:3:8784:8788:newsa...,LLE-1974-11-21-a-i0150,LLE,1974-11-21,1974,1970,unk,fr,CH
4503,org.ent.pressagency.unk,. P.,NIL,4,8,10,886,890,GDL-1942-04-21-a-i0054:10:4:8:886:890:newsag:b...,GDL-1942-04-21-a-i0054,GDL,1942-04-21,1942,1940,unk,fr,CH
4737,org.ent.pressagency.unk,. P.,NIL,4,8,7,393,397,JDG-1952-08-08-a-i0108:7:4:8:393:397:newsag:be...,JDG-1952-08-08-a-i0108,JDG,1952-08-08,1952,1950,unk,fr,CH


-> part of (C.P.), correspondance particulière (?) - appears with local news

## Check ratio of noisy mentions
## TODO: only take clear agency names and do stats without "unk"

In [14]:
print(f"#unique tokens (surface) classified as agency: {len(all_mentions['surface'].unique())}")
print(f"Most common ones:\n{all_mentions['surface'].value_counts()[:50]}")

#unique tokens (surface) classified as agency: 36496
Most common ones:
AFP             670833
ATS             581200
Reuter          379627
ats             356665
ag.             283871
Havas           267151
sda             196004
afp             159931
AP              155882
ap              105499
United Press     73433
A. F. P.         64703
UPI              58647
Wolff            49105
Afp              47740
ag               44696
ATP              44290
Kipa             39263
reuter           37811
dpa              34564
DPA              32148
Exchange         29572
DNB              28448
spk              27295
United Preß      25449
Ats              24239
Ap               19434
ag)              15306
. P.             13157
Preß             12908
United           12489
UP               11039
FN                9389
Agence            9173
Fournier          9089
U. P.             8905
A. T. S.          8465
Bureau            7708
Ag                7607
si                7470
upi      

In [15]:
all_mentions[all_mentions['surface']=="ATP"].head()

Unnamed: 0,entity,surface,qid,lSentenceOffset,rSentenceOffset,sentence_idx:,lArticleOffset,rArticleOffset,id,article,newspaper,date,year,decade,agency,language,country
90,org.ent.pressagency.AFP,ATP,Q40464,56,59,6,1042,1045,JDG-1989-10-03-a-i0168:6:56:59:1042:1045:newsa...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
91,org.ent.pressagency.AFP,ATP,Q40464,77,80,13,2090,2093,JDG-1989-10-03-a-i0168:13:77:80:2090:2093:news...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
92,org.ent.pressagency.AFP,ATP,Q40464,77,80,13,2090,2093,JDG-1989-10-03-a-i0168:13:77:80:2090:2093:news...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
93,org.ent.pressagency.AFP,ATP,Q40464,130,133,27,3666,3669,JDG-1989-10-03-a-i0168:27:130:133:3666:3669:ne...,JDG-1989-10-03-a-i0168,JDG,1989-10-03,1989,1980,AFP,fr,CH
141,org.ent.pressagency.AFP,ATP,Q40464,22,25,1,164,167,LLE-1996-06-29-a-i0310:1:22:25:164:167:newsag:...,LLE-1996-06-29-a-i0310,LLE,1996-06-29,1996,1990,AFP,fr,CH


### Rough ratio of noisy mentions

In [16]:
#check noisy mentions manually
all_mentions['surface'].value_counts()[:50].index

Index(['AFP', 'ATS', 'Reuter', 'ats', 'ag.', 'Havas', 'sda', 'afp', 'AP', 'ap',
       'United Press', 'A. F. P.', 'UPI', 'Wolff', 'Afp', 'ag', 'ATP', 'Kipa',
       'reuter', 'dpa', 'DPA', 'Exchange', 'DNB', 'spk', 'United Preß', 'Ats',
       'Ap', 'ag)', '. P.', 'Preß', 'United', 'UP', 'FN', 'Agence', 'Fournier',
       'U. P.', 'A. T. S.', 'Bureau', 'Ag', 'si', 'upi', 'D. N. B.', 'SDA',
       'reuters', 'Haoas', 'AfS', 'Agentur', 'ddp', 'sp', 'Stefani'],
      dtype='object')

In [17]:
not_noisy_surface = ['AFP', 'ATS', 'Reuter', 'ats', 'ag.', 'Havas', 'sda', 'afp', 'AP', 'ap',
       'United Press', 'A. F. P.', 'UPI', 'Wolff', 'Afp', 'ag', 'Kipa',
       'reuter', 'dpa', 'DPA', 'Exchange', 'DNB', 'spk', 'United Preß', 'Ats',
       'Ap', 'ag)', 'Preß', 'United', 'UP', 'FN', 'Fournier', 'Agence',
       'U. P.', 'A. T. S.', 'Bureau', 'si', 'Ag', 'upi', 'D. N. B.', 'SDA',
       'reuters', 'AfS', 'Agentur', 'ddp', 'sp', 
 'Stefani',
 'Tass',
 'Europapreß',
 'ag(',
 'Sp',
 'télégraphique',
 'up',
 'az',
 'Agence télégraphique suisse',
 'suivre',
 'Telunion',
 'at',
 'Korrespondenz',
 'DP',
 'Cosmo',
 'C. P.',
 'Belga',
 'télégraphique suisse',
 'Press',
 'svp',
 'havas',
 'APA',
 'A. F. P',
 'Ansa',
 'Ag.',
 'Taß',
 'Agency',
 'Ofi',
 'REUTER',
 'A. T. S',
 'Amtlich',
 'Interinf',
 'ADN',
 'Agenzia',
 'Extel',
 'D. P. A.',
 'TASS',
 'Nouvelle',
 'Associated',
 'Telepress',
 'apa',
 'Keystone',
 'ANSA',
 'KIPA',
 'P. T. S.',
 'Dpa',
 'Agence télégraphique',
 ]

In [40]:
print(f"Most frequent noisy/unknown mentions:\n{[surface for surface in all_mentions['surface'].value_counts()[:200].index if surface not in not_noisy_surface]},\nFrequency of last checked item: {all_mentions['surface'].value_counts()[200]}")

Most frequent noisy/unknown mentions:
['ATP', '. P.', 'Haoas', 'Ilavas', 'Hauas', '. T. S', 'DNV', 'llavas', 'ag,', 'apx', '. N. B.', 'agr.', '. P', 'fdp', 'GB', 'resp', 'Reuler', 'Resp', 'af', 'Rey', 'dds', 'Kp', 'sdi', 'ju', 'KP', 'Beuter', 'APS', 'AT', 'S.', 'pk', 'APRIL', '. S.', 'pw', 'Bavas', 'AFS', 'Europapreh', '. T.', 'SR', 'A. S.', 'APP', 'Saas', 'A. T.', 'bp', 'Hanas', '. T. S.', 'Hava', 'TS', 'ASL', 'Uniteb', 'sr', 'DRB', 'A TS', 'AF', '. B.', 'ry', 'Renier', 'U. P', 'Rex', 'ATE', 'spg', 'span', 'SBolff', 'S. Sp.', 'APEI', 'CND', 'sw', 'At', '. P. S.', 'APIC', 'United Preh', 'DR', 'APG', 'S. P.', 'APF', 'Haras', 'tp', 'stv', 'Spx', 'KPI', 'pp', 'API', 'A. B.', 'DNN', 'DK', 'B. C. V.', 'SNB', 'Uniled Press', 'ATÇ', 'AWP', 'A.', 'C. P. S.', 'IDNB', 'ugu', 'AJt', 'Europapretz', 'Reu', 'APD', 'A. R.', 'A. T', 'T. P.', 'DIE', 'P.', 'Ha', 'Bp', 'Af', 'alp', 'vas'],
Frequency of last checked item: 296


In [47]:
df_noisy_surface = all_mentions[~all_mentions['surface'].isin(not_noisy_surface)]

print(f"Rough ratio of noisy mentions: {round(len(df_noisy_surface) / len(all_mentions) *100, 2)}%")
print(f"Rough ratio of noisy mentions, unk excluded: {round(len(df_noisy_surface[df_noisy_surface['agency'] != 'unk']) / len(all_mentions) *100, 2)}%")
print(f"Rough ratio of noisy mentions, FR: {round(len(df_noisy_surface[df_noisy_surface['language'] != 'fr']) / len(mentions_fr) *100, 2)}%")
print(f"Rough ratio of noisy mentions, DE: {round(len(df_noisy_surface[df_noisy_surface['language'] != 'de']) / len(mentions_de) *100, 2)}%")

Rough ratio of noisy mentions: 9.34%
Rough ratio of noisy mentions, unk excluded: 6.77%
Rough ratio of noisy mentions, FR: 7.23%
Rough ratio of noisy mentions, DE: 13.05%


-> more or less the same as in annotated set, German a bit more