# Postprocessing and Annotation Correction

In [5]:
import pandas as pd
import re
from typing import List, Dict, Optional
import os
from os import listdir
from os.path import join, isfile
from pathlib import Path

from hipe_commons.own_helpers import tsv_to_dataframe, modify_NIL_fine_by_dict, get_newsag_df, \
get_dataframes_with_newsag_name, get_full_mentions_and_position

In [2]:
docs_in_de_dir = "../data/annotated_retok_autosegment/de/tsv/"
docs_in_fr_dir = "../data/annotated_retok_autosegment/fr/tsv/"

docs_out_de_dir = "../data/annotated_retok_autosegment/de/tsv-corrected/"
docs_out_fr_dir = "../data/annotated_retok_autosegment/fr/tsv-corrected/"

docs_de = [f for f in listdir(docs_in_de_dir) if isfile(join(docs_in_de_dir, f))]
docs_fr = [f for f in listdir(docs_in_fr_dir) if isfile(join(docs_in_fr_dir, f))]

## Check for missed agency mentions

In [34]:
searchwords_str = """
containing afp or afpj or afpl or fafp or iafp or agence france presse
containing anp or algemeen nederland; not containing nationale - tagged as article
containing ansa or agenzia nationale 
containing apa or austria press
containing ap or associated press or vassociated or associated or yassociated or lassociated or dassociated or ass. press or assoc. press 
containing ats or atsj or atsl or atsi or fatsï or fatsl or atsf or agence telegraphique suisse or schweizerische depeschenagentur or sda
containing belga or beiga
containing bta or agence bulgare AND bulgare or bulgarisch 
containing ctk or ceteka 
containing ddp or dapd or deutscher depeschendienst
containing nachrichtenbüro or nachrichtenbureau or richtenbüro or richtenbureau or dnb or ldnb or ldnv or dnib or idrb or d0lb or dnv0 or idnb or d0lv 
containing domei or domci 
containing dpa or deutsche presse agentur
containing europapreß or europapreh or europapretz or europaprefz or leuropapreß or eurovapreß or europapieß or eurovapreh or seuropapreß or leuropapreh or europapieh or europaprch or leuropapretz or euiopapreß or guropapreß 
containing extel or exlel 
containing havas or bavas or hayas or havae or ilavas or llavas or flavas or haivas or uavas or havais or havasj or huvas or tlavas or jlavas or fhavas or lavas or havat or havaa or tiavas or haoas or hawas or haveis or liavas or havasl or hivas or hauas or iiavas or llaoas or heivas or havai or havasi or lfavas or mavas or hava or havasagentur or havasmeldung or haoasagentur or havasnote  
containing interfax
containing afpreuters or afpreuter or atsafp or atsreuters or atsreuter or atsjafp or atsap or aplddp or aplafp or afplap or dpalafp or atsjreuter or atsfafp or ddplap or aplsda or aplddp or sdalafp or atsjréd or atsréd
containing pap AND polonais or polnisch 
containing reutersche or reuterfche or neutersche or reuterschen or reuter'sche or neuterschen or reuterfchen or reuterbureau or reutermeldung or reuterbüro or reuter or reuters or reuterï or reutei or rcuter or rcutcr or reulers or reuteragentur or reutcr or reuterbüro or reutermeldung
containing spk or schweizerische politische korrespondenz or schweizer mittelpresse or smp 
containing stefani or steîani or stelani or stcfani or slefani or stefanl 
containing tanjug or tanyoug or tanjoug 
containing tass or tafz or telegraphen-agentur or itar
containing telunion or ltelunion or stelunion or telunwn or telegraphen union 
containing tt AND suedois or suedoise or schwedisch or schwedische 
containing upi or united press or united prefz 
containing wolff or lwolff or lwolsf or lwolss or iwolff or wofff or wolffsbüro or wolffbüro or wolffbüros or wolffbureau or wolffbureaus or wolff'sche or wolffmeldung]
"""

searchwords = [word.strip() for word in re.split("\ncontaining|\sor\s", searchwords_str)]
#first entry is emmpty string
searchwords = searchwords[1:]
searchwords.remove("bulgarisch")
searchwords.remove("schwedisch")
searchwords.remove("schwedische")
searchwords.remove("suedoise")


entries_to_replace = {
    'algemeen nederland; not containing nationale - tagged as article': "algemeen nederland",
    'agence bulgare AND bulgare': "agence bulgare",
    'pap AND polonais': "pap",
    'tt AND suedois': "tt",
    'wolffmeldung]': "wolffmeldung"
}

for wrong, correct in entries_to_replace.items():
    searchwords.remove(wrong)
    searchwords.append(correct)

searchwords[:10]

['afp',
 'afpj',
 'afpl',
 'fafp',
 'iafp',
 'agence france presse',
 'anp',
 'ansa',
 'agenzia nationale',
 'apa']

In [56]:
dfs_de = []
for doc_path in [join(docs_in_de_dir, filename) for filename in docs_de]:
    dfs_de.append(tsv_to_dataframe(doc_path, keep_comments=True, hipe_format_version="v2"))

In [57]:
pat = r'\s({})\s'.format('|'.join(searchwords))

for doc in dfs_de:
    #find tokens which match with the original impresso searchwords
    matches = re.findall(pat, " ".join(doc["TOKEN"]), flags=re.IGNORECASE)
    #if matches found, split them in seperate words (e.g. "Agence France Presse" to "Agence", "France", "Presse")
    if matches:
        split_matches = [match.split(" ") for match in matches]
        split_matches = sum(split_matches, [])
        split_matches = list(set(split_matches))
        for match in split_matches:
            #get the index of the match
            indexes = doc[doc["TOKEN"] == match].index
            for index in indexes:
                #if the match is not annotated, print it with its context
                if (doc.loc[index, "NE-FINE-LIT"] == "O"):
                    print(doc["document_id"].values[0], f"({index}):", " ".join(doc.loc[index-5:index+5, "TOKEN"]))

buergerbeamten-1902-07-15-a-i0007 (501): Die erforderlichen Teile de « Ap < parate « werden von
buergerbeamten-1904-06-02-a-i0003 (131): und dann hat Hr . Reuter al « Advokat des Llerfer
buergerbeamten-1904-06-02-a-i0003 (161): Und dann hatte Hr . Reuter in der Wahlversammlung vom letzten
buergerbeamten-1904-06-14-a-i0011 (681): Haus « wohnenden Mich . Reuter wurde die im Garten hinter
buergerbeamten-1905-05-13-a-i0009 (53): , Hirchberg mil Lacharm « Reuter , Kirch « berg .
buergerbeamten-1907-01-24-a-i0008 (391): und ihrem Ehtgatieu Paul « tass « ns , ohne «
buergerbeamten-1908-10-24-a-i0018 (107): Sein Bnidcr . Eifenbahnassistent Theodor Wolff , der am Mitlwoch Nacht
buergerbeamten-1908-10-24-a-i0018 (156): cht nicht gewachtn . Eugen Wolff luar am 23 . September
buergerbeamten-1911-11-11-a-i0005 (1254): die Heeren Prüm 19 . Reuter 18 . Bech 17 ,
diekwochen-1841-07-24-a-i0005 (102): Tarauf sprach der Herr Pastor Wolff » ) aus Nymegen in
diekwochen-1843-11-18-a-i0013 (63): Hrelstraße t

In [58]:
dfs_fr = []
for doc_path in [join(docs_in_fr_dir, filename) for filename in docs_fr]:
    dfs_fr.append(tsv_to_dataframe(doc_path, keep_comments=True, hipe_format_version="v2"))

In [59]:
pat = r'\s({})\s'.format('|'.join(searchwords))

for doc in dfs_fr:
    #find tokens which match with the original impresso searchwords
    matches = re.findall(pat, " ".join(doc["TOKEN"]), flags=re.IGNORECASE)
    #if matches found, split them in seperate words (e.g. "Agence France Presse" to "Agence", "France", "Presse")
    if matches:
        split_matches = [match.split(" ") for match in matches]
        split_matches = sum(split_matches, [])
        split_matches = list(set(split_matches))
        for match in split_matches:
            indexes = doc[doc["TOKEN"] == match].index
            for index in indexes:
                #if the match is not annotated, print it with its context
                if (doc.loc[index, "NE-FINE-LIT"] == "O"):
                    print(doc["document_id"].values[0], f"({index}):", " ".join(doc.loc[index-5:index+5, "TOKEN"]))

avenirgdl-1868-09-08-a-i0013 (255): J . - P . WOLFF , entrepreneur à Luxembourg .
avenirgdl-1871-04-27-a-i0005 (1799): k confiance de l ' ap - dr > l '
avenirgdl-1871-04-27-a-i0005 (466): y , V . nves tt le Point - du -
CDV-1844-01-10-a-i0007 (888): au lieu de l ' ap pliration , de l '
CDV-1844-06-26-a-i0002 (574): Jacques , J ' . ap . rès le programme déjà
CDV-1850-02-02-a-i0005 (1593): puis ces comptes d ' ap jthicaire que ces messieurs de
CDV-1856-10-18-a-i0003 (1493): , Baie , Schaffouse et Ap pcnzcll ; « Attendu que
courriergdl-1845-02-01-a-i0002 (1113): pour anti - catholique ; ap . prenez donc que je
courriergdl-1845-06-21-a-i0013 (457): cette réforme se trouve être ap " l ' iivee cl
courriergdl-1845-06-21-a-i0013 (1093): « einen Äcnoitt enthalten , tt > et { d treil
courriergdl-1845-06-28-a-i0002 (1361): l ' enfant , Jean Wolff , domestique au même faubourg
courriergdl-1845-07-26-a-i0001 (385): MM . le - con"letirs Reuter el Wcrquin qui ne sont
courriergdl-1846-07-29-a-i0002 (1

In [60]:
selection_de = """
DTT-1949-01-01-a-i0017 (20): Radio Moskau hat die folgende Tass - Meldung über den Handeisverkehr
FZG-1923-08-04-a-i0031 (30): . . ig . ( Havas . ) « Präsident Harding
FZG-1924-09-11-a-i0015 (298): Sept . ag , ( Wolff , ) Seit « gestern
FZG-1924-09-11-a-i0015 (364): . ag . ( . Wolff . ) Eine Verfügung der
FZG-1978-07-14-a-i0070 (323): 000 Franken aus Restaurant gestohlen sda . Unbekannte Einbrecher haben aus
FZG-1992-10-31-a-i0145 (127): von den Fernmelde - ( sda ) Mit 478 Millionen Franken
FZG-1999-07-29-a-i0095 (548): Winzerfestspiele zu sehen waren . sda Die Freunde der Folklore undTrachten
luxwort-1918-03-26-a-i0003 (553): 24 . März . ( Hava « , ) 10,50 Uhr
luxwort-1923-03-15-a-i0012 (54): 14 . März . ( Hava . ) Der Streit der
luxwort-1924-06-02-a-i0006 (549): 1 . 3uni . ( Havas . ) Uruguay schlägt Frankreich
luxwort-1924-06-02-a-i0006 (573): 1 . 3uni . ( Havas . ) Schweben schlagt Aegypten
obermosel-1918-04-03-a-i0005 (15): betonte Ministerpräsident Paschitsch nach einer Havasmeldung , daß Serbien seinen Alliierten
obermosel-1918-09-20-b-i0011 (83): Sicherheit befinde . Nach einer Reutermeldung haben sich die in Baku
SGZ-1860-06-26-a-i0012 (806): Zollcr von Frauenfeld . Das Reutersche Telegravhcn - Bureau dringt folgende
"""

selection_fr = """  
EXP-1923-03-09-a-i0087 (1189): ennemis LONDRES , 8 ( Havas ) . — La situation
EXP-1923-03-09-a-i0087 (1249): . DUBLIN , 8 ( Havas ) . — Les troubles
EXP-1931-12-14-a-i0008 (23): Garonne ) , 14 ( Havas ) . — A Montagnac
EXP-1938-03-15-a-i0108 (542): Luxembourg PARIS , 15 ( Havas ) . — Le nouveau
EXP-1938-03-15-a-i0108 (600): Chamberlain LONDRES , 14 ( Havas ) . - Evoquant l
EXP-1938-03-15-a-i0108 (796): JTie note pessimiste d ' Havas PARIS , 15 . —
EXP-1938-03-15-a-i0108 (810): Londres à l ' agence Havas : La déclaration du premier
EXP-1999-04-24-a-i0161 (310): occupés . / ats - afp Pinochet Le Chili veut un
EXP-1999-06-03-a-i0122 (215): dit cet expert . / afp Munich Le Pen à l
EXP-1999-06-03-a-i0122 (53): l ' étranger . / ats - af p Bagdad Toxiques
EXP-1999-06-03-a-i0122 (335): déclaré à I ' Associated Press à Paris que cette amende
EXP-1999-06-03-a-i0122 (334): a déclaré à I ' Associated Press à Paris que cette
GDL-1859-04-25-a-i0002 (1134): pas . L ' agence Havas n ' a reçu à
GDL-1938-12-27-a-i0044 (1945): . 26 décembre . ( Havas . ) " — Dans
GDL-1938-12-27-a-i0044 (2218): , 26 décembre . ( Havas . ) — ' Au
GDL-1938-12-27-a-i0044 (1676): , 26 décembre . ( Hayas . ) — Une nouvelle
IMP-1946-09-13-a-i0173 (1623): MOSCOU , 13 . — Reuter . — Les « Isvestia
IMP-1946-09-13-a-i0173 (1717): BERLIN , 13 . — Reuter . — La délégation britannique
IMP-1946-09-13-a-i0173 (1807): SUCCESS , 13 . — AFP . — L ' ASSEMBLEE
IMP-1985-06-25-a-i0025 (59): jugement auprès de l ' ATS . Le condamné , un
indeplux-1904-01-11-a-i0003 (888): en croire la New YorTc Associated presse , la note russe
JDG-1854-07-08-a-i0006 (938): est de l ' office Havas ; malheureusement pour les Turcs
JDG-1854-07-08-a-i0006 (1012): rassurer , l ' office Havas nous envoie ceci , en
JDG-1862-03-25-a-i0005 (739): mars . — La Correspondance Havai , dans une note officielle
JDG-1865-08-30-a-i0021 (245): télégramme de l ' office Reuter , l ' Autriche ;
JDG-1876-12-19-a-i0027 (467): décembre . L ' agence Reuter publie des avis de Cons
JDG-1877-05-22-a-i0011 (940): côté , l ' agence Havas , d ' ailleurs très
JDG-1877-11-09-a-i0016 (344): , une dépêche de VAgence Havas nous a entretenus d '
JDG-1877-11-09-a-i0016 (443): avoir employé l ' agence Havas pour lancer une nouvelle si
JDG-1881-01-09-a-i0013 (33): celles de l ' agence Stefani , le moindre journal en
JDG-1882-09-17-a-i0014 (447): possible . L ' agence Reuter a reçu des musulmans notables
JDG-1886-07-21-a-i0036 (157): Russie à l ' agence Havas rectifie les renseignements donnés sur
JDG-1887-09-15-a-i0016 (373): , par l ' agence Havas , qu ' il ne
JDG-1888-08-09-a-i0017 (64): a envoyée l ' agence Stefani , que le capitaine Cugia
JDG-1888-08-19-a-i0033 (356): assertion . L ' agence llavas croit savoir que les ministres
JDG-1897-06-20-a-i0030 (157): l ' a ? gence Stefani que sur l ' invitation
JDG-1915-01-19-a-i0002 (38): major allemand et les dépêches Wolff l ' enflent démesurément .
JDG-1916-09-10-a-i0025 (454): loi constitutionnelle . » ( Wolff ) Une guerre qui finit
JDG-1916-09-10-a-i0025 (602): un soulèvement leligieux . ( Reuter ) . La situation aux
JDG-1916-09-10-a-i0025 (905): agiession militaire . » ( Reuter )
JDG-1938-04-22-a-i0125 (22): spécial de l ' agence Havasi après l ' avance effectuée
JDG-1959-08-22-a-i0157 (318): , anonce l ' agence Tass . Après ceux de Boston
JDG-1959-08-22-a-i0157 (381): , indique l ' agence Tass .
JDG-1962-11-16-a-i0139 (480): - le problème nucléaire ( AFP . ) Répondant aux journalistes
JDG-1975-12-16-a-i0010 (429): ' agence américaine UPI ( United Press International ) , a
JDG-1975-12-16-a-i0010 (430): agence américaine UPI ( United Press International ) , a été
JDG-1975-12-16-a-i0010 (411): technicien de l ' agence UPI . — M . Assaad
JDG-1975-12-16-a-i0010 (427): de l ' agence américaine UPI ( United Press International )
JDG-1990-08-23-a-i0196 (376): Provence Marseille , 22 ( ATS / Reuter / AFP )
JDG-1990-08-23-a-i0196 (380): ( ATS / Reuter / AFP ) . - Les sauveteurs
JDG-1990-08-23-a-i0196 (641): Jaffna Colombo , 22 ( AFP ) . ~ L '
JDG-1990-08-23-a-i0196 (766): San José , 22 ( AFP ) . - La guérilla
JDG-1990-09-29-a-i0082 (156): vendredi Bruxelles , 27 ( Reuter ) . - La Communauté
JDG-1991-10-10-a-i0115 (648): déclaré mercredi à l ' ATS le Dr Walter Schwarz .
JDG-1994-06-22-a-i0063 (122): a indiqué à l ' ATS le chef des finances François
LLE-1888-04-05-a-i0010 (411): Boulanger . — L Agence Ilavas publie la note suivante :
LLE-1888-04-05-a-i0010 (604): complètement inexact , dit lAgence Ilavas . La Cocarde invoque à
LLE-1900-12-21-a-i0037 (345): officieuse du gouvernement français , Havas , n a pas reproduit
LLE-1904-11-27-a-i0009 (470): en Autriche - Hon - VAssociated Press dit tenir de bonne
LLE-1914-12-04-a-i0017 (281): décembre , ; i ( flavas j . — ile Londres
LLE-1918-01-02-a-i0007 (1072): , 2 janvier . ( Slefani . ) — Au Sénat
LLE-1939-07-17-a-i0006 (97): Bozen , l envoyé dUnHcd Press a enlendu déclarer partout que
LLE-1940-01-24-a-i0004 (641): 24 janvier . ( Unigted Press . ) — Dans les
LLE-1950-03-23-a-i0087 (514): , 23 mars . ( Reuter . ) — M .
LLE-1952-05-23-a-i0115 (1286): Long , correspondant particulier dUnited Press , M . Kur Schumacher
LLE-1959-07-29-a-i0059 (44): Novossibirsk , annonce l agence Tass . L appareil transportant le
LLE-1959-07-29-a-i0059 (367): Ukraine , annonce l agence Tass . Aecompagne des dirigeants du
LLE-1998-07-08-a-i0260 (95): heures dérection . L agence AFP , citant des journaux espagnols
LLE-1998-07-08-a-i0260 (319): a confié à l agence Reuters que le nombre de patients
lunion-1870-06-02-a-i0014 (1441): télégrammes de VA - Mnce Havas . Une proposition , due
indeplux-1880-02-07-a-i0004 (595): Australie . L ' Agence Reuter de Bombay a ouvert une
"""

In [61]:
pat_selection = "([A-Za-z\d-]*) \((\d*)\): (.*)"

selection_de = re.findall(pat_selection, selection_de)
selection_fr = re.findall(pat_selection, selection_fr)

selection_de[:5]

[('DTT-1949-01-01-a-i0017',
  '20',
  'Radio Moskau hat die folgende Tass - Meldung über den Handeisverkehr'),
 ('FZG-1923-08-04-a-i0031', '30', '. . ig . ( Havas . ) « Präsident Harding'),
 ('FZG-1924-09-11-a-i0015', '298', 'Sept . ag , ( Wolff , ) Seit « gestern'),
 ('FZG-1924-09-11-a-i0015', '364', '. ag . ( . Wolff . ) Eine Verfügung der'),
 ('FZG-1978-07-14-a-i0070',
  '323',
  '000 Franken aus Restaurant gestohlen sda . Unbekannte Einbrecher haben aus')]

In [62]:
df_selection_de = pd.DataFrame(
    {
        "document_id": [entry[0] for entry in selection_de],
        "index": [entry[1] for entry in selection_de],
        "text": [entry[2] for entry in selection_de]
    }
)

df_selection_fr = pd.DataFrame(
    {
        "document_id": [entry[0] for entry in selection_fr],
        "index": [entry[1] for entry in selection_fr],
        "text": [entry[2] for entry in selection_fr]
    }
)

#df_selection_de.to_csv("./../data/data_info/missing_annotations_de_raw.csv")

### Modify missed agency entries 

-> through curation tool in inception

In [51]:
modified_docs_de_dir = "../data/modified_annotations/annotated/de/"
modified_docs_fr_dir = "../data/modified_annotations/annotated/fr/"


modified_docs_de_xmi = [f for f in listdir(modified_docs_de_dir) 
                        if isfile(join(modified_docs_de_dir, f)) and f[-1]=="i"] #exclude typesystem.xml
modified_docs_de = [f[:-4] for f in modified_docs_de_xmi]


modified_docs_fr_xmi = [f for f in listdir(modified_docs_fr_dir) 
                        if isfile(join(modified_docs_fr_dir, f)) and f[-1]=="i"]
modified_docs_fr = [f[:-4] for f in modified_docs_fr_xmi]

In [54]:
#check if correct documents have been curated and downloaded
assert len(modified_docs_de) == len(set(modified_docs_de).intersection(set(df_selection_de["document_id"])))
assert len(modified_docs_fr) == len(set(modified_docs_fr).intersection(set(df_selection_fr["document_id"])))
print(f"#articles modified in German corpus: {len(modified_docs_de)}")
print(f"#articles modified in French corpus: {len(modified_docs_fr)}")

#articles modified in German corpus: 12
#articles modified in French corpus: 46


**now**: save articles in "../data/annotated/" and redo segmentation pipeline

## Working with unk

In [3]:
unk_de = get_newsag_df("unk", docs_in_de_dir, docs_de)
unk_fr = get_newsag_df("unk", docs_in_fr_dir, docs_fr)

In [4]:
unk_fr[unk_fr["name"].str.contains("Chine")]

Unnamed: 0,name,n,doc_id
15,Chine Nouvelle,"[46, 47]",EXP-1988-01-25-a-i0403
20,Chine Nouvelle,"[192, 193]",GDL-1989-06-28-a-i0201
21,Chine Nouvelle,"[321, 323]",GDL-1989-06-28-a-i0201
30,Chine nouvelle,"[302, 303]",IMP-1959-09-12-a-i0224
31,Chine nouvelle,"[73, 74]",IMP-1966-12-29-a-i0004


In [5]:
unk_de["name"].value_counts()

ag .                                                     61
ag                                                       11
Kipa                                                     10
Exchange                                                  8
Telunion                                                  3
Tsch . P . V .                                            2
ADN                                                       2
sda                                                       2
Conti                                                     2
ATA                                                       2
Agence Fournier                                           1
Preß Te ! ,                                               1
KNA                                                       1
Pm                                                        1
Petersburger Telegr . - Agentur .                         1
Fournier                                                  1
Petersburger Telegraphen - Agentur      

In [6]:
unk_fr["name"].value_counts()

Exchange                             7
Bavas                                4
Ofi                                  3
Chine Nouvelle                       3
Press Association                    2
Keystone                             2
Chine nouvelle                       2
Kipa                                 2
P . T . S .                          2
C . N . B .                          2
B . C . H .                          2
ATQ                                  1
APP                                  1
CPS                                  1
Ex - Change                          1
NDB                                  1
OFI                                  1
patriarche                           1
I . P . S .                          1
Bloomberg                            1
télégraphe de Saint - Pétersbourg    1
R . K .                              1
Red                                  1
M . de B .                           1
ExcluuiKe                            1
Barras                   

In [3]:
substitutions = {
    ("ag", "ag.", "ag . ", "ag .", "Agency", "Ag", "Ag .") : ({"unk": "ag"}, {"unk": "unk"}),
    ("Bavas", "Agence Bavas", "Barras") : ({"unk": "Havas"}, {"unk": "Q2826560"}),
    ("Russische Telegraphen - Agentur", "Russische Teleglllvhen - Agentur") : 
                                        ({"unk": "TASS"}, {"unk": "Q223799"}),
    ("sda") : ({"unk": "ATS-SDA"}, {"unk": "Q430109"}),
    ("Kipa", "ttfinm") : ({"unk": "Kipa"}, {"unk": "Q1522416"}),
    ("Exchange", "Kxehange", "Kxchange", "Ex - Change", "ExcluuiKe") : ({"unk": "Extel"}, {"unk": "Q1525848"}),
    ("Chine Nouvelle", "Chine nouvelle") : ({"unk": "Xinhua"}, {"unk": "Q204839"})
}

In [4]:
#German corpus
modify_NIL_fine_by_dict(substitutions, docs_out_de_dir, docs_de)

replaced unk with ag in DTT-1943-09-30-a-i0009
replaced unk with ag in DTT-1943-09-30-a-i0009
replaced unk with ag in DTT-1946-04-27-a-i0108
replaced unk with ag in DTT-1946-04-27-a-i0108
replaced unk with ag in FZG-1935-09-25-a-i0006
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1946-03-12-a-i0019
replaced unk with ag in FZG-1948-02-04-a-i0021


In [5]:
#French Corpus
modify_NIL_fine_by_dict(substitutions, docs_out_fr_dir, docs_fr)

In [6]:
get_newsag_df("unk", docs_out_de_dir, docs_de)["name"].value_counts()

Telunion                                                 3
ADN                                                      2
Conti                                                    2
Tsch . P . V .                                           2
ATA                                                      2
Agence dAnatoiie                                         1
KNA                                                      1
Pm                                                       1
Agence Fournier                                          1
Petersburger Telegr . - Agentur .                        1
Petersburger Telegraphen - Agentur                       1
kaiserlich - lönigliche Wiener Korrespondenz - Bureau    1
Kori - esp . Bureau                                      1
rma                                                      1
Bernama News Agency                                      1
A . S .                                                  1
Interim                                                 

## Check United Preß vs United Preß.

In [7]:
UP_de = get_dataframes_with_newsag_name("Q493845", docs_out_de_dir, docs_de)
len(UP_de)

32

In [8]:
df_UP = pd.DataFrame()
for df in UP_de:
    df_UP = pd.concat([df_UP, df])

In [9]:
df_UP[df_UP["NE-FINE-LIT"].str.contains("UP") & df_UP["TOKEN"]=="."]

Unnamed: 0,n,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,RENDER,SEG,OCR-INFO,MISC,language,newspaper,date,document_id,news-agency-as-source,segment_iiif_link


## ag vs. ag.

In [10]:
ag_de = get_dataframes_with_newsag_name("unk", docs_out_de_dir, docs_de)

#get all mentions which contain the "ag" token without a tagged "." afterwards
ag_mentions_without_period = []
for df in ag_de:
    new_ags = get_full_mentions_and_position("\.ag", df)
    if new_ags:
        if "." not in new_ags[0]["name"]:
            ag_mentions_without_period += new_ags

#get the respective dataframes
id_ag_without_period = set([entry["doc_id"] for entry in ag_mentions_without_period])
ag_df_without_period = {df["document_id"].values[0]: df for df in ag_de if (df["document_id"].values[0] in id_ag_without_period) }

In [11]:
#add annotation for "." if there is any
for mention in ag_mentions_without_period:
    next_index = mention["n"][0] + 1
    doc_id = mention["doc_id"]
    df = ag_df_without_period[doc_id]
    if (df[df["n"] == next_index ]["TOKEN"] == ".").bool() and not \
        (df[df["n"] == next_index ]["NE-FINE-LIT"] == "I-org.ent.pressagency.ag").bool():
        
        print(df[df["n"] == next_index])


In [12]:
ag_fr = get_dataframes_with_newsag_name("unk", docs_out_fr_dir, docs_fr)

#get all mentions which contain the "ag" token without a tagged "." afterwards
ag_mentions_without_period = []
for df in ag_fr:
    new_ags = get_full_mentions_and_position("\.ag", df)
    if new_ags:
        if "." not in new_ags[0]["name"]:
            ag_mentions_without_period += new_ags

#get the respective dataframes
id_ag_without_period = set([entry["doc_id"] for entry in ag_mentions_without_period])
ag_df_without_period = {df["document_id"].values[0]: df for df in ag_fr if (df["document_id"].values[0] in id_ag_without_period) }

In [13]:
ag_mentions_without_period

[{'doc_id': 'GDL-1942-06-13-a-i0074', 'name': 'agà', 'n': [29]},
 {'doc_id': 'GDL-1998-01-13-a-i0115', 'name': 'Agences', 'n': [237]}]

## Replacing Words in tsv

In [1]:
#based on https://stackoverflow.com/questions/13089234/replacing-text-in-a-file-with-python

def replace_words_in_tsv(file_path, replacements):
    """  
    :params file_path: Path to tsv file where changes should be made
    :params replacements: dict of form {old_word: new_word, ...}, where old_word should replace new_word
    """
    lines = []
    with open(file_path) as infile:
        for line in infile:
            for src, target in replacements.items():
                line = line.replace(src, target)
            lines.append(line)
    with open(file_path, 'w') as outfile:
        for line in lines:
            outfile.write(line)

In [15]:
#changing some annotation afterwards, should be obsolete if convert_xmi2clef_format.py (and pipeline afterwards) is rerun
replacements = {'PySBDSegment':'EndOfSentence', 'Q1315548':'NIL'}

for doc in docs_de:
    replace_words_in_tsv(join(docs_out_de_dir, doc), replacements)

for doc in docs_fr:
    replace_words_in_tsv(join(docs_out_fr_dir, doc), replacements)

# Making a Multilingual Dataset

In [9]:
RELEASE_DIR = "../data/release/2/"
RELEASE = 2

In [7]:
basedir = os.path.join(RELEASE_DIR, "multilingual/")

if not os.path.exists(basedir):
    Path(basedir).mkdir(parents=True, exist_ok=True)

In [14]:
#for each set, concatenate the French and German tsv files
# (based on concat_tsv_files() from create_datasets.py)
for set in ["train", "dev", "test"]:
    
    with open(join(basedir, f"newsagency-data-{RELEASE}-{set}-multilingual.tsv"), 
              "w", encoding="utf-8", newline="") as out_tsv_file:
        data = []
        files = [join(RELEASE_DIR, f"de/newsagency-data-{RELEASE}-{set}-de.tsv"),
                 join(RELEASE_DIR, f"fr/newsagency-data-{RELEASE}-{set}-fr.tsv")]
        
        for n, file in enumerate(files):
            with open(file, "r", encoding="utf-8") as inp_tsv_file:
                #only first line of document should have 
                if n > 0:
                    lines = inp_tsv_file.readlines()[1:]
                else:
                    lines = inp_tsv_file.readlines()[0:]
                data.append("".join(lines))

        out_tsv_file.write("\n".join(data))

# Constructing different Datasets

### Delete pers.ind.articleauthor tag (in separate dataset)

In [19]:
no_pers_path_de = "./../data/release/2_no_pers/de/"
no_pers_path_fr = "./../data/release/2_no_pers/fr/"
no_pers_path_multi = "./../data/release/2_no_pers/multilingual/"

docs_no_pers = []
for dir in [no_pers_path_de, no_pers_path_fr, no_pers_path_multi]:
    docs_no_pers += [join(dir, doc) for doc in listdir(dir)]

In [41]:
# go through every line an replace the articleauthor tag; save in SAME folder
for doc in docs_no_pers:
    lines = []
    with open(doc) as infile:
        print("Removing pers.ind.articleauthor tags from file", doc)
        for line in infile:
            if "pers.ind.articleauthor" in line:
                #split line by tab
                splitted_line = line.split("\t")

                #coarse NE
                splitted_line[1] = "O"
                #fine NE
                splitted_line[3] = "O"
                #LED
                splitted_line[-2] = "_"
                
                #rejoin line
                line = "\t".join(splitted_line)
            lines.append(line)

    with open(doc, 'w') as outfile:
            for line in lines:
                outfile.write(line)

Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/de/newsagency-data-2-no_pers-dev-de.tsv
Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/de/newsagency-data-2-no_pers-test-de.tsv
Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/de/newsagency-data-2-no_pers-train-de.tsv
Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/fr/newsagency-data-2-no_pers-dev-fr.tsv
Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/fr/newsagency-data-2-no_pers-test-fr.tsv
Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/fr/newsagency-data-2-no_pers-train-fr.tsv
Removing pers.ind.articleauthor tags from file ./../data/release/2_no_pers/multilingual/newsagency-data-2-no_pers-train-multilingual.tsv


### Delete lines with agency mention completely (separate dataset)

In [43]:
no_agency_path = "./../data/release/2_no_agency/"
docs_no_agency = [join(no_agency_path, doc) for doc in listdir(no_agency_path)]

In [44]:
# go through every line an replace the articleauthor tag; save in SAME folder
for doc in docs_no_agency:
    lines = []
    with open(doc) as infile:
        print("Removing complete lines with agency mention from file", doc)
        for line in infile:
            if not "org.ent.pressagency" in line:
                lines.append(line)

    with open(doc, 'w') as outfile:
            for line in lines:
                outfile.write(line)

Removing complete lines with agency mention from file ./../data/release/2_no_agency/newsagency-data-2-no_agency-test-de.tsv
Removing complete lines with agency mention from file ./../data/release/2_no_agency/newsagency-data-2-no_agency-test-fr.tsv
