# Annotation Planning
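Build a CSV that lists, for every annotated document, its corpus, annotator, Inception project, annotation status and train/dev/test split.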

In [108]:
import os
from os import listdir
from os.path import join, isfile
from typing import List, Tuple

import pandas as pd

In [109]:
# CSV used to organise the annotation work; read by annotation_planning2df
ann_planning_per_annotator = "./../data/data_info/annotation_planning_per_annotator.csv"
# Directory with the annotated_*_docIDs.txt lists; the per-document table is saved here as well
annotated_docID_dir = "./../data/data_info/"
# Root data directory; get_annotated_ids lists the xmi folders below it when recomputing the ID lists
DATA_PATH = "../data/"

In [110]:
def get_annotated_ids(annotated_docs: str, lang: str, recompute_file: bool = False) -> Tuple[List[str], List[str]]:
    """
    Get the document IDs of the annotated core and IA articles of one language.

    The IDs are read from (or, if recompute_file is set, written to) the files
    annotated_core_{lang}_docIDs.txt and annotated_IA_{lang}_docIDs.txt inside annotated_docs.

    Parameters:
        annotated_docs: directory which contains the txt-files listing the annotated document IDs
                        (one ID per line); a trailing slash is optional
        lang: language ("de" or "fr")
        recompute_file: if True, rebuild both ID lists from the files found in
                        DATA_PATH/for_annotation/core_{lang}/xmi/ and DATA_PATH/for_annotation/IA_{lang}/xmi/
                        and overwrite the txt-files; if False, read the existing txt-files

    Returns:
        (core_IDs, IA_IDs): document IDs of the core articles and of the IA articles
    """
    # join() instead of string concatenation, so the directory works with or without a trailing slash
    core_list_path = join(annotated_docs, f"annotated_core_{lang}_docIDs.txt")
    IA_list_path = join(annotated_docs, f"annotated_IA_{lang}_docIDs.txt")

    if recompute_file:
        annotation_core_dir = join(DATA_PATH, f"for_annotation/core_{lang.lower()}/xmi/")
        annotation_IA_dir = join(DATA_PATH, f"for_annotation/IA_{lang.lower()}/xmi/")

        # document ID = file name without its last four characters (presumably the ".xmi" ending)
        core_IDs = sorted(
            filename[:-4] for filename in listdir(annotation_core_dir)
            if isfile(join(annotation_core_dir, filename))
        )
        IA_IDs = sorted(
            filename[:-4] for filename in listdir(annotation_IA_dir)
            if isfile(join(annotation_IA_dir, filename))
        )

        with open(core_list_path, "w") as f:
            f.writelines(doc_id + "\n" for doc_id in core_IDs)

        with open(IA_list_path, "w") as f:
            f.writelines(doc_id + "\n" for doc_id in IA_IDs)

    else:
        with open(core_list_path, "r") as f:
            core_IDs = [line.rstrip() for line in f]

        with open(IA_list_path, "r") as f:
            IA_IDs = [line.rstrip() for line in f]

    return core_IDs, IA_IDs
        


def annotation_planning2df(annotation_planning_path: str) -> pd.DataFrame:
    """
    Takes the csv file used to organise for annotation and turns it into a pandas Dataframe with the following columns:
        Annotator (str), Inception Project (str), Newspapers (list of str), Finished Annotation (bool)

    Parameters:
        annotation_planning_path: path of the annotation planning csv file

    :rtype: pd.DataFrame
    """
    ann_planning = pd.read_csv(annotation_planning_path, usecols=["Annotator", "Inception Project", "Newspapers", "Finished Annotation"])
    # rows with an empty Annotator cell inherit the annotator of the row above;
    # .ffill() replaces the deprecated fillna(method='ffill')
    ann_planning['Annotator'] = ann_planning['Annotator'].ffill()
    # "A, B, C" -> ["A", "B", "C"]
    ann_planning['Newspapers'] = ann_planning['Newspapers'].apply(lambda x: x.split(", "))

    return ann_planning

In [111]:
#get all the annotated document IDs from the core corpus
#[0]: only use the core_IDs
# get_annotated_ids returns (core_IDs, IA_IDs); only the core corpus IDs are used here
core_ids_fr, _ia_ids_fr = get_annotated_ids(annotated_docID_dir, "fr")
core_ids_de, _ia_ids_de = get_annotated_ids(annotated_docID_dir, "de")

# one single-column frame per language, tagged with its corpus
doc_IDs_fr = pd.DataFrame(core_ids_fr, columns=["Document ID"]).assign(Corpus="fr")
doc_IDs_de = pd.DataFrame(core_ids_de, columns=["Document ID"]).assign(Corpus="de")

# German documents first, French documents after
doc_IDs = pd.concat([doc_IDs_de, doc_IDs_fr])
doc_IDs.head()

Unnamed: 0,Document ID,Corpus
0,DTT-1943-01-13-a-i0005,de
1,DTT-1943-06-05-a-i0036,de
2,DTT-1943-07-20-a-i0033,de
3,DTT-1943-09-30-a-i0009,de
4,DTT-1945-08-09-a-i0008,de


In [112]:
# one row per (annotator, project, newspaper)
annotation_df = (
    annotation_planning2df(ann_planning_per_annotator)
    .explode("Newspapers", ignore_index=True)
)
# luxwort and waechtersauer are listed twice (mduring and lmarxen);
# for now keep only mduring's rows by removing lmarxen's
is_lux_or_waechter = annotation_df["Newspapers"].isin(["luxwort", "waechtersauer"])
is_lmarxen = annotation_df["Annotator"] == "lmarxen"
annotation_df = annotation_df[~(is_lux_or_waechter & is_lmarxen)]
annotation_df

Unnamed: 0,Annotator,Inception Project,Newspapers,Finished Annotation
0,eboros,impresso-newsagencies: FR,JDG,True
1,eboros,impresso-newsagencies: FR,LCR,True
2,eboros,impresso-newsagencies: FR-Minireference,all,True
3,mduring,impresso-newsagencies: DE,luxwort,True
4,mduring,impresso-newsagencies: DE,FZG,True
5,mduring,impresso-newsagencies: DE,DTT,True
6,mduring,impresso-newsagencies: DE,obermosel,True
7,mduring,impresso-newsagencies: DE,waechtersauer,True
8,mduring,impresso-newsagencies: DE,SGZ,True
9,mduring,impresso-newsagencies: DE,EZR,True


In [113]:
# the newspaper alias is the part of the document ID before the first "-"
doc_IDs["Newspapers"] = doc_IDs["Document ID"].str.split("-").str[0]
# attach the planning information of the newspaper to each document; the join key is not kept
annotation_df = pd.merge(doc_IDs, annotation_df, on="Newspapers").drop(columns="Newspapers")
annotation_df.head()

Unnamed: 0,Document ID,Corpus,Annotator,Inception Project,Finished Annotation
0,DTT-1943-01-13-a-i0005,de,mduring,impresso-newsagencies: DE,True
1,DTT-1943-06-05-a-i0036,de,mduring,impresso-newsagencies: DE,True
2,DTT-1943-07-20-a-i0033,de,mduring,impresso-newsagencies: DE,True
3,DTT-1943-09-30-a-i0009,de,mduring,impresso-newsagencies: DE,True
4,DTT-1945-08-09-a-i0008,de,mduring,impresso-newsagencies: DE,True


In [114]:
#add information of split
# load the train/dev/test assignment of both corpora (document uid -> split)
split_df_de, split_df_fr = (
    pd.read_csv(os.path.join("./../data/data_info/", csv_name), usecols=["uid", "split"])
    for csv_name in ("train-dev-test_de.csv", "train-dev-test_fr.csv")
)
split_df = pd.concat([split_df_de, split_df_fr])

# left join keeps every annotated document, also those without a split entry
annotation_df = (
    annotation_df
    .merge(split_df, how="left", left_on="Document ID", right_on="uid")
    .drop(columns="uid")
)

In [115]:
int(annotation_df["split"].isna().sum())  # number of rows without a split; 0 means every row is assigned one

0

## Change Values Manually

In [116]:
def change_values_in_1col(df, col, values_to_change: dict):
    """
    Return a copy of df in which the values of one column are overwritten for selected documents.

    Parameters:
        df: dataframe with a "Document ID" column
        col: name of the column whose values are changed
        values_to_change: maps Document ID -> new value for col

    Returns:
        new dataframe (df itself is not modified) with "Document ID" as a regular column again

    Raises:
        KeyError: if a Document ID of values_to_change does not occur in df
    """
    changed_df = df.copy().set_index("Document ID")

    # .loc assignment with an unknown label would silently append a new, otherwise empty row,
    # so unknown IDs (e.g. typos) are rejected up front
    unknown_ids = [doc_id for doc_id in values_to_change if doc_id not in changed_df.index]
    if unknown_ids:
        raise KeyError(f"Document IDs not found in the dataframe: {unknown_ids}")

    for doc_id, value in values_to_change.items():
        changed_df.loc[doc_id, col] = value
    return changed_df.reset_index()

In [117]:
#annotations "curated" by lmarxen (resolved issues with unk token)
# documents whose annotations were "curated" by lmarxen (issues with the unk token were resolved)
ann_to_lmarxen_ids = [
    "DTT-1943-06-05-a-i0036",
    "DTT-1953-01-24-a-i0204",
    "DTT-1959-12-01-a-i0079",
    "DTT-1962-02-26-a-i0044",
    "DTT-1969-10-13-a-i0105",
    "FZG-1926-11-19-a-i0020",
    "FZG-1929-12-06-a-i0015",
    "FZG-1966-11-23-a-i0016",
    "FZG-1968-06-15-a-i0099",
    "SGZ-1847-11-23-a-i0003",
    "luxwort-1917-06-27-a-i0010",
    "luxwort-1917-02-19-a-i0027",
    "luxwort-1917-05-10-a-i0005",
]
# every listed document gets "lmarxen" as its new annotator
ann_to_lmarxen = dict.fromkeys(ann_to_lmarxen_ids, "lmarxen")

In [118]:
# overwrite the annotator of the curated documents with lmarxen
annotation_df = change_values_in_1col(
    df=annotation_df,
    col="Annotator",
    values_to_change=ann_to_lmarxen,
)
annotation_df.head()

Unnamed: 0,Document ID,Corpus,Annotator,Inception Project,Finished Annotation,split
0,DTT-1943-01-13-a-i0005,de,mduring,impresso-newsagencies: DE,True,train
1,DTT-1943-06-05-a-i0036,de,lmarxen,impresso-newsagencies: DE,True,train
2,DTT-1943-07-20-a-i0033,de,mduring,impresso-newsagencies: DE,True,train
3,DTT-1943-09-30-a-i0009,de,mduring,impresso-newsagencies: DE,True,test
4,DTT-1945-08-09-a-i0008,de,mduring,impresso-newsagencies: DE,True,dev


In [119]:
#The German luxwort & waechtersauer was annotated by mduring, French luxwort & waechtersauer by lmarxen
# German luxwort & waechtersauer were annotated by mduring, the French ones by lmarxen:
# collect the French documents of these two newspapers
is_luxwort_or_waechtersauer = doc_IDs_fr["Document ID"].str.contains("luxwort|waechtersauer")
luxwort_w_fr = doc_IDs_fr.loc[is_luxwort_or_waechtersauer, "Document ID"].tolist()
# map each of them to lmarxen
luxwort_w_fr_dict = dict.fromkeys(luxwort_w_fr, "lmarxen")
luxwort_w_fr_dict

{'luxwort-1863-08-15-a-i0004': 'lmarxen',
 'luxwort-1866-06-10-a-i0062': 'lmarxen',
 'luxwort-1867-07-21-a-i0016': 'lmarxen',
 'luxwort-1880-03-06-a-i0013': 'lmarxen',
 'luxwort-1898-05-21-b-i0016': 'lmarxen',
 'luxwort-1949-05-19-a-i0061': 'lmarxen',
 'waechtersauer-1851-12-03-a-i0013': 'lmarxen',
 'waechtersauer-1856-07-09-a-i0017': 'lmarxen',
 'waechtersauer-1859-10-22-a-i0040': 'lmarxen'}

In [120]:
# Re-assign the French luxwort & waechtersauer documents to lmarxen.
# NOTE(review): only "Annotator" is overwritten; "Inception Project" keeps the value merged in from
# the planning table (the luxwort listing at the end of the notebook shows
# "impresso-newsagencies: DE" for fr documents) -- confirm that this is intended.
annotation_df = change_values_in_1col(annotation_df, "Annotator", luxwort_w_fr_dict)
annotation_df.head()

Unnamed: 0,Document ID,Corpus,Annotator,Inception Project,Finished Annotation,split
0,DTT-1943-01-13-a-i0005,de,mduring,impresso-newsagencies: DE,True,train
1,DTT-1943-06-05-a-i0036,de,lmarxen,impresso-newsagencies: DE,True,train
2,DTT-1943-07-20-a-i0033,de,mduring,impresso-newsagencies: DE,True,train
3,DTT-1943-09-30-a-i0009,de,mduring,impresso-newsagencies: DE,True,test
4,DTT-1945-08-09-a-i0008,de,mduring,impresso-newsagencies: DE,True,dev


## Save Table (annotation per doc)

In [121]:
# Save the per-document table into the data_info directory.
# NOTE(review): to_csv is called without index=False, so the row index is written as an additional
# unnamed first column -- confirm that readers of this file expect it.
annotation_df.to_csv(os.path.join(annotated_docID_dir, "annotation_planning_per_doc.csv"))

In [122]:
# sanity check: all luxwort documents that ended up with lmarxen as annotator
from_luxwort = annotation_df["Document ID"].str.contains("luxwort")
by_lmarxen = annotation_df["Annotator"].eq("lmarxen")
annotation_df.loc[from_luxwort & by_lmarxen]

Unnamed: 0,Document ID,Corpus,Annotator,Inception Project,Finished Annotation,split
347,luxwort-1917-02-19-a-i0027,de,lmarxen,impresso-newsagencies: DE,True,test
348,luxwort-1917-05-10-a-i0005,de,lmarxen,impresso-newsagencies: DE,True,train
349,luxwort-1917-06-27-a-i0010,de,lmarxen,impresso-newsagencies: DE,True,dev
401,luxwort-1863-08-15-a-i0004,fr,lmarxen,impresso-newsagencies: DE,True,train
402,luxwort-1866-06-10-a-i0062,fr,lmarxen,impresso-newsagencies: DE,True,train
403,luxwort-1867-07-21-a-i0016,fr,lmarxen,impresso-newsagencies: DE,True,dev
404,luxwort-1880-03-06-a-i0013,fr,lmarxen,impresso-newsagencies: DE,True,train
405,luxwort-1898-05-21-b-i0016,fr,lmarxen,impresso-newsagencies: DE,True,train
406,luxwort-1949-05-19-a-i0061,fr,lmarxen,impresso-newsagencies: DE,True,train
