# Annotation Planning
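Build a per-document CSV that combines, for every annotated document, its ID, corpus, annotator, Inception project, annotation status and train/dev/test split.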

In [11]:
import pandas as pd
import os
from os import listdir
from os.path import join, isfile
from typing import List

In [12]:
# CSV with the annotation planning per annotator; it is the input of annotation_planning2df.
ann_planning_per_annotator = "./../data/data_info/annotation_planning_per_annotator.csv"
# Directory holding the annotated_{lang}_docIDs.txt lists; the per-document planning CSV is written here too.
annotated_docID_dir = "./../data/data_info/"
# Root data directory; get_annotated_ids looks for the xmi files below its for_annotation/ subfolder.
DATA_PATH = "../data/"

In [13]:
def get_annotated_ids(annotated_docs: str, lang: str, recompute_file: bool = False) -> List[str]:
    """Return the IDs of the documents that were annotated for one language.

    The IDs are stored one per line in ``annotated_{lang}_docIDs.txt`` inside
    ``annotated_docs``.

    Parameters:
        annotated_docs: directory which contains (or will receive) the txt-file
                        listing all annotated articles; a trailing path separator
                        is optional
        lang: language ("de" or "fr")
        recompute_file: if True, rebuild the list from the files found in
                        DATA_PATH/for_annotation/core_{lang}/xmi/ and
                        DATA_PATH/for_annotation/IA_{lang}/xmi/ and overwrite the
                        txt-file; if False, read the existing txt-file

    Returns:
        The document IDs; sorted when recomputed, otherwise in file order.
    """
    # join() instead of "+" so that a directory without trailing separator works too
    id_list_path = join(annotated_docs, f"annotated_{lang}_docIDs.txt")

    if not recompute_file:
        with open(id_list_path, "r", encoding="utf-8") as f:
            return [line.rstrip() for line in f]

    annotation_core_dir = join(DATA_PATH, f"for_annotation/core_{lang.lower()}/xmi/")
    annotation_IA_dir = join(DATA_PATH, f"for_annotation/IA_{lang.lower()}/xmi/")

    # get document IDs of docs which were annotated for the project
    doc_IDs = []
    for xmi_dir in (annotation_core_dir, annotation_IA_dir):
        for filename in listdir(xmi_dir):
            if isfile(join(xmi_dir, filename)):
                # strip the real extension instead of blindly cutting four characters
                doc_IDs.append(os.path.splitext(filename)[0])
    doc_IDs.sort()

    with open(id_list_path, "w", encoding="utf-8") as f:
        f.writelines(f"{doc_id}\n" for doc_id in doc_IDs)

    return doc_IDs
        


def annotation_planning2df(annotation_planning_path: str) -> pd.DataFrame:
    """
    Takes the csv file used to organise for annotation and turns it into a pandas Dataframe with the following columns:
        Annotator (str), Inception Project (str), Newspapers (list of str), Finished Annotation (bool)

    Empty "Annotator" cells are filled with the value of the closest non-empty row above.
    "Newspapers" cells are split on ", "; empty cells are left as missing values
    instead of raising.

    :param annotation_planning_path: path of the planning csv file
    :return: the planning table, one row per csv row
    """
    ann_planning = pd.read_csv(
        annotation_planning_path,
        usecols=["Annotator", "Inception Project", "Newspapers", "Finished Annotation"],
    )
    # Series.ffill() replaces the deprecated fillna(method='ffill')
    ann_planning["Annotator"] = ann_planning["Annotator"].ffill()
    # only split real strings: an empty cell is read as NaN, which has no .split()
    ann_planning["Newspapers"] = ann_planning["Newspapers"].apply(
        lambda cell: cell.split(", ") if isinstance(cell, str) else cell
    )

    return ann_planning

In [14]:
# Collect the annotated document IDs of both corpora in one table,
# tagging every row with the corpus it belongs to.
doc_IDs_fr = pd.DataFrame(
    {"Document ID": get_annotated_ids(annotated_docID_dir, "fr")}
).assign(Corpus="fr")
doc_IDs_de = pd.DataFrame(
    {"Document ID": get_annotated_ids(annotated_docID_dir, "de")}
).assign(Corpus="de")
doc_IDs = pd.concat((doc_IDs_de, doc_IDs_fr))
doc_IDs.head()

Unnamed: 0,Document ID,Corpus
0,DTT-1943-01-13-a-i0005,de
1,DTT-1943-06-05-a-i0036,de
2,DTT-1943-07-20-a-i0033,de
3,DTT-1943-09-30-a-i0009,de
4,DTT-1945-08-09-a-i0008,de


In [15]:
# Load the planning and unfold the newspaper lists: one row per (planning row, newspaper).
planning_table = annotation_planning2df(ann_planning_per_annotator)
annotation_df = planning_table.explode("Newspapers")
annotation_df.head(5)

Unnamed: 0,Annotator,Inception Project,Newspapers,Finished Annotation
0,eboros,impresso-newsagencies: FR,JDG,True
0,eboros,impresso-newsagencies: FR,LCR,True
1,eboros,impresso-newsagencies: FR-Minireference,all,True
2,mduring,impresso-newsagencies: DE,luxwort,True
2,mduring,impresso-newsagencies: DE,FZG,True


In [16]:
# The newspaper acronym is the part of the document ID before the first "-";
# it serves as the join key between documents and planning rows.
doc_IDs["Newspapers"] = doc_IDs["Document ID"].map(lambda doc_id: doc_id.partition("-")[0])
# The join key is no longer needed once the planning columns are attached.
annotation_df = doc_IDs.merge(annotation_df, on="Newspapers").drop(columns="Newspapers")
annotation_df.head()

Unnamed: 0,Document ID,Corpus,Annotator,Inception Project,Finished Annotation
0,DTT-1943-01-13-a-i0005,de,mduring,impresso-newsagencies: DE,True
1,DTT-1943-06-05-a-i0036,de,mduring,impresso-newsagencies: DE,True
2,DTT-1943-07-20-a-i0033,de,mduring,impresso-newsagencies: DE,True
3,DTT-1943-09-30-a-i0009,de,mduring,impresso-newsagencies: DE,True
4,DTT-1945-08-09-a-i0008,de,mduring,impresso-newsagencies: DE,True


In [17]:
# Attach the split recorded for every document in the train-dev-test files
split_dir = "./../data/data_info/"
split_columns = ["uid", "split"]
split_df_de = pd.read_csv(os.path.join(split_dir, "train-dev-test_de.csv"), usecols=split_columns)
split_df_fr = pd.read_csv(os.path.join(split_dir, "train-dev-test_fr.csv"), usecols=split_columns)
split_df = pd.concat([split_df_de, split_df_fr])
# Left join keeps every annotated document; "uid" only duplicates "Document ID" afterwards.
annotation_df = annotation_df.merge(
    split_df, left_on="Document ID", right_on="uid", how="left"
).drop(columns="uid")

In [18]:
# Number of rows without a split; 0 means every row is assigned a split
int(annotation_df["split"].isna().sum())

0

## Save Table (annotation per doc)

In [19]:
# Write the per-document table next to the other data-info files.
per_doc_csv_path = os.path.join(annotated_docID_dir, "annotation_planning_per_doc.csv")
annotation_df.to_csv(per_doc_csv_path)