In [1]:
import pandas as pd
import glob
import os
import spacy
# Load the spaCy pipeline "nl_core_news_lg" once; it is used further down in
# this notebook to determine sentence boundaries inside each note.
nlp = spacy.load("nl_core_news_lg")

In [2]:
def legacy_parser(tsv):
    """
    Derive the metadata for one exported annotation file from its path.

    The file number is whatever follows the last underscore in the path,
    with the trailing ".conll/edwin.tsv" removed. Windows-style backslash
    separators are normalised to "/" first: `glob` joins matched path
    components with the platform separator, and without normalisation the
    suffix would not be stripped on Windows.

    Parameters
    ----------
    tsv : str
        Path of the annotation file, e.g.
        ".../ergotherapie_notities_12.conll/edwin.tsv".

    Returns
    -------
    dict
        `file_nr` (str, several notes are loaded per file, so the file
        number is kept as metadata) plus the constant fields `year`,
        `annotator` and `batch`.
    """
    # Make the path separator-independent before looking for the suffix.
    normalised = tsv.replace("\\", "/")
    file_nr = normalised.split('_')[-1].replace(".conll/edwin.tsv", "")

    return dict(
        file_nr = file_nr,
        year = "2020", # original note: up to 2021, 2022
        annotator = "edwin",
        batch = "ergotherapie",
    )

def tsv_to_df(filepath):
    """
    Load one annotation TSV export as a dataframe and attach its metadata.

    The first five lines of the file are skipped, rows that are entirely
    empty are dropped, and rows whose first column starts with '#' are
    removed. The metadata returned by `legacy_parser(filepath)` is then added
    as constant columns.

    Parameters
    ----------
    filepath : str
        Path of the TSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns `note_tok_index`, `char`, `token`, `label`, `relation`
        followed by one column per metadata key.
    """
    extra_columns = legacy_parser(filepath)

    # Here the first column is not a sentence index but the note index
    # combined with the token-within-note index.
    column_names = ['note_tok_index', 'char', 'token', 'label', 'relation']

    # quoting=3 is csv.QUOTE_NONE: quote characters are treated as plain text.
    # NOTE(review): default NA parsing is active, so a token spelled like a
    # pandas NA marker would be read as NaN -- confirm this cannot occur.
    raw = pd.read_csv(
        filepath,
        sep='\t',
        skiprows=5,
        quoting=3,
        names=column_names,
    )
    non_empty = raw.dropna(how='all')
    token_rows = non_empty.query("note_tok_index.str[0] != '#'")
    return token_rows.assign(**extra_columns)

In [3]:
# Read every exported annotation file into its own dataframe and stack them.
# The per-file row index is kept as-is by the concatenation.
annotated_list = [
    tsv_to_df(tsv_path)
    for tsv_path in glob.glob("./../../../inception_output/ergotherapie_output/annotation/ergotherapie_notities_*[0-9].conll/edwin.tsv")
]

annotated = pd.concat(annotated_list)

In [4]:
def _padded_note_nr(note_tok_index):
    """Return the note number (text before the first '-'), padding a single character with a leading '0'."""
    note_nr = note_tok_index.split("-")[0]
    # single digits get an extra 0 (e.g. 1 becomes 01) so the order is preserved
    if len(note_nr) == 1:
        return "0" + note_nr
    return note_nr


# Note ID = padded note number + file number + discipline code, so that no
# two notes share an ID even when they come from different files.
# NOTE(review): padding only covers single-character note numbers -- confirm
# no file holds 100 or more notes if the sort order matters.
annotated["NotitieID"] = (
    annotated["note_tok_index"].map(_padded_note_nr) + annotated["file_nr"] + "003"
)

# Discipline codes:
# 001 for fysiotherapie (physiotherapy)
# 002 for dietetiek (dietetics)
# 003 for ergotherapie (occupational therapy)

In [5]:
# Sort the rows by note ID while keeping the original token order inside each
# note: the current index values serve as a temporary tie-breaker column.
# reset_index() keeps the previous index as a regular "index" column.
annotated = (
    annotated
    .assign(order=annotated.index.values)
    .sort_values(by=["NotitieID", "order"])
    .reset_index()
    .drop(columns=["order"])
)

# Temporary dataframe with one row per note: the individual tokens are joined
# with single spaces into the note text, so spaCy can determine sentence
# boundaries.
annotated_notes = (
    annotated
    .groupby(["NotitieID"])["token"]
    .apply(" ".join)
    .reset_index()
    .rename(columns={'token': 'note'})
)

In [6]:
index_list = []

# Run spaCy on every note to determine sentence boundaries and record, for
# each token row, its sentence index, note ID and token index within the note.
#
# The annotated tokens are passed to spaCy as a pre-tokenised Doc instead of
# letting spaCy re-tokenise the space-joined note text. Re-tokenising can
# split or merge tokens, and then the number of records collected here no
# longer matches the number of rows in `annotated`, shifting every index that
# is attached afterwards.
#
# `annotated` is expected to be sorted by NotitieID at this point (done in the
# previous cell); groupby iterates the notes in that same sorted order and
# keeps the row order inside each note, so the records line up with the rows.
for note_id, note_tokens in annotated.groupby("NotitieID")["token"]:
    doc = spacy.tokens.Doc(nlp.vocab, words=[str(tok) for tok in note_tokens])
    # Apply the pipeline components by hand; this skips only the tokenizer.
    for _, component in nlp.pipeline:
        doc = component(doc)

    sen_index = 0
    for tok_index, token in enumerate(doc):
        # Following Jenia's original code, 'token.is_sent_start' marks the
        # sentence boundaries. The counter is advanced BEFORE the token is
        # recorded, so a sentence-initial token is numbered with its own
        # sentence; the first token of a note starts sentence 0.
        if token.is_sent_start and tok_index > 0:
            sen_index += 1
        index_list.append({"sen_idx": sen_index, "note_idx": note_id, "tok_idx": tok_index})
         

In [7]:
# add the sentence and token index to the original dataframe
df_tmp = pd.DataFrame(index_list)
annotated["sen_id"] = df_tmp["sen_idx"]
annotated["tok_id"] = df_tmp["tok_idx"]

In [8]:
# Write the final dataframe to disk twice: once as a pickle and once as CSV
# (the CSV includes the dataframe index as its first column).
annotated.to_pickle(path="./../../INCEPTION/ergotherapie.pkl")
annotated.to_csv(path_or_buf="./../../INCEPTION/ergotherapie.csv")