In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import os

from tqdm import tqdm

In [2]:
# Build one row per (segment, annotation) pair from every XML file in `folder`.
folder = "../annotated_corpus/annotations_xml/"
segment_list = []

# os.listdir returns entries in arbitrary order; sort them so the row order of
# anno_df is reproducible, and skip anything that is not an XML file
# (e.g. .DS_Store, .ipynb_checkpoints) instead of crashing the parser on it.
xml_files = sorted(
    name for name in os.listdir(folder) if name.lower().endswith(".xml")
)

for file in tqdm(xml_files):
    # Let ElementTree open the file itself: it then decodes the bytes according
    # to the XML declaration instead of the platform's default text encoding.
    root = ET.parse(os.path.join(folder, file)).getroot()

    for segment in root.findall("segment"):
        # Segment-level fields are the same for every annotation of this
        # segment, so they are collected once and shared by all of its rows.
        segment_base = {'file': file}

        # Attributes of <segment>; 'begin' and 'end' are stored as int.
        for k, v in segment.attrib.items():
            segment_base[k] = int(v) if k in ('begin', 'end') else v

        text_element = segment.find("text")
        segment_base["text"] = text_element.text if text_element is not None else None

        # Optional <mainVerb> / <mainReferent> children: text plus int offsets.
        # A segment without the child simply lacks these keys (NaN in the frame).
        for tag in ["mainVerb", "mainReferent"]:
            element = segment.find(tag)
            if element is not None:
                segment_base[f'{tag}_text'] = element.text
                segment_base[f'{tag}_begin'] = int(element.attrib['begin'])
                segment_base[f'{tag}_end'] = int(element.attrib['end'])

        # One row per <annotation>; its attributes are added last and win on
        # any key clash. Segments without annotations produce no rows.
        for anno in segment.findall("annotation"):
            segment_list.append({**segment_base, **anno.attrib})

# A frame built from a list of dicts already has a fresh 0..n-1 index.
anno_df = pd.DataFrame(segment_list)
anno_df.head()

100%|██████████| 367/367 [00:07<00:00, 50.15it/s]


Unnamed: 0,file,instanceid,begin,end,text,mainVerb_text,mainVerb_begin,mainVerb_end,mainReferent_text,mainReferent_begin,mainReferent_end,annotator,seType,mainReferentGenericity,habituality,mainVerbAspectualClass
0,blog_Acephalous-Cant-believe.xml,blog_Acephalous-Cant-believe.txt_1,0,15,I can't believe,believe,8.0,15.0,I,0.0,1.0,gold,STATE,NON-GENERIC,STATIC,STATIVE
1,blog_Acephalous-Cant-believe.xml,blog_Acephalous-Cant-believe.txt_1,0,15,I can't believe,believe,8.0,15.0,I,0.0,1.0,A,STATE,NON-GENERIC,STATIC,STATIVE
2,blog_Acephalous-Cant-believe.xml,blog_Acephalous-Cant-believe.txt_1,0,15,I can't believe,believe,8.0,15.0,I,0.0,1.0,D,STATE,NON-GENERIC,STATIC,STATIVE
3,blog_Acephalous-Cant-believe.xml,blog_Acephalous-Cant-believe.txt_1,0,15,I can't believe,believe,8.0,15.0,I,0.0,1.0,C,STATE,NON-GENERIC,STATIC,STATIVE
4,blog_Acephalous-Cant-believe.xml,blog_Acephalous-Cant-believe.txt_2,16,43,I wrote all that last year.,wrote,18.0,23.0,I,16.0,17.0,gold,EVENT,NON-GENERIC,EPISODIC,DYNAMIC


In [3]:
# Restrict to the rows whose `annotator` value is 'gold', then count how often
# each seType occurs among them (missing values are counted as their own group).
gold_df = anno_df.loc[anno_df['annotator'].eq('gold')]
gold_df['seType'].value_counts(dropna=False)

seType
STATE                    18337
EVENT                     9688
NaN                       7767
GENERIC_SENTENCE          7582
REPORT                    1617
GENERALIZING_SENTENCE     1466
QUESTION                  1056
IMPERATIVE                1046
CANNOT_DECIDE              970
GENERAL_STATIVE            432
SPEECH_ACT                  48
Name: count, dtype: int64

In [None]:
# Draw a small sample of gold rows with seType == "IMPERATIVE" whose
# mainReferentGenericity is anything other than 'CANNOT_DECIDE'.
# Rows where mainReferentGenericity is missing (NaN) pass the `!=` comparison
# and are therefore kept.
# NOTE(review): the output saved with this cell lists only CANNOT_DECIDE rows,
# so it was produced by a different version of this filter — confirm which
# condition is intended and re-run.
SAMPLE_SIZE = 30
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

is_imperative = gold_df['seType'] == "IMPERATIVE"
referent_decided = gold_df['mainReferentGenericity'] != 'CANNOT_DECIDE'

# .copy() detaches the selection from gold_df.
sample_df = gold_df[is_imperative & referent_decided].copy()
if len(sample_df) > SAMPLE_SIZE:
    sample_df = sample_df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
sample_df.head()

Unnamed: 0,file,instanceid,begin,end,text,mainVerb_text,mainVerb_begin,mainVerb_end,mainReferent_text,mainReferent_begin,mainReferent_end,annotator,seType,mainReferentGenericity,habituality,mainVerbAspectualClass
90585,jokes_jokes1.xml,jokes_jokes1.txt_180,5820,5834,"and multiply.""",multiply,5824.0,5832.0,them,5802.0,5806.0,gold,IMPERATIVE,CANNOT_DECIDE,,CANNOT_DECIDE
6292,blog_Effing-Idiot.xml,blog_Effing-Idiot.txt_406,16937,16974,Embrace the derision and/or ridicule.,and/or,16958.0,16964.0,Embrace,16937.0,16944.0,gold,IMPERATIVE,CANNOT_DECIDE,,CANNOT_DECIDE
104105,jokes_jokes6.xml,jokes_jokes6.txt_138,4722,4742,Avoid Disappointment,Avoid,4722.0,4727.0,Low,4699.0,4702.0,gold,IMPERATIVE,CANNOT_DECIDE,,CANNOT_DECIDE
125297,letters_wwf12.xml,letters_wwf12.txt_56,4053,4151,"And please, take a moment today to renew your ...",take,4065.0,4069.0,,,,gold,IMPERATIVE,CANNOT_DECIDE,,CANNOT_DECIDE
124429,letters_NWF1.xml,letters_NWF1.txt_64,4801,4860,and become a member of NWF with a gift of only...,become,4805.0,4811.0,,,,gold,IMPERATIVE,CANNOT_DECIDE,,CANNOT_DECIDE


In [9]:
# Print each sampled row as a small text record: file name, main referent and
# main verb on one line, the segment text, its seType, then a separator line.
record_columns = ['file', 'mainReferent_text', 'mainVerb_text', 'text', 'seType']

for file_name, referent, verb, segment_text, se_type in zip(
    *(sample_df[column] for column in record_columns)
):
    print(file_name)
    print(referent, verb)
    print(segment_text)
    print(se_type)
    print('---------------------')

jokes_jokes1.xml
them multiply
and multiply."
IMPERATIVE
---------------------
blog_Effing-Idiot.xml
Embrace and/or
Embrace the derision and/or ridicule.
IMPERATIVE
---------------------
jokes_jokes6.xml
Low Avoid
Avoid Disappointment
IMPERATIVE
---------------------
letters_wwf12.xml
nan take
And please, take a moment today to renew your critically important WWF annual membership.
IMPERATIVE
---------------------
letters_NWF1.xml
nan become
and become a member of NWF with a gift of only $15 or more.
IMPERATIVE
---------------------
jokes_jokes14.xml
nan Pretend
Pretend
IMPERATIVE
---------------------
jokes_jokes7.xml
nan nan
"Look,
IMPERATIVE
---------------------
travel_WhereToHongKong.xml
nan walk
For an interesting glimpse of small and family-owned shops, walk along Bonham Strand East and West, Man Wa Lane, and Cleverly Street.
IMPERATIVE
---------------------
travel_WhatToHongKong.xml
nan nan
Ask
IMPERATIVE
---------------------
blog_Effing-Idiot.xml
nan nan
Trust me, y'all,
IMPE