In [1]:

import joblib
import nltk
nltk.download('punkt_tab')
import numpy as np
import pandas as pd
import ipywidgets as widgets
from sklearn.metrics import classification_report
from IPython.display import display, HTML
from gitma import Catma, CatmaProject


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jvonk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
# Load Classifier
clf_path = 'models/animacy_clf_full.pkl'
classifier = joblib.load(clf_path)

# Load Vectorizer
vectorizer_path = 'models/full_feature_vectorizer.pkl'
vectorizer = joblib.load(vectorizer_path)

In [16]:
# Tokenize text
def tokenize(text):   
    return nltk.word_tokenize(text, 'german')

def make_clf_input(tokens):
    return [[[token] for token in tokens]]

# Find Offsets for tokens
def calc_offsets(text, tokens):
    offsets = []
    start = 0
    for token in tokens:
        start = text.find(token, start)
        end = start + len(token)
        offsets.append((start, end))
    return offsets

# Annotate Text
def clf_annotate(text):
    tokens = tokenize(text)
    features = vectorizer.transform(make_clf_input(tokens))
    probabilities = classifier.predict_proba(features)
    predictions = classifier.predict(features)
    predictions = ["belebt" if p == 0 else "unbelebt" for p in predictions]
    confidences = np.max(probabilities, axis=1)
    return tokens, predictions, confidences

def catma_annotate(text, annotations):
    tokens = tokenize(text)
    annotation_labels = []
    offsets = calc_offsets(text, tokens)
    print(tokens)
    print(offsets)
    for start, end in offsets:
        label = 'unbelebt'
        for annotation in annotations:
            if start >= annotation.start_point and end <= annotation.end_point:
                label = 'belebt'
        annotation_labels.append(label)
    
    return tokens, annotation_labels

    


In [17]:
# Interactive annotation widget
example_text = "Es war einmal ein Müller, der war arm, aber er hatte eine schöne Tochter."
text_input = widgets.Textarea(value=example_text, 
                              description="Texteingabe:", 
                              disabled=False,
                              layout=widgets.Layout(width='50%', height='80px')
                              )
button = widgets.Button(description="Annotieren")
output = widgets.Output()

def on_button_click(b):
    with output:
        output.clear_output()
        tokens, labels, confidences = clf_annotate(text_input.value)
        annotated = list(zip(tokens, labels, confidences))
        
        # HTML formatting: Mark animate entities (optional different colours for confidence thresholds)
        formatted_text = " ".join(
            f'<span style="background-color: lightgreen; padding:2px; border-radius:3px;">{token}</span>'
            if annotation == "belebt" and confidence >= 0.75 else
            f'<span style="background-color: yellow; padding:2px; border-radius:3px;">{token}</span>'
            if annotation == "belebt" and 0.5 <= confidence < 0.75 else token
            for token, annotation, confidence in annotated
        )

        # # Confidence Table
        # conficdence_table = pd.DataFrame(zip(tokens, labels, confidences), columns=['Token', 'Label', 'Confidence'])


        display(HTML(f"<p>{formatted_text}</p>"))
        # display(conficdence_table)

button.on_click(on_button_click)

display(text_input, button, output)

Textarea(value='Es war einmal ein Müller, der war arm, aber er hatte eine schöne Tochter.', description='Texte…

Button(description='Annotieren', style=ButtonStyle())

Output()

In [18]:
# Load Catma Project
my_project_name = 'Belebtheit_in_Mxrchen'

my_project = CatmaProject(
   projects_directory='catma',
   project_name=my_project_name
)

# Load Annotation Collection 
ac_name = "1. Der Froschkönig oder der eiserne Heinrich Belebte Entitäten A2"
annotation_collection = next((ac for ac in my_project.annotation_collections if ac.name == ac_name), None)

# Get Annotations from tag name
annotations = annotation_collection.get_annotation_by_tag("belebte_entität")

# Get Text from Annotation Collection
text = annotation_collection.text.plain_text

Loading tagsets ...
	Found 2 tagset(s).
Loading documents ...
	Found 1 document(s).
Loading annotation collections ...
	Found 1 annotation collection(s).
	Annotation collection "1. Der Froschkönig oder der eiserne Heinrich Belebte Entitäten A2" for document "1. Der Froschkönig oder der eiserne Heinrich"
		Annotations: 216


In [19]:
# Get tokens from text 
tokens = tokenize(text)

# Get Classifier Annotations
_,  clf_annotation_labels, _ = clf_annotate(text)

# Get Catma Annotations
_, catma_annotation_labels = catma_annotate(text, annotations)

['Der', 'Froschkönig', 'oder', 'der', 'eiserne', 'Heinrich', '.', 'In', 'den', 'alten', 'Zeiten', ',', 'wo', 'das', 'Wünschen', 'noch', 'geholfen', 'hat', ',', 'lebte', 'ein', 'König', ',', 'dessen', 'Töchter', 'waren', 'alle', 'schön', ',', 'aber', 'die', 'jüngste', 'war', 'so', 'schön', ',', 'daß', 'die', 'Sonne', 'selber', ',', 'die', 'doch', 'so', 'vieles', 'gesehen', 'hat', ',', 'sich', 'verwunderte', 'so', 'oft', 'sie', 'ihr', 'ins', 'Gesicht', 'schien', '.', 'Nahe', 'bei', 'dem', 'Schlosse', 'des', 'Königs', 'lag', 'ein', 'großer', 'dunkler', 'Wald', ',', 'und', 'in', 'dem', 'Walde', 'unter', 'einer', 'alten', 'Linde', 'war', 'ein', 'Brunnen', ':', 'wenn', 'nun', 'der', 'Tag', 'recht', 'heiß', 'war', ',', 'so', 'ging', 'das', 'Königskind', 'hinaus', 'in', 'den', 'Wald', 'und', 'setzte', 'sich', 'an', 'den', 'Rand', 'des', 'kühlen', 'Brunnens', ':', 'und', 'wenn', 'sie', 'Langeweile', 'hatte', ',', 'so', 'nahm', 'sie', 'eine', 'goldene', 'Kugel', ',', 'warf', 'sie', 'in', 'die', 

In [7]:
print(classification_report(y_true=catma_annotation_labels, y_pred=clf_annotation_labels))

              precision    recall  f1-score   support

      belebt       0.73      0.80      0.76       260
    unbelebt       0.96      0.94      0.95      1311

    accuracy                           0.92      1571
   macro avg       0.85      0.87      0.86      1571
weighted avg       0.92      0.92      0.92      1571



In [8]:
# Generate HTML File with annotations from classifier and catma marked in colour
def generate_html(tokens, clf_labels, catma_labels, output_file="annotations.html"):
    html_start = """
    <html>
    <head>
        <style>
            .clf { text-decoration: underline; text-decoration-color: blue; text-decoration-thickness: 3px; }
            .catma { text-decoration: underline; text-decoration-color: red; text-decoration-thickness: 3px; }
            .both { text-decoration: underline; text-decoration-color: yellow; text-decoration-thickness: 3px; }
        </style>
    </head>
    <body>
    <div class="legend">
        <span class="clf">Classifier</span><br>
        <span class="catma">Catma</span><br>
        <span class="both">Both</span><br>
        <br>
        <br>
    </div>
    """
    html_end = "</body></html>"
    
    body_content = ""
    for token, clf_label, catma_label in zip(tokens, clf_labels, catma_labels):
        if clf_label == "belebt" and catma_label == "belebt":
            span = f'<span class="both">{token}</span>'
        elif clf_label == "belebt":
            span = f'<span class="clf">{token}</span>'
        elif catma_label == "belebt":
            span = f'<span class="catma">{token}</span>'
        else:
            span = token
        
        body_content += span + " "
    
    html_content = html_start + body_content + html_end
    
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)
    
    print(f"HTML-Datei wurde erstellt: {output_file}")



In [9]:
generate_html(tokens, clf_annotation_labels, catma_annotation_labels)


HTML-Datei wurde erstellt: annotations.html
