In [None]:
import json
import os

import graphviz
import networkx as nx
import numpy as np
import pandas as pd
import stanza
from graphviz import Source
from potato.dataset.dataset import Dataset
from potato.graph_extractor.extract import FeatureEvaluator
from potato.models.trainer import GraphTrainer
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from tuw_nlp.grammar.text_to_4lang import TextTo4lang
from tuw_nlp.graph.fourlang import FourLang
from tuw_nlp.text.pipeline import CachedStanzaPipeline, CustomStanzaPipeline

In [None]:
# General settings for the experiment.
# NOTE(review): none of these keys are read in the visible cells — confirm
# they are still needed.
config = {"lang": "en", "depth": 0, "substitute": False, "nr of samples": 1000}

# Root folder of the dataset; the "train" split is read from a sub-folder of it.
data = {"dir": "data"}

In [None]:
# Keys of the per-document JSON files read by create_input.
GOLD_ATTRIBUTES = "gold_attributes"
GOLD = "labels_gold"
ANNOTATED_ATTRIBUTES = "annotated_attributes"


def create_input(directory, attribute):
    """Build a sentence-level binary classification frame from JSON documents.

    Every regular file in ``directory`` is parsed as a JSON document. For each
    sentence in the document's ``"sens"`` mapping, the label is whether
    ``attribute`` is contained in the sentence's ``"gold_attributes"`` (when
    the document's ``"labels_gold"`` value is truthy) or in its
    ``"annotated_attributes"`` (otherwise).

    Files are visited in sorted name order so that the row positions of the
    returned frame are reproducible across machines and runs.

    Args:
        directory: Path of the folder holding the JSON documents.
        attribute: Attribute name to look for in each sentence.

    Returns:
        pandas.DataFrame with the columns "Text" and "Label", one row per
        sentence, using the default integer index.

    Raises:
        KeyError: If a document or sentence lacks one of the expected keys.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints),
            # which cannot be opened as documents.
            continue
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, "rt", encoding="utf-8") as f:
            doc = json.load(f)
        for sen in doc["sens"].values():
            if doc[GOLD]:
                lab = attribute in sen[GOLD_ATTRIBUTES]
            else:
                lab = attribute in sen[ANNOTATED_ATTRIBUTES]
            records.append((sen["text"], lab))
    return pd.DataFrame(data=records, columns=["Text", "Label"])



In [None]:
# Load the training split and label each sentence by whether it carries the
# "Planzeichen" attribute.
train_data = create_input(os.path.join(data["dir"], "train"), "Planzeichen")

# Drop the row with index label 6020. Remove if bug is fixed.
# NOTE(review): the label is the row's position in the freshly loaded frame,
# so it only identifies the intended sentence while the input files and the
# order they are read in stay unchanged.
train_data = train_data.drop([6020])

# Save the labels for later as 0/1 integers. This is done after the drop so
# that `labels` stays aligned row-for-row with `train_data`.
labels = train_data.Label * 1

train_data

In [None]:
# Pair every sentence with its label rendered as a string ("True"/"False").
sentences = [
    (text, str(label))
    for text, label in zip(train_data["Text"], train_data["Label"])
]

sentences

In [None]:
# Map the stringified boolean labels to class ids: "False" -> 0, "True" -> 1.
label_vocab = {str(flag): int(flag) for flag in (False, True)}
dataset = Dataset(sentences, label_vocab=label_vocab)

# Parse the dataset into graphs in "fourlang" format and attach them to it.
fourlang_graphs = dataset.parse_graphs(graph_format="fourlang")
dataset.set_graphs(fourlang_graphs)
dataset

In [None]:
# Convert the parsed dataset into a DataFrame; this frame is what the
# trainer and the evaluator in the following cells are given.
df = dataset.to_dataframe()
df

In [None]:
# Build a GraphTrainer on the dataset frame.
trainer = GraphTrainer(df)
# Extract features; the result is later indexed by label string (features["True"]).
features = trainer.prepare_and_train()


In [None]:
# Display the extracted features for inspection.
features

In [None]:
# Evaluator used in the next cell to match the extracted features against the
# dataset frame. FeatureEvaluator is imported in the notebook's import cell so
# that all dependencies are visible in one place.
evaluator = FeatureEvaluator()

In [None]:
# Match the features stored under the "True" key against the dataset frame.
# The returned frame's "Predicted label" column is what the following cells
# evaluate.
# NOTE(review): presumably features["True"] holds the rules learned for the
# positive class — confirm against GraphTrainer.prepare_and_train.
result = evaluator.match_features(df, features["True"])
result

In [None]:
# Normalise predictions: every row whose predicted label is not "True" is
# treated as a "False" prediction. Only the "Predicted label" column is
# written; assigning through a bare boolean row mask (result[mask] = ...)
# would overwrite every column of the matching rows with "False".
result.loc[result["Predicted label"] != "True", "Predicted label"] = "False"
result

In [None]:
# Ground truth and predictions as boolean arrays, compared row by row in
# positional order. The prediction is tested against the string "True"
# because bool() of any non-empty string (including "False") is True, which
# would turn every prediction into a positive.
y_true = train_data["Label"].astype(bool).to_numpy()
y_pred = (result["Predicted label"] == "True").to_numpy()

# scikit-learn metrics take (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and FP with FN.
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# labels=[False, True] keeps the matrix 2x2 even when one class is absent,
# so the four-way unpacking below always works.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[False, True]).ravel()
print(f"TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
print(f'Precision: {precision:5.2f}, Recall: {recall:5.2f}, F1: {f1:5.2f}')