# Sentiment Parsing Analysis

## Preliminaries

### Import I/O and confusion functions

In [None]:
%cd ..
import src.col_data as cd
import src.confusion as cf
import src.vocab as vcb
%cd -
import pickle
from typing import List, Tuple, NamedTuple, Dict
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import itertools as it
from colorama import Fore, Back, Style
%load_ext autoreload
%autoreload 2

## Colourful Labels

These are used for some matrices, but maybe more importantly when printing sentences to get _flat_ graphs.

In [None]:
# The sixteen colorama foreground colours; later cells address them by index.
fore = [
    Fore.BLACK,
    Fore.BLUE,
    Fore.CYAN,
    Fore.GREEN,
    Fore.LIGHTBLACK_EX,
    Fore.LIGHTBLUE_EX,
    Fore.LIGHTCYAN_EX,
    Fore.LIGHTGREEN_EX,
    Fore.LIGHTMAGENTA_EX,
    Fore.LIGHTRED_EX,
    Fore.LIGHTWHITE_EX,
    Fore.LIGHTYELLOW_EX,
    Fore.MAGENTA,
    Fore.RED,
    Fore.WHITE,
    Fore.YELLOW,
]
# The matching background colours, in the same order as `fore`.
back = [
    Back.BLACK,
    Back.BLUE,
    Back.CYAN,
    Back.GREEN,
    Back.LIGHTBLACK_EX,
    Back.LIGHTBLUE_EX,
    Back.LIGHTCYAN_EX,
    Back.LIGHTGREEN_EX,
    Back.LIGHTMAGENTA_EX,
    Back.LIGHTRED_EX,
    Back.LIGHTWHITE_EX,
    Back.LIGHTYELLOW_EX,
    Back.MAGENTA,
    Back.RED,
    Back.WHITE,
    Back.YELLOW,
]
# Print every colour next to its list index (backgrounds with white text).
print(" ".join(f"{colour}{idx}{Style.RESET_ALL}" for idx, colour in enumerate(fore)))
print(" ".join(f"{colour}{Fore.WHITE}{idx}{Style.RESET_ALL}" for idx, colour in enumerate(back)))

In [None]:
# Maps every arc label to the colour code(s) used when printing it.
lbl_colors = {"targ": fore[15] + back[0], "holder": fore[3] + back[0]}

# Plain expression labels are shown with a coloured background.
d = {
    "exp-None": back[2],
    "exp-Positive": back[1],
    "exp-Negative": back[13],
}
# Lower-case polarity names reuse the colours of their capitalised twins.
d.update({
    "exp-positive": d["exp-Positive"],
    "exp-negative": d["exp-Negative"],
    "exp-neutral": d["exp-None"],
    "exp-conflict": back[12],
})
lbl_colors.update(d)

# "IN:" labels swap the roles: foreground colour for expressions,
# coloured background with black text for targets and holders.
d = {
    "IN:targ": back[15] + fore[0],
    "IN:holder": back[3] + fore[0],
    "IN:exp-None": fore[2],
    "IN:exp-Positive": fore[1],
    "IN:exp-Negative": fore[13],
}
d.update({
    "IN:exp-positive": d["IN:exp-Positive"],
    "IN:exp-negative": d["IN:exp-Negative"],
    "IN:exp-neutral": d["IN:exp-None"],
    "IN:exp-conflict": fore[12],
})
lbl_colors.update(d)
# Preview: every label rendered in its own colour.
print(" ".join(colour + name + Style.RESET_ALL for name, colour in lbl_colors.items()))

### Set the Main Parameters

In [None]:
# Root directory of the gold sentiment graphs (one sub-directory per language).
data = "../data/sent_graphs/"
# Root directory of the experiment outputs (language/flavour/run).
experiments = "../experiments/"
# Run identifiers "1".."5", used as directory names below.
runs = [str(i) for i in range(1,6)]
# Languages are whatever directories exist under `experiments`.
languages = !ls $experiments
# Flavours are read from the first language's directory only.
# NOTE(review): this presumes every language has the same flavours - confirm.
x = languages[0]
flavours = !ls $experiments/$x
# File names of the data splits and of the corresponding prediction files.
dev = "dev.conllu"
test = "test.conllu"
train = "train.conllu"
dev_pred = dev + ".pred"
test_pred = test + ".pred"

In [None]:
# Display the detected languages and flavours as the cell output.
languages, flavours

#### Is everything where it should be?

In [None]:
# List the gold data directory of every language/flavour pair and the
# experiment directory of each of its runs, so missing files are easy to spot.
for l in languages:
    for f in flavours:
        print(l,f, end="\n\t")
        !ls $data/$l/$f
        for r in runs:
            print(r, end ="")
            !ls $experiments/$l/$f/$r

#### Create Vocabs per language

In [None]:
# Build one vocabulary bundle per language from the dev/test/train files of
# ALL flavours and pickle it to "<language>_vocabs.pickle" in the working directory.
for l in languages:
    sentences = []
    for f in flavours:
        # `data` already ends with "/", so the joined path contains "//"
        path = "/".join([data, l, f]) + "/"
        for fn in ["dev.conllu", "test.conllu", "train.conllu"]:
            sentences.extend(cd.read_col_data(path + fn))
    forms, norms, lemmas, uposs, xposs, synrels, semrels, chars, scoperels = vcb.make_vocabs(sentences)
    # print the size of each vocabulary as a quick sanity check
    print([len(v.w2i) for v in [forms, norms, lemmas, uposs, xposs, synrels, semrels, chars, scoperels]])
    vocabs = vcb.Vocabs(forms, norms, lemmas, uposs, xposs, synrels, semrels, chars, scoperels)
    with open(f"{l}_vocabs.pickle", "wb") as fh:
        pickle.dump(vocabs, fh)

## Analysis

### Data itself

- how many arcs are there? per label? per POS pair? per dependency?
- how long are they?
- overlap between flavours

### Results

- how many arcs?
- how long?
- overlap between flavours
- precision / recall / fscore per label / POS pair / dependencies / arc length
- which schemes are best for which label?


In [None]:
!ls 

In [None]:
def load_vocab(fn: str) -> vcb.Vocabs:
    """Unpickle and return the object stored in the file *fn*.

    The notebook uses this to read back the "<language>_vocabs.pickle"
    files it wrote itself; the file is opened in binary mode.
    """
    with open(fn, "rb") as handle:
        return pickle.load(handle)

In [None]:
%matplotlib inline

In [None]:
# Colour names handed to matplotlib, picked by position in the plots below.
colours = [
    'tab:blue',
    'tab:orange',
    'tab:green',
    'tab:red',
    'tab:purple',
    'tab:brown',
    'tab:pink',
    'tab:gray',
    'tab:olive',
    'tab:cyan',
]

In [None]:
# little helper for plotting
_1 = lambda x: 1 if x % 2 == 1 else -1

#### Simple Lengths

Mostly uninteresting for now...

In [None]:
# For every language: count, per flavour and per arc label, how often each
# arc length occurs in train+dev+test, then prepare one figure per label.
width = 1/8
for l in languages:
    plots = {}  # label id -> list of (lengths, counts, flavour)
    # label id -> [smallest length seen, largest length seen] across flavours
    min_maxes = defaultdict(lambda : [float("inf"), -float("inf")])
    for i, fl in enumerate(flavours):
        xs = "/".join([data, l, fl])
        # the vocabulary only depends on the language; it is reloaded per flavour
        v = load_vocab(l + "_vocabs.pickle")

        sentences = []
        for tdt in [train, dev, test]:
            sentences.extend(cd.read_col_data(xs + "/" + tdt))

        d = {}  # label id -> {length: count}
        for sentence in sentences:
            m = sentence.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
            for src, tgt in zip(*np.where(m != 0)):
                # arcs leaving row 0 get length 0
                # NOTE(review): row 0 is presumably the artificial root - confirm
                if src == 0:
                    lngth = 0
                else:
                    # signed length: positive when the row index is larger
                    lngth = src - tgt
                lbl = m[src, tgt]
                if lbl not in d:
                    d[lbl] = {}
                if lngth not in d[lbl]:
                    d[lbl][lngth] = 0
                d[lbl][lngth] += 1
        for lbl in d:
            #print(v.scoperels.i2w[lbl])
            # (length, count) pairs, most frequent length first
            xy = sorted(d[lbl].items(), key=lambda x: -x[1])
            #print(xy)
            # note: `xs` is reused here and no longer holds the directory path
            xs, ys = zip(*xy)
            min_maxes[lbl][0] = min([min(xs), min_maxes[lbl][0]])
            min_maxes[lbl][1] = max([max(xs), min_maxes[lbl][1]])
            if lbl not in plots:
                plots[lbl] = []
            plots[lbl].append((xs, ys, fl))
    for lbl in plots:
        fig, ax = plt.subplots(figsize=(19, 6))
        ax.set_title(l + " " + v.scoperels.i2w[lbl])
        min_max = min_maxes[lbl]
        # maps a length to its position on an axis spanning min..max length
        myrange = {i: j for j, i in enumerate(range(min_max[0], min_max[1] + 1))}
        ax.set_xticks(np.arange(min_max[0], min_max[1]+1))
        ax.set_xticklabels(np.arange(min_max[0], min_max[1]+1), rotation=45)
        _xs = np.arange(min_max[0], min_max[1]+1)
        #ax.set_xticklabels(max_ti.cks, rotation=45)
        for i, (xs, ys, fl) in enumerate(plots[lbl]):
            xs = np.array(xs)
            #_xs = np.zeros(len(myrange))
            # dense count vector over the whole length range (0 where unseen)
            _ys = np.zeros(len(myrange))
            xs, ys = zip(*(sorted(zip(xs, ys), key=lambda x: x[1])))
            for x,y in zip(xs, ys):
                j = myrange[x]
                _ys[j] = y
            # the bar call is disabled, so `_ys` is computed but nothing is drawn
            #plt.bar(_xs + (i * width/2 * _1(i)), _ys, width/2, color=colours[i], alpha=0.7, linewidth=0.8, edgecolor=colours[i], align="center", label=fl)
        # with the bars disabled there are no labelled artists for the legend
        ax.legend()
        fig.tight_layout()
        #plt.show()
    

## Compare Runs for each Flavour

### What is easy & what is difficult?

In [None]:
# Display every "<run>/<split file>" combination for the test and dev splits.
list("/".join(i) for i in it.product(runs, [test, dev]))

In [None]:
class Arc(NamedTuple):
    """A labelled arc of a sentence graph together with the tokens it connects."""
    index: int  # running number of the sentence within the data that was read
    sen: cd.Sentence  # the sentence the arc belongs to
    w_i: cd.Token  # token fetched as sen[i-1]
    w_j: cd.Token  # token fetched as sen[j-1]
    i: int  # row index of the arc in the sentence matrix
    j: int  # column index of the arc in the sentence matrix
    lbl: str  # label name, or "-Any-" for unlabelled matches

In [None]:
def get_arcs(index, sentence: cd.Sentence, m: np.ndarray, i2w: Dict[float, str]) -> List[Arc]:
    """Return one Arc for every non-zero cell of the matrix *m*.

    index:    value stored in each Arc's ``index`` field.
    sentence: sentence the matrix was built from; tokens are looked up as
              ``sentence[i-1]`` / ``sentence[j-1]``.
    m:        matrix whose non-zero entries are label ids.
    i2w:      maps a label id to its label name.
    """
    arcs = []
    for i, j in zip(*np.nonzero(m)):
        label = i2w[m[i, j]]
        arcs.append(Arc(index, sentence, sentence[i-1], sentence[j-1], i, j, label))
    return arcs

In [None]:
# overlap
# cell entries should match
# should be nonzero for unlabelled
# i,j == j,i for undirected
def overlap(index: int,
            sen1: cd.Sentence, m1: np.ndarray,
            sen2: cd.Sentence, m2: np.ndarray,
            i2w: Dict[float, str],
            labelled: bool = True, direction: bool = True
           ) -> List[Arc]:
    """Return the arcs that the two matrices *m1* and *m2* have in common.

    Both matrices must have the same shape. A cell counts as an arc when it
    is non-zero; with ``direction=False`` only cells with ``j >= i`` are
    visited and the mirrored cell ``[j, i]`` is used when ``[i, j]`` is empty.
    Arcs with equal label ids are reported with their label name; with
    ``labelled=False`` arcs whose labels differ are reported as "-Any-".
    Tokens are taken from *sen1*.
    """
    assert m1.shape == m2.shape
    n_rows, n_cols = m1.shape

    def cell(matrix, i, j):
        # label id at [i, j]; for undirected matching fall back to [j, i]
        if matrix[i, j]:
            return matrix[i, j]
        if matrix[j, i] and not direction:
            return matrix[j, i]
        return None

    shared = []
    for i in range(n_rows):
        first_col = 0 if direction else i
        for j in range(first_col, n_cols):
            a = cell(m1, i, j)
            if a is None:
                continue
            b = cell(m2, i, j)
            if b is None:
                continue
            if a == b:
                shared.append(Arc(index, sen1, sen1[i-1], sen1[j-1], i, j, i2w[a]))
            elif not labelled:
                # both cells hold an arc, only the labels disagree
                shared.append(Arc(index, sen1, sen1[i-1], sen1[j-1], i, j, "-Any-"))
    return shared


In [None]:
def print_arcs(arcs, mode: str = "lbl"):
    """Print the five most frequent arc types among *arcs*.

    mode selects what an arc is grouped by: "lbl" (its label), or the pair of
    "upos", "lemma" or "deprel" values of its two tokens. Any other mode
    raises ValueError as soon as the first arc is processed.
    """
    extractors = {
        "lbl": lambda arc: arc.lbl,
        "upos": lambda arc: (arc.w_i.upos, arc.w_j.upos),
        "lemma": lambda arc: (arc.w_i.lemma, arc.w_j.lemma),
        "deprel": lambda arc: (arc.w_i.deprel, arc.w_j.deprel),
    }
    tally = {}
    for arc in arcs:
        if mode not in extractors:
            raise ValueError(f"no such {mode} implemented")
        key = extractors[mode](arc)
        tally[key] = tally.get(key, 0) + 1
    ranked = sorted(tally.items(), key=lambda item: -item[1])
    print(ranked[:5])

In [None]:
def eval_overlapping(lang: str, cache):
    """Print arc statistics for every flavour combination stored in *cache*.

    lang:  unused; kept so existing calls keep working. (The vocabulary that
           used to be unpickled here was never read.)
    cache: dict mapping ``(sentence index, combo)`` to a list of arcs, where
           ``combo`` is a tuple of flavour names.

    For each combination size 1..len(flavours) the size is printed, then for
    each combination and each arc label (fewest arcs first) the top-5 counts
    by label, upos pair, deprel pair and lemma pair.
    """
    for size in range(1, len(flavours) + 1):
        # combo -> label -> arcs, restricted to combinations of this size
        by_combo = {}
        for (_, combo), arcs in cache.items():
            if len(combo) != size:
                continue
            for arc in arcs:
                by_combo.setdefault(combo, {}).setdefault(arc.lbl, []).append(arc)
        print(size)
        for combo in sorted(by_combo, key=len):
            print(combo)
            by_label = by_combo[combo]
            for label in sorted(by_label, key=lambda name: len(by_label[name])):
                print(label)
                for mode in ("lbl", "upos", "deprel", "lemma"):
                    print_arcs(by_label[label], mode)

In [None]:
def get_overlapping(lang: str,
                    datasplits: List[str],
                    labelled: bool = True,
                    direction: bool = True
                    ):
    """Get overlapping arcs for all flavour combinations: what do flavours have in common?

    lang:       language whose gold data and pickled vocabulary are read.
    datasplits: file names (e.g. the dev split) read from each flavour directory.
    labelled, direction: forwarded to `overlap` for two-flavour combinations.

    Returns a dict mapping ``(sentence index, combo)`` to a list of arcs,
    where ``combo`` is a tuple of flavour names.
    """
    v = load_vocab(lang + "_vocabs.pickle")
    # flavour -> list of (sentence, labelled scope matrix)
    sentences = {}
    for fl in flavours:
        xs = "/".join([data, lang, fl])
        sentences[fl] = []
        for ds in datasplits:
            sentences[fl].extend([(sen, sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i))
                                    for sen in cd.read_col_data(xs + "/" + ds)])

    cache = {}  # stores overlaps for sentences so that only those need to be compared
    i2w = v.scoperels.i2w
    # combinations are processed by increasing size so that the result for a
    # combination's prefix (all but its last flavour) is already in the cache
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            # zip stops at the shortest sentence list among the flavours
            for mi, smss in enumerate(zip(*(sentences[c] for c in combo))):
                sss, mss = zip(*smss)
                if len(combo) == 1:
                    cache[(mi, combo)] = get_arcs(mi, sss[0], mss[0], i2w)
                elif len(combo) == 2:
                    cache[(mi, combo)] = overlap(mi, sss[0], mss[0], sss[1], mss[1], i2w, labelled, direction)
                elif len(combo) > 2 and (mi, combo[:-1]) in cache:
                    # compute overlap for the arcs in cache
                    # only the last flavour's matrix is checked, cell by cell,
                    # against the cached arc's label; `labelled` and
                    # `direction` are not consulted in this branch
                    m = mss[-1]
                    os = []
                    for arc in cache[(mi, combo[:-1])]:
                        if i2w[m[arc.i, arc.j]] == arc.lbl:
                            os.append(arc)
                    # empty overlaps are not stored, so longer combinations
                    # built on this one are skipped for this sentence
                    if os:
                        cache[(mi, combo)] = os
                else:
                    # do nothing
                    pass
    return cache

###

# Print the overlap statistics of the dev split for every language.
for lang in languages:
    print(lang)
    eval_overlapping(lang, get_overlapping(lang, [dev]))

### Confusion Matrix-based Measures

The interesting ones here are **true positives, false positives, false negatives, false labels** and the derived **precision, recall, f-score**.

In [None]:
def get_confusion(lang: str,
                    pred_path: str,
                    gold_path: str,
                    datasplits: List[str],
                    labelled: bool = True,
                    direction: bool = True
                    ):
    """Compute evaluation counts and the confusion matrix for every flavour.

    lang:       language name; its pickled vocabulary is loaded.
    pred_path:  directory holding ``<flavour>/1/<split>.pred`` (only run 1 is read).
    gold_path:  directory holding ``<flavour>/<split>``.
    datasplits: split file names to read and concatenate.
    labelled, direction: accepted for interface compatibility; not used.

    Returns a dict mapping each flavour to ``(counts, confusion)`` where
    ``confusion`` is the result of ``cf.confuse`` and ``counts`` is an integer
    array with one row per index ``4 .. len(confusion) - 1`` holding
    ``cf.fscore(i, confusion)[:-3]``. Nothing is printed.
    """
    v = load_vocab(lang + "_vocabs.pickle")
    w2i = v.scoperels.w2i
    i2w = v.scoperels.i2w

    def read_matrices(path):
        # one labelled "scope" matrix per sentence of the file
        return [sen.make_matrix("scope", label=True, w2i=w2i)
                for sen in cd.read_col_data(path)]

    results = {}
    for fl in flavours:
        pred_dir = "/".join([pred_path, fl])
        gold_dir = "/".join([gold_path, fl])
        pms, gms = [], []
        for ds in datasplits:
            pms.extend(read_matrices(pred_dir + "/1/" + ds + ".pred"))
            gms.extend(read_matrices(gold_dir + "/" + ds))
        c = cf.confuse(tuple(gms), tuple(pms), i2w)
        # NOTE(review): starting at index 4 presumably skips special vocabulary
        # entries (other cells slice the label list with [4:]) - confirm
        rows = [cf.fscore(i, c)[:-3] for i in range(4, len(c))]
        results[fl] = (np.array(rows, dtype=int), c)

    return results

###
# save the results and confusion matrices in separate dictionaries
# both are indexed as [language][flavour]; dev and test are evaluated together
all_results = {}
all_confusions = {}
for lang in languages:
    print(lang)
    all_results[lang] = {}
    all_confusions[lang] = {}
    results = get_confusion(lang, "/".join([experiments, lang]), "/".join([data, lang]), [dev, test])
    for flavour in results:
        # results[flavour] is the pair (evaluation counts, confusion matrix)
        all_results[lang][flavour] = results[flavour][0]
        all_confusions[lang][flavour] = results[flavour][1]

#### Two types of flavours
Two types of flavours to make some comparisons easier.

In [None]:
# Split the flavours by whether their name contains "inside".
general_flavours = [fl for fl in flavours if "inside" not in fl]
inside_flavours = [fl for fl in flavours if "inside" in fl]

Compare flavours pairwise for each language and take the difference between their evaluation counts.
The matrices are colour-coded according to the labels they represent.
This way we get an idea on how they directly compare to each other.
Comparing `general_flavours` with `inside_flavours` can be confusing due to their different label sets.

In [None]:
# For every pair of flavours print the row-wise difference of their
# evaluation counts, each row coloured by the label it belongs to.
for lang in languages:
    v = load_vocab(lang + "_vocabs.pickle")
    # label names without the first four vocabulary entries
    labels = list(v.scoperels.w2i.keys())[4:]
    print(lang)
    for a,b in it.combinations(flavours, 2):
        print(a, " || ", b)
        m = all_results[lang][a] - all_results[lang][b]
        for i in range(len(m)):
            # rows without any difference are not shown
            if (m[i] == 0).all():
                continue
            # NOTE(review): a label missing from lbl_colors raises KeyError here
            print("\t" , lbl_colors[labels[i]] + str(m[i]) + Style.RESET_ALL)
        # column sums of the differences over all labels
        print("\t", np.sum(m, 0), end="\n\n")

In [None]:
def colored_sentence(sentence, l2c):
    """Render *sentence* as one line of text with its graph encoded by colour.

    sentence: iterable of tokens offering ``form`` and ``scope``; ``scope``
              is a sequence of (head, label) pairs or something falsy.
    l2c:      maps a label to the colour code placed in front of the form.

    A token with one scope entry is printed in that label's colour; a token
    with several entries is printed once per label as "(a|b) "; a token
    without scope is printed as is. Every token is followed by a space.
    """
    pieces = []
    for token in sentence:
        scope = token.scope
        if not scope:
            pieces.extend((token.form, " "))
        elif len(scope) > 1:
            variants = [l2c[label] + token.form + Style.RESET_ALL for _, label in scope]
            pieces.append("(")
            pieces.append("|".join(variants))
            pieces.append(") ")
        else:
            pieces.append(l2c[scope[0][1]] + token.form + Style.RESET_ALL)
            pieces.append(" ")
    return "".join(pieces)

# should probably be used to print into files, instead of blowing up the notebook
# NOTE(review): `sentence` is not assigned in this cell; it is whatever an
# earlier cell left in the notebook namespace (e.g. a loop variable), so
# "test.out" only ever contains that single sentence.
with open("test.out", "w") as f:
    print(colored_sentence(sentence, lbl_colors), file=f)

### More fun with evaluation counts

In [None]:
def get_tpfpfnfl(gold: List[cd.Sentence], pred: List[cd.Sentence], w2i: Dict[str, float], i2w: Dict[float, str]):
    """Sort every arc of the gold/predicted sentence pairs into an evaluation category.

    Returns a dict with the keys "tp", "fp", "fn" and "fl", each a set of Arc:
      tp - same non-zero label in gold and prediction (gold label stored)
      fp - arc only in the prediction (predicted label stored)
      fn - arc only in gold (gold label stored)
      fl - arc in both but with different labels (predicted label stored)
    Tokens and the sentence stored in each Arc come from the gold sentence.
    """
    results = {kind: set() for kind in ("tp", "fp", "fn", "fl")}
    for sid, (gs, ps) in enumerate(zip(gold, pred)):
        gm = gs.make_matrix("scope", label=True, w2i=w2i)
        pm = ps.make_matrix("scope", label=True, w2i=w2i)
        n_rows, n_cols = gm.shape
        assert gm.shape == pm.shape
        for i in range(n_rows):
            for j in range(n_cols):
                g, p = gm[i, j], pm[i, j]
                if g == 0 and p == 0:
                    # no arc on either side
                    continue
                if g == p:
                    kind, label_id = "tp", g
                elif g == 0:
                    kind, label_id = "fp", p
                elif p == 0:
                    kind, label_id = "fn", g
                else:
                    kind, label_id = "fl", p
                results[kind].add(Arc(sid, gs, gs[i-1], gs[j-1], i, j, i2w[label_id]))
    return results

Get `tpfpfnfl` for all languages and flavours

In [None]:
# tpfpfnfl[language][flavour] -> {"tp"/"fp"/"fn"/"fl": set of Arc},
# computed on dev+test with the predictions of run 1.
tpfpfnfl = {}
for lang in languages:
    tpfpfnfl[lang] = {}
    v = load_vocab(lang + "_vocabs.pickle")
    # not used inside this loop
    labels = list(v.scoperels.w2i.keys())[4:]
    print(lang)
    pred_path = "/".join([experiments, lang])
    gold_path = "/".join([data, lang])
    datasplits = [dev, test]
    for fl in flavours:
        print("\t", fl)
        xs = "/".join([pred_path, fl])
        gxs = "/".join([gold_path, fl])
        pred = []
        gold = []
        for ds in datasplits:
            # predictions are always taken from run directory "1"
            pred.extend([sen for sen in cd.read_col_data(xs + "/1/" + ds + ".pred")])
            gold.extend([sen for sen in cd.read_col_data(gxs + "/" + ds)])
        tpfpfnfl[lang][fl] = get_tpfpfnfl(gold, pred, v.scoperels.w2i, v.scoperels.i2w)

In [None]:
def get_counts(arc_set, combo, comboverlap):
    """Count arcs per evaluation category for each flavour of *combo* and for their overlap.

    arc_set:     flavour -> {"tp"/"fp"/"fn"/"fl": collection of arcs}; an arc
                 is a tuple ``(sid, src, tgt)`` whose src/tgt are tuples
                 ``(index, form, upos, xpos, deprel)``.
    combo:       tuple of flavour names.
    comboverlap: combo -> {"tp"/"fp"/"fn"/"fl": arcs shared by all flavours of the combo}.

    Returns ``results[criterion][category][key]`` where criterion is one of
    "total", "form", "upos", "xpos", "deprel", category one of "tp", "fp",
    "fn", "fl", and key either a single flavour or *combo* itself. "total"
    holds the number of arcs; the other criteria hold a Counter over
    (source value, target value) pairs.
    """
    # we lose the arc label since we want to compare across labels
    # position of each attribute inside the src/tgt tuples
    attr_pos = {"form": 1, "upos": 2, "xpos": 3, "deprel": 4}
    results = {criterion: {} for criterion in ["total", *attr_pos]}

    def record(category, key, arcs):
        # store the total and the per-attribute pair counts of `arcs`
        results["total"][category][key] = len(arcs)
        for criterion, pos in attr_pos.items():
            results[criterion][category][key] = Counter(
                (arc[1][pos], arc[2][pos]) for arc in arcs
            )

    for x in "tp fp fn fl".split():
        for criterion in results:
            results[criterion][x] = {}
        for a in combo:
            record(x, a, arc_set[a][x])
        # the combination's total is the size of the overlap, not the size of
        # the last flavour's arc set
        record(x, combo, comboverlap[combo][x])
    return results


In [None]:
arc_sets = {} # general arcs as tuples of sid, src, tgt
# src/tgt consist of the index, form, upos, xpos, and deprel
comboverlap = {} # how do combinations of flavours overlap?
for lang in languages:
    comboverlap[lang] = {}
    arc_sets[lang] = {}
    print(lang)
    for flavour in flavours:
        arc_sets[lang][flavour] = {x: set() for x in "tp fp fn fl".split()}
        for x in tpfpfnfl[lang][flavour]:
            # drop the label and the sentence object so arcs of different
            # flavours compare equal when position and token attributes match
            arc_sets[lang][flavour][x] = set((arc.index, (arc.i, arc.w_i.form, arc.w_i.upos, arc.w_i.xpos, arc.w_i.deprel), (arc.j, arc.w_j.form, arc.w_j.upos, arc.w_j.xpos, arc.w_j.deprel)) for arc in tpfpfnfl[lang][flavour][x])
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            comboverlap[lang][combo] = {x: set() for x in "tp fp fn fl".split()}
            #print(combo)
            for x in comboverlap[lang][combo]:
                # arcs of category x that every flavour of the combination shares
                comboverlap[lang][combo][x] = arc_sets[lang][combo[0]][x].intersection(*(arc_sets[lang][ci][x] for ci in combo[1:]))
                #print(f"{x}: " + " ".join([f"{len(arc_sets[lang][a][x])}" for a in combo]) + f" overlap: {len(comboverlap[lang][combo][x])}")
                #print(f"{x}: " + " ".join([f"{len(arc_sets[lang][a][x])}" for a in combo]) + f" overlap: {len(comboverlap[lang][combo][x])}")


In [None]:
def plot_tpfpfnfl(title, tps, fps, fns, fls, lngth=10, dominator="total"):
    """Draw grouped bars of tp/fp/fn/fl counts for the most frequent keys.

    title:     figure title.
    tps, fps, fns, fls: Counters with the counts of each evaluation category.
    lngth:     maximum number of keys shown on the x axis.
    dominator: which counts pick and order the keys: "tp", "fp", "fn", "fl",
               or "total" for the summed mistakes (fp + fn + fl). Any other
               value raises KeyError.

    The x axis is sized by the number of keys actually available, so Counters
    with fewer than *lngth* distinct keys can be plotted as well.
    """
    width = 1/4
    tpfpfnfls = {"tp": tps, "fp": fps, "fn": fns, "fl": fls}
    if dominator == "total":
        # dominator total looks at the total amount of mistakes
        ranking = Counter()
        for cnt in (fps, fns, fls):
            ranking.update(cnt)
    else:
        ranking = tpfpfnfls[dominator]
    lbls = [key for key, _ in ranking.most_common(lngth)]

    fig, ax = plt.subplots(figsize=(19, 6))
    ax.set_title(title)
    # one tick per available key; ticks, tick labels and bar heights must agree
    xs = np.arange(len(lbls))
    ax.set_xticks(xs)
    ax.set_xticklabels(lbls, rotation=45)
    for i, x in enumerate("tp fp fn fl".split()):
        ys = [tpfpfnfls[x][y] for y in lbls]
        # shift each category sideways so the four bars of a key sit next to each other
        plt.bar(xs + ((1+i) * width/2) - width, ys, width/2, color=colours[i], alpha=0.7, linewidth=0.8, edgecolor=colours[i], align="center", label=x)

    ax.legend()
    fig.tight_layout()
    plt.show()

#### Plot TpFpFnFl
Plot for each language and flavour (combination) their (shared) evaluation counts.

In [None]:
# One figure per language and flavour combination, showing the counts that
# the flavours of the combination share, grouped by deprel pair.
for lang in languages:
    print(lang)
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            print(combo)
            results = get_counts(arc_sets[lang], combo, comboverlap[lang])
            #for x in "tp fp fn fl".split():
                #print(x)
                #print(results["upos"][x][combo].most_common(5))
            # criterion to plot; get_counts also provides "form", "upos" and "xpos"
            category = "deprel"
            plot_tpfpfnfl(lang + " " + str(combo), results[category]["tp"][combo],
                          results[category]["fp"][combo],
                          results[category]["fn"][combo], results[category]["fl"][combo],
                          lngth=10, dominator="total")

### How is flavour A better than flavour B?

For each flavour pair (A, B), look at the true positives of A and count how many of those arcs fall into each evaluation count of B, or have no equivalent arc in B at all.

In [None]:
def flavour_com_pair(d1, d2):
    """Relate the true positives of each flavour to the other flavour's categories.

    d1, d2: dicts with the keys "tp", "fp", "fn" and "fl", each a collection of arcs.

    Returns ``(diffs1, diffs2)``. ``diffs1[x]`` lists the arcs of ``d1["tp"]``
    that also occur in ``d2[x]`` for x in tp/fp/fn/fl, and ``diffs1["None"]``
    lists those that occur in none of d2's categories; ``diffs2`` is the same
    with the roles swapped. The sizes of all lists are printed.

    Unlike `flavour_compair`, an arc found in several categories of the other
    flavour is listed under each of them.
    """
    categories = "tp fp fn fl".split()

    def relate(mine, theirs):
        found = {x: [arc for arc in mine["tp"] if arc in theirs[x]] for x in categories}
        # the category loop has to enclose the membership test; the earlier
        # nested comprehension looked up `x` outside of its own loop
        found["None"] = [arc for arc in mine["tp"]
                         if not any(arc in theirs[x] for x in categories)]
        return found

    diffs1 = relate(d1, d2)
    diffs2 = relate(d2, d1)
    print([(x, len(y)) for x,y in diffs1.items()])
    print([(x, len(y)) for x,y in diffs2.items()])
    return diffs1, diffs2

In [None]:
def flavour_compair(d1, d2):
    """Classify each flavour's true positives by where they show up in the other flavour.

    d1, d2: dicts with the keys "tp", "fp", "fn" and "fl", each a collection of arcs.

    Every arc of ``d1["tp"]`` is put into exactly one bucket of the first
    result: the first of "tp", "fp", "fn", "fl" whose collection in d2
    contains it, or "None" when none does. The second result does the same
    for ``d2["tp"]`` against d1. The bucket sizes are printed and both
    results are returned.
    """
    categories = "tp fp fn fl".split()
    outcome = []
    for mine, theirs in ((d1, d2), (d2, d1)):
        buckets = {name: [] for name in "tp fp fn fl None".split()}
        for arc in mine["tp"]:
            # first category of the other flavour that contains the arc
            hit = next((c for c in categories if arc in theirs[c]), "None")
            buckets[hit].append(arc)
        outcome.append(buckets)
    diffs1, diffs2 = outcome
    print([(name, len(arcs)) for name, arcs in diffs1.items()])
    print([(name, len(arcs)) for name, arcs in diffs2.items()])
    return diffs1, diffs2

In [None]:
# differences between pairs only maybe...
# Prints, for every pair of flavours, how the true positives of one flavour
# are categorised in the other. The return value of flavour_compair is
# discarded, so pair_diffs[lang] stays empty.
pair_diffs = {}
for lang in languages:
    print(lang)
    pair_diffs[lang] = {}
    for fl1, fl2 in it.combinations(flavours, 2):
        print(fl1, fl2)
        d1 = arc_sets[lang][fl1]
        d2 = arc_sets[lang][fl2]
        flavour_compair(d1, d2)
    print()


### Comparison between flavours by head comparison

Comparing flavours is difficult, as they generally differ in their sources.
Taking away sources and label-prefixes (`IN:`) throws away some information, but might allow for more comparability.

In [None]:
class FlatArc(NamedTuple):
    """An arc reduced to its label and one of its tokens; the other end is dropped."""
    index: int  # running number of the sentence
    sid: int  # the sentence's own id attribute
    form: str  # form of the kept token
    lemma: str  # lemma of the kept token
    upos: str  # upos of the kept token
    xpos: str  # xpos of the kept token
    deprel: str  # deprel of the kept token
    j: int  # position of the kept token
    lbl: str  # arc label with a leading "IN:" removed

In [None]:
# Flatten every evaluated arc: keep only the sentence, the token w_j and the
# label (without its "IN:" prefix).
# flat_tpfpfnfl[language][flavour][category] is a list of FlatArc.
eval_cnts = "tp fp fn fl".split()
flat_tpfpfnfl = {}
for lang in languages:
    print(lang)
    flat_tpfpfnfl[lang] = {}
    for fl in flavours:
        print(fl)
        flat_tpfpfnfl[lang][fl] = {}
        for x in eval_cnts:
            flat_tpfpfnfl[lang][fl][x] = []
            for arc in tpfpfnfl[lang][fl][x]:
                index = arc.index
                sid = arc.sen.id
                form = arc.w_j.form
                lemma = arc.w_j.lemma
                upos = arc.w_j.upos
                xpos = arc.w_j.xpos
                deprel = arc.w_j.deprel
                j = arc.j
                lbl = arc.lbl[3:] if arc.lbl.startswith("IN:") else arc.lbl
                flat_tpfpfnfl[lang][fl][x].append(FlatArc(index, sid, form, lemma, upos, xpos, deprel, j, lbl))
        # category sizes before and after flattening (lists keep duplicates)
        print([(k, len(v)) for k,v in tpfpfnfl[lang][fl].items()])
        print([(k, len(v)) for k,v in flat_tpfpfnfl[lang][fl].items()])
                
        

In [None]:
def plot_diffs(c1):
    """Plot the non-zero values of the mapping *c1* as a line over its keys.

    Entries whose value equals 0 are left out; the remaining keys become the
    x tick labels in the mapping's own order.
    """
    nonzero = {}
    for key, value in c1.items():
        if value != 0:
            nonzero[key] = value
    positions = np.arange(len(nonzero))
    fig, ax = plt.subplots(figsize=(19, 6))
    #ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(nonzero.keys(), rotation=45)
    plt.plot(positions, nonzero.values(), "-")
    plt.show()

In [None]:
# differences between pairs only maybe...
# Same pairwise comparison as before, but on the flattened arcs; for the
# "fp" and "fn" buckets the per-deprel counts of both directions are
# subtracted from each other and plotted.
pair_diffs = {}
for lang in languages:
    print(lang)
    pair_diffs[lang] = {}
    for fl1, fl2 in it.combinations(flavours, 2):
        print(fl1, fl2)
        d1 = flat_tpfpfnfl[lang][fl1]
        d2 = flat_tpfpfnfl[lang][fl2]
        diffs1, diffs2 = flavour_compair(d1, d2)
        ###
        for x in "fp fn".split():
            print(x)
            c1 = Counter([arc.deprel for arc in diffs1[x]])
            c2 = Counter([arc.deprel for arc in diffs2[x]])
            # positive values: more frequent for fl1, negative: for fl2
            c1.subtract(c2)
            print(c1)
            plot_diffs(c1)
        ###
    print()

In [None]:
def get_flat_arcs(index, sen):
    """Return one FlatArc per scope entry of every token in *sen*.

    index: value stored in each FlatArc's ``index`` field.
    sen:   sentence offering ``id`` and iteration over its tokens; a token's
           ``scope`` is a sequence of (head, label) pairs or something falsy.

    The head of each entry is ignored, a leading "IN:" is removed from the
    label, and ``j`` is the token's position as counted by ``enumerate``
    (starting at 0).
    """
    sid = sen.id
    arcs = []
    for position, token in enumerate(sen):
        token_fields = (token.form, token.lemma, token.upos, token.xpos, token.deprel)
        for _head, raw_label in token.scope or ():
            label = raw_label[3:] if raw_label.startswith("IN:") else raw_label
            arcs.append(FlatArc(index, sid, *token_fields, position, label))
    return arcs

In [None]:
# Look at one hand-picked language and flavour pair in detail.
lang = "norec"
fl1 = "head_final"
fl2 = "head_final-inside_label-dep_edges"
d1 = flat_tpfpfnfl[lang][fl1]
d2 = flat_tpfpfnfl[lang][fl2]
# category sizes of both flavours
print([(k, len(v)) for k,v in d1.items()])
print([(k, len(v)) for k,v in d2.items()])
diffs1, diffs2 = flavour_compair(d1, d2)
# every true positive lands in exactly one bucket, so these sums equal the
# number of true positives of fl1 and fl2
print(sum([len(v) for v in diffs1.values()]), sum([len(v) for v in diffs2.values()]))
#list(filter(lambda x: x.index == 616, diffs1["None"])),"xxx", list(filter(lambda x: x.index == 616, diffs2["None"]))

In [None]:
# For every bucket, compare the two flavours by deprel and by upos: the
# counts of fl2 are subtracted from those of fl1 and the result is plotted.
for x in diffs1.keys():
    print(x)
    #print("\t", [(k,v) for k,v in Counter([arc.upos for arc in diffs1[x]]).most_common()])
    #print("\t", [(k,v) for k,v in Counter([arc.xpos for arc in diffs1[x]]).most_common()])
    #print([(k,v) for k,v in Counter([arc.lemma for arc in diffs1[x]]).most_common()])
    #print("\t", [(k,v) for k,v in Counter([arc.deprel for arc in diffs1[x]]).most_common()])
    #print("\t", [(k,v) for k,v in Counter([arc.deprel for arc in diffs2[x]]).most_common()])
    # difference by dependency relation
    c1 = Counter([arc.deprel for arc in diffs1[x]])
    c2 = Counter([arc.deprel for arc in diffs2[x]])
    c1.subtract(c2)
    print(c1)
    plot_diffs(c1)

    # difference by universal POS tag
    c1 = Counter([arc.upos for arc in diffs1[x]])
    c2 = Counter([arc.upos for arc in diffs2[x]])
    c1.subtract(c2)
    print(c1)
    plot_diffs(c1)


    print()

In [None]:
# Write the colour-coded gold sentences of flavour `fl1` to "<dev>.flat" and
# collect the flat gold arcs of ALL sentences in `ga` (previously the set was
# replaced on every iteration, so only the last sentence survived).
path = "/".join([data, lang, fl1]) + "/"
ga = set()
with open(path + dev + ".flat", "w") as fh:
    for i, s in enumerate(cd.read_col_data(path + dev)):
        print(colored_sentence(s, lbl_colors), file=fh)
        flats = get_flat_arcs(i, s)
        # number of flat arcs with and without duplicates
        print(len(flats), len(set(flats)))
        ga.update(flats)

# Same for the predictions of run 1; the flat predicted arcs end up in `pa`.
path = "/".join([experiments, lang, fl1]) + "/1/"
pa = set()
with open(path + dev + ".pred" + ".flat", "w") as fh:
    for i, s in enumerate(cd.read_col_data(path + dev + ".pred")):
        print(colored_sentence(s, lbl_colors), file=fh)
        flats = get_flat_arcs(i, s)
        print(len(flats), len(set(flats)))
        pa.update(flats)


In [None]:
# Write the colour-coded gold sentences of flavour `fl2` to "<dev>.flat" and
# collect the flat gold arcs of all sentences in `gb`. The sets for `fl2` are
# named gb/pb so that they do not replace ga/pa, the sets built for `fl1`.
path = "/".join([data, lang, fl2]) + "/"
gb = set()
with open(path + dev + ".flat", "w") as fh:
    for i, s in enumerate(cd.read_col_data(path + dev)):
        print(colored_sentence(s, lbl_colors), file=fh)
        flats = get_flat_arcs(i, s)
        # number of flat arcs with and without duplicates
        print(len(flats), len(set(flats)))
        gb.update(flats)

# Same for the predictions of run 1; the flat predicted arcs end up in `pb`.
path = "/".join([experiments, lang, fl2]) + "/1/"
pb = set()
with open(path + dev + ".pred" + ".flat", "w") as fh:
    for i, s in enumerate(cd.read_col_data(path + dev + ".pred")):
        print(colored_sentence(s, lbl_colors), file=fh)
        flats = get_flat_arcs(i, s)
        print(len(flats), len(set(flats)))
        pb.update(flats)


For a by-hand gold-pred-flavour-by-flavour-comparison do the following:

`paste data/sent_graphs/norec/head_final/dev.conllu.flat experiments/norec/head_final/1/dev.conllu.pred.flat data/sent_graphs/norec/head_final-inside_label-dep_edges/dev.conllu.flat experiments/norec/head_final-inside_label-dep_edges/1/dev.conllu.pred.flat -d "\n"`

In [None]:
# Sizes of: ga minus gb, then for the (ga, pa) pair and for the (gb, pb) pair
# the intersection and both set differences.
# NOTE(review): presumably ga/pa are the gold/predicted flat arcs of fl1 and
# gb/pb those of fl2 - all four sets must exist before this cell is run.
len(ga - gb), len(ga.intersection(pa)), len(ga - pa), len(pa - ga), len(gb.intersection(pb)), len(gb -pb), len(pb - gb)