# Sentiment Parsing Analysis

## Preliminaries

### Import I/O and confusion functions

In [None]:
%cd ..
import src.col_data as cd
import src.confusion as cf
import src.vocab as vcb
%cd -
import pickle
from typing import List, Tuple
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import itertools as it

### Set the Main Parameters

In [None]:
# Directory the gold graphs are read from (one sub-directory per language and flavour).
data = "../data/sent_graphs/"
# Directory the prediction files are read from (language/flavour/run).
experiments = "../experiments/"
# Run identifiers "1" .. "5", used as sub-directory names below each flavour.
runs = [str(i) for i in range(1,6)]
# IPython shell capture: the entries of the experiments directory are the languages.
languages = !ls $experiments
# The flavours are taken from the first language only.
# NOTE(review): assumes every language has the same flavour sub-directories - confirm.
x = languages[0]
flavours = !ls $experiments/$x
# File names of the three data splits ...
dev = "dev.conllu"
test = "test.conllu"
train = "train.conllu"
# ... and of the prediction files for the dev and test splits.
dev_pred = dev + ".pred"
test_pred = test + ".pred"

#### Is everything where it should be?

In [None]:
# Visual sanity check: list the data directory of every language/flavour pair
# and the experiment directory of each of its runs.
for l in languages:
    for f in flavours:
        print(l,f, end="\n\t")
        # IPython shell escape; $-variables are expanded from the notebook namespace.
        !ls $data/$l/$f
        for r in runs:
            print(r, end ="")
            !ls $experiments/$l/$f/$r

#### Create Vocabs per language

In [None]:
# Build one shared set of vocabularies per language and pickle it to
# "<language>_vocabs.pickle" in the current working directory.
for l in languages:
    # Pool the sentences of all flavours and all three splits.
    sentences = []
    for f in flavours:
        path = "/".join([data, l, f]) + "/"
        for fn in ("dev.conllu", "test.conllu", "train.conllu"):
            sentences += cd.read_col_data(path + fn)

    (forms, norms, lemmas, uposs, xposs,
     synrels, semrels, chars, scoperels) = vcb.make_vocabs(sentences)
    all_vocabs = [forms, norms, lemmas, uposs, xposs, synrels, semrels, chars, scoperels]

    # Report the size of every vocabulary.
    print([len(vocab.w2i) for vocab in all_vocabs])

    vocabs = vcb.Vocabs(*all_vocabs)
    with open(f"{l}_vocabs.pickle", "wb") as fh:
        pickle.dump(vocabs, fh)

## Analysis

### Data itself

- how many arcs are there? per label? per POS pair? per dependencies?
- how long are they?
- overlap between flavours

### Results

- how many arcs?
- how long?
- overlap between flavours
- precision / recall / fscore per label / POS pair / dependencies / arc length
- which schemes are best for which label?


In [None]:
# List the current working directory (presumably to check for the *_vocabs.pickle files).
!ls 

In [None]:
def load_vocab(fn: str) -> vcb.Vocabs:
    """Unpickle and return the object stored in the file ``fn``.

    The notebook writes these files itself ("<language>_vocabs.pickle").
    Unpickling can execute arbitrary code, so only load files you created.
    """
    with open(fn, "rb") as pickled:
        return pickle.load(pickled)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Bar colours, indexed by the position of a flavour within a plot.
colours = [
    'tab:blue',
    'tab:orange',
    'tab:green',
    'tab:red',
    'tab:purple',
    'tab:brown',
    'tab:pink',
    'tab:gray',
    'tab:olive',
    'tab:cyan',
]

In [None]:
#         ax.bar(ind+width/2, curr, width/2, bottom=prev, label=alternative[feat], color=colours[i],
#               alpha=0.7, linewidth=0.8, edgecolor=colours[i], align="center")
#    ax.set_xticks(np.arange(0, m))
#    ax.set_xticklabels(sorted(c.keys(), key=lambda x: keysort(x))[-m:], rotation=45)

In [None]:
# little helper for plotting
_1 = lambda x: 1 if x % 2 == 1 else -1

#### Simple Lengths

In [None]:
# For every language: count, per flavour and per scope label, how often each
# signed arc length (src - tgt; arcs leaving index 0 count as length 0) occurs
# in train+dev+test, then draw one bar chart per label comparing the flavours.
width = 1/8
for l in languages:
    # The vocabulary depends only on the language, so load it once per
    # language instead of once per flavour.
    v = load_vocab(l + "_vocabs.pickle")
    plots = defaultdict(list)  # label id -> [(lengths, counts, flavour), ...]
    min_maxes = defaultdict(lambda : [float("inf"), -float("inf")])
    for fl in flavours:
        flavour_dir = "/".join([data, l, fl])

        sentences = []
        for tdt in [train, dev, test]:
            sentences.extend(cd.read_col_data(flavour_dir + "/" + tdt))

        # label id -> arc length -> number of arcs
        d = defaultdict(lambda: defaultdict(int))
        for sentence in sentences:
            m = sentence.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
            for src, tgt in zip(*np.where(m != 0)):
                lngth = 0 if src == 0 else src - tgt
                d[m[src, tgt]][lngth] += 1

        for lbl, counts in d.items():
            xs, ys = zip(*counts.items())
            # Track the overall length range per label across all flavours so
            # that every flavour is drawn on the same x axis.
            min_maxes[lbl][0] = min(min(xs), min_maxes[lbl][0])
            min_maxes[lbl][1] = max(max(xs), min_maxes[lbl][1])
            plots[lbl].append((xs, ys, fl))

    for lbl, series in plots.items():
        fig, ax = plt.subplots(figsize=(19, 6))
        ax.set_title(l + " " + v.scoperels.i2w[lbl])
        lo, hi = min_maxes[lbl]
        _xs = np.arange(lo, hi + 1)
        ax.set_xticks(_xs)
        ax.set_xticklabels(_xs, rotation=45)
        for i, (xs, ys, fl) in enumerate(series):
            # Dense count vector over the full length range; lengths a flavour
            # never produced stay at zero.
            _ys = np.zeros(len(_xs))
            for x, y in zip(xs, ys):
                _ys[x - lo] = y
            # Shift each flavour's bars sideways so they do not fully overlap;
            # wrap the colour index so more than len(colours) flavours still plot.
            colour = colours[i % len(colours)]
            ax.bar(_xs + (i * width/2 * _1(i)), _ys, width/2, color=colour, alpha=0.7,
                   linewidth=0.8, edgecolor=colour, align="center", label=fl)
        ax.legend()
        fig.tight_layout()
        plt.show()
    

#### Overlap

In [None]:
# overlap
# cell entries should match
# should be nonzero for unlabelled
# i,j == j,i for undirected
def overlap(m1: np.ndarray, m2: np.ndarray,
            labelled: bool = True, direction: bool = True
           ) -> List[Tuple[int, int, float]]:  # indices and label
    """Return the arcs that two equally shaped arc matrices have in common.

    A non-zero cell ``m[i, j]`` is an arc carrying that value as its label.

    * ``direction=True``: a cell matches only the same cell of the other matrix.
    * ``direction=False``: only cells with ``j >= i`` are visited, and an empty
      cell falls back to its mirror ``m[j, i]``.
    * ``labelled=True``: only arcs with equal labels are returned.
    * ``labelled=False``: arcs with differing labels are returned too, with
      label ``1``; arcs with equal labels keep their label.

    Raises ``AssertionError`` when the shapes differ.
    """
    assert m1.shape == m2.shape
    n_rows, n_cols = m1.shape
    shared = []
    for i in range(n_rows):
        first_col = 0 if direction else i
        for j in range(first_col, n_cols):
            # label of the arc in the first matrix (or of its mirror)
            a = m1[i, j]
            if not a:
                mirrored = m1[j, i]
                if not mirrored or direction:
                    continue
                a = mirrored
            # label of the arc in the second matrix (or of its mirror)
            b = m2[i, j]
            if not b:
                mirrored = m2[j, i]
                if not mirrored or direction:
                    continue
                b = mirrored
            if a == b:
                shared.append((i, j, a))
            elif not labelled:
                # both arcs exist but disagree on the label
                shared.append((i, j, 1))
    return shared
                    
# Quick demo on two random 5x5 matrices with values 0..2 (fixed seed).
# m1 and m2 stay in the notebook namespace and are reused by a later cell.
np.random.seed(1)
m1 = np.random.choice(3, (5,5))
m2 = np.random.choice(3, (5,5))
overlap(m1,m2, labelled=False), m1, m2

In [None]:
def get_arcs(m: np.ndarray) -> List[Tuple[int, int, float]]:
    """Return one ``(row, column, value)`` triple per non-zero cell of ``m``."""
    arcs = []
    for src, tgt in zip(*np.nonzero(m)):
        arcs.append((src, tgt, m[src, tgt]))
    return arcs

# Show the demo matrix next to the arcs extracted from it.
m1, get_arcs(m1)

In [None]:
# for every language
# for every flavour
# save sentences for each flavour in a >list<
# all combinations (itertools) for all lengths (2 to number of flavours)
# dictionary with language -> flavour combos -> label -> count
# label can then be exchanged with POS-pair or dependencies
# ignore direction

In [None]:
def get_overlapping(lang: str,
                    labelled: bool = True,
                    direction: bool = True):
    """Collect the arcs shared by every combination of flavours in the gold data.

    Reads the dev and test files of each flavour below ``data`` and turns
    every sentence into a labelled "scope" matrix. Returns a dict keyed by
    ``(sentence index, flavour combination)`` holding ``(src, tgt, label)``
    triples:

    * single flavour: all arcs of the sentence;
    * pair: the result of ``overlap`` with ``labelled``/``direction``
      (a shape mismatch is printed and the pair is skipped);
    * longer combination: the arcs cached for the combination minus its last
      flavour that have the same label in the same cell of the last flavour.
      ``labelled`` and ``direction`` are not consulted here, and empty
      results are not stored.
    """
    v = load_vocab(lang + "_vocabs.pickle")

    sentences = {}
    for fl in flavours:
        flavour_dir = "/".join([data, lang, fl])
        matrices = []
        for split in (dev, test):
            for sen in cd.read_col_data(flavour_dir + "/" + split):
                matrices.append(sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i))
        sentences[fl] = matrices

    # Smaller combinations are filled in first, so a longer combination can
    # reuse the cached arcs of its prefix instead of comparing all matrices.
    cache = {}
    for size in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, size):
            aligned = zip(*(sentences[name] for name in combo))
            for mi, mss in enumerate(aligned):
                key = (mi, combo)
                if size == 1:
                    cache[key] = get_arcs(mss[0])
                    continue
                if size == 2:
                    try:
                        cache[key] = overlap(mss[0], mss[1], labelled=labelled, direction=direction)
                    except AssertionError:
                        print(mi, combo)
                    continue
                prefix_arcs = cache.get((mi, combo[:-1]))
                if not prefix_arcs:
                    # prefix missing (or without arcs): nothing can overlap
                    continue
                last = mss[-1]
                kept = [(src, tgt, lbl) for src, tgt, lbl in prefix_arcs
                        if last[src, tgt] == lbl]
                if kept:
                    cache[key] = kept
    return cache

In [None]:
def get_overlapping_experiments(lang: str,
                    labelled: bool = True,
                    direction: bool = True,
                    run: str = "1"):
    """Collect the arcs shared by every combination of flavours in the predictions.

    Reads the dev and test prediction files of one run of each flavour below
    ``experiments`` and turns every sentence into a labelled "scope" matrix.

    Args:
        lang: language sub-directory; also selects "<lang>_vocabs.pickle".
        labelled: passed to ``overlap`` for pairs of flavours.
        direction: passed to ``overlap`` for pairs of flavours.
        run: name of the run sub-directory to read (default "1", the run
            that used to be hard-coded).

    Returns:
        A dict keyed by ``(sentence index, flavour combination)`` holding
        ``(src, tgt, label)`` triples. Single flavours map to all their arcs,
        pairs to their ``overlap``. Longer combinations map to the arcs
        cached for the combination minus its last flavour that have the same
        label in the same cell of the last flavour (``labelled`` and
        ``direction`` are not consulted there; empty results are not stored).
    """
    v = load_vocab(lang + "_vocabs.pickle")

    sentences = {}
    for fl in flavours:
        run_dir = "/".join([experiments, lang, fl, str(run)])
        sentences[fl] = []
        for tdt in [dev_pred, test_pred]:
            sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(run_dir + "/" + tdt)])

    # Smaller combinations are filled in first, so a longer combination can
    # reuse the cached arcs of its prefix instead of comparing all matrices.
    cache = {}
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            for mi, mss in enumerate(zip(*(sentences[c] for c in combo))):
                key = (mi, combo)
                if combo_len == 1:
                    cache[key] = get_arcs(mss[0])
                    continue
                if combo_len == 2:
                    try:
                        cache[key] = overlap(mss[0], mss[1], labelled=labelled, direction=direction)
                    except AssertionError:
                        # Deliberately best-effort: sentences whose matrices
                        # differ in shape are skipped without a message.
                        pass
                    continue
                prefix_arcs = cache.get((mi, combo[:-1]))
                if not prefix_arcs:
                    continue
                m = mss[-1]
                kept = [(src, tgt, lbl) for src, tgt, lbl in prefix_arcs
                        if m[src, tgt] == lbl]
                if kept:
                    cache[key] = kept
    return cache

In [None]:
def eval_overlapping(lang: str, cache):
    """Print label counts for the arcs in ``cache``, grouped by combination size.

    ``cache`` maps ``(sentence index, flavour combination)`` to a list of
    ``(src, tgt, label)`` triples. For every combination size the size is
    printed, followed by each combination that has at least one arc and its
    labels in ascending order of frequency.
    """
    v = load_vocab(lang + "_vocabs.pickle")
    for size in range(1, len(flavours) + 1):
        cnt = {}
        for (_, combo), arcs in cache.items():
            if len(combo) != size:
                continue
            for _src, _tgt, lbl in arcs:
                per_label = cnt.setdefault(combo, {})
                per_label[lbl] = per_label.get(lbl, 0) + 1
        print(size)
        for combo in sorted(cnt, key=len):
            print(combo)
            label_counts = cnt[combo]
            for lbl in sorted(label_counts, key=label_counts.get):
                print("\t",v.scoperels.i2w[lbl], label_counts[lbl])

In [None]:
# Label statistics of the arcs shared between flavours in the gold data.
for lang in languages:
    print(lang)
    gold_overlaps = get_overlapping(lang)
    eval_overlapping(lang, gold_overlaps)

In [None]:
# Label statistics of the arcs shared between flavours in the predictions.
for lang in languages:
    print(lang)
    pred_overlaps = get_overlapping_experiments(lang)
    eval_overlapping(lang, pred_overlaps)

## Predictions

### What do various flavours do right?

#### True Positives

In [None]:
# same strategy as for overlap, but also overlap of pred with gold

In [None]:
def get_overlapping_tp(lang: str,
                    labelled: bool = True,
                    direction: bool = True,
                    run: str = "1"):
    """Collect true-positive arcs per flavour and per combination of flavours.

    Reads, for each flavour, the dev and test predictions of one run below
    ``experiments`` and the matching gold files below ``data``, each sentence
    as a labelled "scope" matrix.

    Args:
        lang: language sub-directory; also selects "<lang>_vocabs.pickle".
        labelled: passed to ``overlap`` for the gold/prediction comparison.
        direction: passed to ``overlap`` for the gold/prediction comparison.
        run: name of the run sub-directory to read (default "1", the run
            that used to be hard-coded).

    Returns:
        A dict keyed by ``(sentence index, flavour combination)`` holding
        ``(src, tgt, label)`` triples. A single flavour maps to the
        ``overlap`` of its gold and predicted matrix. A longer combination
        maps to the arcs cached for the combination minus its last flavour
        that the last flavour's prediction also has with the same label;
        empty results are not stored.

    Raises:
        AssertionError: from ``overlap`` when a gold and a predicted matrix
            differ in shape.
    """
    v = load_vocab(lang + "_vocabs.pickle")

    sentences = {}
    gold_sentences = {}
    for fl in flavours:
        pred_dir = "/".join([experiments, lang, fl, str(run)])
        gold_dir = "/".join([data, lang, fl])
        sentences[fl] = []
        gold_sentences[fl] = []
        for tdt in [dev, test]:
            sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(pred_dir + "/" + tdt + ".pred")])
            gold_sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(gold_dir + "/" + tdt)])

    # Smaller combinations are filled in first, so a longer combination can
    # reuse the cached arcs of its prefix.
    cache = {}
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            for mi, mss in enumerate(zip(*(sentences[c] for c in combo))):
                key = (mi, combo)
                if combo_len == 1:
                    gsen = gold_sentences[combo[0]][mi]
                    cache[key] = overlap(gsen, mss[0], labelled=labelled, direction=direction)
                    continue
                prefix_arcs = cache.get((mi, combo[:-1]))
                if not prefix_arcs:
                    continue
                m = mss[-1]
                kept = [(src, tgt, lbl) for src, tgt, lbl in prefix_arcs
                        if m[src, tgt] == lbl]
                if kept:
                    cache[key] = kept
    return cache

In [None]:
# Label statistics of the true positives, per flavour combination.
for lang in languages:
    print(lang)
    tp_overlaps = get_overlapping_tp(lang)
    eval_overlapping(lang, tp_overlaps)

### What do various flavours do wrong?

#### False Positives

In [None]:
def overlap_fp(g: np.ndarray, p: np.ndarray,
            labelled: bool = True, direction: bool = True
           ) -> List[Tuple[int, int, float]]:  # indices and label
    """Return the predicted arcs that have no counterpart in the gold matrix.

    A non-zero cell is an arc carrying that value as its label.

    Args:
        g: gold arc matrix.
        p: predicted arc matrix of the same shape.
        labelled: accepted for symmetry with ``overlap`` but currently not
            used: an arc predicted with the wrong label is not reported.
        direction: if True, a predicted arc ``p[i, j]`` is a false positive
            when ``g[i, j]`` is empty. If False, only cells with ``j >= i``
            are visited, an empty cell falls back to its mirror, and the arc
            is a false positive only when gold has an arc in neither
            ``g[i, j]`` nor ``g[j, i]``.

    Returns:
        ``(i, j, predicted label)`` triples.

    Raises:
        AssertionError: if the two matrices differ in shape.
    """
    assert g.shape == p.shape
    d1, d2 = g.shape
    results = []
    for i in range(d1):
        for j in range(0 if direction else i, d2):
            if p[i, j]:
                a = p[i, j]
            elif not direction and p[j, i]:
                a = p[j, i]
            else:
                continue
            # The old test `not g[i,j] or (g[j,i] and not direction)` bound
            # `not` to g[i,j] only, so in undirected mode a gold arc in the
            # mirrored cell made the prediction a false positive.
            in_gold = bool(g[i, j]) or (not direction and bool(g[j, i]))
            if not in_gold:
                results.append((i, j, a))
    return results
                    
# Quick demo on two random 5x5 matrices with values 0..2 (fixed seed);
# m1 is passed as the gold matrix, m2 as the prediction.
np.random.seed(1)
m1 = np.random.choice(3, (5,5))
m2 = np.random.choice(3, (5,5))
overlap_fp(m1,m2, labelled=True, direction=True), m1, m2

In [None]:
def get_overlapping_fp(lang: str,
                    labelled: bool = True,
                    direction: bool = True,
                    run: str = "1"):
    """Collect false-positive arcs per flavour and per combination of flavours.

    Reads, for each flavour, the dev and test predictions of one run below
    ``experiments`` and the matching gold files below ``data``, each sentence
    as a labelled "scope" matrix.

    Args:
        lang: language sub-directory; also selects "<lang>_vocabs.pickle".
        labelled: passed to ``overlap_fp``.
        direction: passed to ``overlap_fp``.
        run: name of the run sub-directory to read (default "1", the run
            that used to be hard-coded).

    Returns:
        A dict keyed by ``(sentence index, flavour combination)`` holding
        ``(src, tgt, label)`` triples. A single flavour maps to
        ``overlap_fp`` of its gold and predicted matrix. A longer combination
        maps to the arcs cached for the combination minus its last flavour
        that the last flavour's prediction also has with the same label;
        empty results are not stored.

    Raises:
        AssertionError: from ``overlap_fp`` when a gold and a predicted
            matrix differ in shape.
    """
    # NOTE(review): for combinations, the last flavour's arc is only checked
    # against its prediction, not against its own gold matrix - confirm that
    # this is the intended notion of a shared false positive.
    v = load_vocab(lang + "_vocabs.pickle")

    sentences = {}
    gold_sentences = {}
    for fl in flavours:
        pred_dir = "/".join([experiments, lang, fl, str(run)])
        gold_dir = "/".join([data, lang, fl])
        sentences[fl] = []
        gold_sentences[fl] = []
        for tdt in [dev, test]:
            sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(pred_dir + "/" + tdt + ".pred")])
            gold_sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(gold_dir + "/" + tdt)])

    # Smaller combinations are filled in first, so a longer combination can
    # reuse the cached arcs of its prefix.
    cache = {}
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            for mi, mss in enumerate(zip(*(sentences[c] for c in combo))):
                key = (mi, combo)
                if combo_len == 1:
                    gsen = gold_sentences[combo[0]][mi]
                    cache[key] = overlap_fp(gsen, mss[0], labelled=labelled, direction=direction)
                    continue
                prefix_arcs = cache.get((mi, combo[:-1]))
                if not prefix_arcs:
                    continue
                m = mss[-1]
                kept = [(src, tgt, lbl) for src, tgt, lbl in prefix_arcs
                        if m[src, tgt] == lbl]
                if kept:
                    cache[key] = kept
    return cache

In [None]:
# Label statistics of the false positives, per flavour combination.
for lang in languages:
    print(lang)
    fp_overlaps = get_overlapping_fp(lang)
    eval_overlapping(lang, fp_overlaps)

#### False Negatives

In [None]:
def overlap_fn(g: np.ndarray, p: np.ndarray,
            labelled: bool = True, direction: bool = True
           ) -> List[Tuple[int, int, float]]:  # indices and label
    """Return the gold arcs that have no counterpart in the predicted matrix.

    A non-zero cell is an arc carrying that value as its label.

    Args:
        g: gold arc matrix.
        p: predicted arc matrix of the same shape.
        labelled: accepted for symmetry with ``overlap`` but currently not
            used: a gold arc predicted with the wrong label is not reported.
        direction: if True, a gold arc ``g[i, j]`` is a false negative when
            ``p[i, j]`` is empty. If False, only cells with ``j >= i`` are
            visited, an empty cell falls back to its mirror, and the arc is a
            false negative only when the prediction has an arc in neither
            ``p[i, j]`` nor ``p[j, i]``.

    Returns:
        ``(i, j, gold label)`` triples.

    Raises:
        AssertionError: if the two matrices differ in shape.
    """
    assert g.shape == p.shape
    d1, d2 = g.shape
    results = []
    for i in range(d1):
        for j in range(0 if direction else i, d2):
            if g[i, j]:
                a = g[i, j]
            elif not direction and g[j, i]:
                a = g[j, i]
            else:
                continue
            # The old test `not p[i,j] or (p[j,i] and not direction)` bound
            # `not` to p[i,j] only, so in undirected mode a predicted arc in
            # the mirrored cell made the gold arc a false negative.
            predicted = bool(p[i, j]) or (not direction and bool(p[j, i]))
            if not predicted:
                results.append((i, j, a))
    return results
                    
# Quick demo on two random 5x5 matrices with values 0..2 (fixed seed);
# m1 is passed as the gold matrix, m2 as the prediction.
np.random.seed(1)
m1 = np.random.choice(3, (5,5))
m2 = np.random.choice(3, (5,5))
overlap_fn(m1,m2, labelled=True, direction=True), m1, m2

In [None]:
def get_overlapping_fn(lang: str,
                    labelled: bool = True,
                    direction: bool = True,
                    run: str = "1"):
    """Collect false-negative arcs per flavour and per combination of flavours.

    Reads, for each flavour, the dev and test predictions of one run below
    ``experiments`` and the matching gold files below ``data``, each sentence
    as a labelled "scope" matrix.

    Args:
        lang: language sub-directory; also selects "<lang>_vocabs.pickle".
        labelled: passed to ``overlap_fn``.
        direction: passed to ``overlap_fn``.
        run: name of the run sub-directory to read (default "1", the run
            that used to be hard-coded).

    Returns:
        A dict keyed by ``(sentence index, flavour combination)`` holding
        ``(src, tgt, label)`` triples. A single flavour maps to
        ``overlap_fn`` of its gold and predicted matrix. A longer combination
        maps to the arcs cached for the combination minus its last flavour
        that the last flavour's *prediction* has with the same label; empty
        results are not stored.

    Raises:
        AssertionError: from ``overlap_fn`` when a gold and a predicted
            matrix differ in shape.
    """
    # NOTE(review): for combinations this keeps arcs that the earlier
    # flavours missed but the last flavour predicted. If "missed by all
    # flavours of the combination" is intended, the last flavour's own false
    # negatives would have to be intersected instead - confirm.
    v = load_vocab(lang + "_vocabs.pickle")

    sentences = {}
    gold_sentences = {}
    for fl in flavours:
        pred_dir = "/".join([experiments, lang, fl, str(run)])
        gold_dir = "/".join([data, lang, fl])
        sentences[fl] = []
        gold_sentences[fl] = []
        for tdt in [dev, test]:
            sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(pred_dir + "/" + tdt + ".pred")])
            gold_sentences[fl].extend([sen.make_matrix("scope", label=True, w2i=v.scoperels.w2i)
                                    for sen in cd.read_col_data(gold_dir + "/" + tdt)])

    # Smaller combinations are filled in first, so a longer combination can
    # reuse the cached arcs of its prefix.
    cache = {}
    for combo_len in range(1, len(flavours) + 1):
        for combo in it.combinations(flavours, combo_len):
            for mi, mss in enumerate(zip(*(sentences[c] for c in combo))):
                key = (mi, combo)
                if combo_len == 1:
                    gsen = gold_sentences[combo[0]][mi]
                    cache[key] = overlap_fn(gsen, mss[0], labelled=labelled, direction=direction)
                    continue
                prefix_arcs = cache.get((mi, combo[:-1]))
                if not prefix_arcs:
                    continue
                m = mss[-1]
                kept = [(src, tgt, lbl) for src, tgt, lbl in prefix_arcs
                        if m[src, tgt] == lbl]
                if kept:
                    cache[key] = kept
    return cache

In [None]:
# Label statistics of the false negatives, per flavour combination.
for lang in languages:
    print(lang)
    fn_overlaps = get_overlapping_fn(lang)
    eval_overlapping(lang, fn_overlaps)

## Compare Runs for each Flavour

### What is easy & what is difficult?