In [1]:
import json

# Read the alignment export. JSON text is UTF-8, and the frame names contain
# non-ASCII characters, so the encoding is set explicitly rather than left to
# the platform's locale default.
with open("./out/202003021204_fnbrasil.json", encoding="utf-8") as fp:
    data = json.load(fp)

In [2]:
from scipy.stats import rankdata

def rank_transform(orig):
    """Replace every strictly positive score by its normalised rank.

    Positive entries are ranked among themselves with ``rankdata(..., "max")``
    (ties all receive the highest rank of their group) and the rank is divided
    by the number of positive entries, so transformed values lie in ``(0, 1]``.
    Entries that are not strictly positive (zeros, negatives, NaN) are returned
    unchanged.

    Parameters
    ----------
    orig : array_like
        Raw scores. The input is never modified.

    Returns
    -------
    numpy.ndarray
        A new float array with the same shape as ``orig``.
    """
    # Always work on a float copy: with an integer input, assigning the
    # fractional ranks back into an integer array would silently truncate them.
    data = np.array(orig, dtype=float)

    positive = data > 0
    n_positive = int(positive.sum())
    if n_positive == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return data

    data[positive] = rankdata(data[positive], "max") / n_positive
    return data

In [3]:
import numpy as np
import pandas as pd

# One (rank-transformed flat score vector, technique id) tuple per entry of
# data["alignments"], in the order the alignments are stored.
scaled_rank = [
    (rank_transform(np.array(alignment["data"]).flatten()), alignment["id"])
    for alignment in data["alignments"]
]

# Score techniques correlations

In [4]:
# Lay the techniques out as columns (one row per flattened frame pair) so that
# DataFrame.corr() yields the technique-by-technique correlation table.
# A plain 2-D ndarray is used here: NumPy no longer recommends np.matrix.
m = np.array([X for X, label in scaled_rank]).T
df = pd.DataFrame(m, columns=[label for X, label in scaled_rank])

df.corr()

Unnamed: 0,id_matching,core_fe_matching,lu_wordnet,synset,synset_inv,muse_fe_match,muse_exact_fe_match,muse_mixed_fe_match,lu_muse_10_0.3,lu_muse_5_0.3,lu_mean_muse,frame_def_muse
id_matching,1.0,0.162606,0.062533,0.07013,0.079656,0.045581,0.176351,0.094832,0.042546,0.05193,0.007858,0.008921
core_fe_matching,0.162606,1.0,-0.001662,-0.000679,-0.000157,0.294834,0.992487,0.077496,-0.004614,-0.002653,-0.043992,-0.058701
lu_wordnet,0.062533,-0.001662,1.0,0.935412,0.768259,-0.00254,-0.001129,0.057293,0.266579,0.26959,0.173931,0.042128
synset,0.07013,-0.000679,0.935412,1.0,0.81791,-0.000322,-7.1e-05,0.058748,0.280054,0.297481,0.177716,0.042052
synset_inv,0.079656,-0.000157,0.768259,0.81791,1.0,0.000111,0.000456,0.061999,0.241557,0.262595,0.1755,0.042711
muse_fe_match,0.045581,0.294834,-0.00254,-0.000322,0.000111,1.0,0.29192,0.081699,-0.005304,-0.002048,-0.044451,-0.084689
muse_exact_fe_match,0.176351,0.992487,-0.001129,-7.1e-05,0.000456,0.29192,1.0,0.084586,-0.004215,-0.002258,-0.042564,-0.056606
muse_mixed_fe_match,0.094832,0.077496,0.057293,0.058748,0.061999,0.081699,0.084586,1.0,0.0613,0.051763,0.397582,0.577491
lu_muse_10_0.3,0.042546,-0.004614,0.266579,0.280054,0.241557,-0.005304,-0.004215,0.0613,1.0,0.796981,0.193374,0.046377
lu_muse_5_0.3,0.05193,-0.002653,0.26959,0.297481,0.262595,-0.002048,-0.002258,0.051763,0.796981,1.0,0.158204,0.037804


In [5]:
# data["indices"] holds two lists; their lengths give the row and column
# counts of the score grid.
indices = data["indices"]
en_len, l2_len = len(indices[0]), len(indices[1])

# Add up the rank-transformed vectors of all techniques element-wise, then
# fold the flat result back into an (en_len, l2_len) grid.
sums = (
    np.array([X for X, label in scaled_rank])
    .sum(axis=0)
    .reshape((en_len, l2_len))
)

# Best alignment pairs scored by all techniques

In [6]:
# Flatten the score grid into (row, column, score) triples ...
sum_idx = [(row, col, score) for (row, col), score in np.ndenumerate(sums)]
# ... and order them from the highest aggregated score to the lowest.
sorted_idx = sorted(sum_idx, key=lambda triple: -triple[2])

In [7]:
printed = 0

# Walk the pairs from the highest aggregated score downwards and print the
# first 50 whose two frames do not share the same name.
for x in sorted_idx:
    en_frm = data["frames"][indices[0][x[0]]]
    l2_frm = data["frames"][indices[1][x[1]]]

    if en_frm["name"] != l2_frm["name"]:
        # NOTE(review): the divisor 10 is hard-coded; if it is meant to be the
        # number of summed techniques it should follow len(scaled_rank) - confirm.
        score = '{:.3f}'.format(x[2]/10)
        print(f'{score}: {(en_frm["name"]+"("+en_frm["gid"]+")").ljust(40)} {l2_frm["name"]} ({l2_frm["gid"]})')
        # Count each printed row exactly once; the counter used to be bumped
        # twice per row, which ended the listing after 25 rows instead of 50.
        printed += 1

        if printed == 50:
            break


1.076: Intentionally_create(280.en)             Intentionally_create# (280.pt)
1.033: Assistance(391.en)                       Assistance# (391.pt)
1.018: Linguistic_meaning(406.en)               Linguistic_meaning# (406.pt)
1.014: Creating(319.en)                         Creating# (319.pt)
0.993: Cutting(265.en)                          Cortar__Cutting (265.pt)
0.993: Performers_and_roles(413.en)             Performers_and_roles# (413.pt)
0.992: Studying(1130.en)                        Estudar__Studying (1130.pt)
0.985: Competition(241.en)                      Competition# (241.pt)
0.981: Perception_experience(70.en)             Perception_experience# (70.pt)
0.981: Cause_to_be_included(2230.en)            Causar_estar_incluído__Cause_to_be_included (2230.pt)
0.975: Satisfying(2233.en)                      Satisfatório___Satisfying (2233.pt)
0.973: Abandonment(2031.en)                     Abandono__Abandonment (2031.pt)
0.970: Feigning(12.en)                          Fingir__Feigning 

# Worst alignment pairs compared to baseline


The baseline in this case is name matching. This list sorts alignment pairs by the difference between their name_matching score and their aggregated score, and only prints frame pairs with the same name (baseline).

In [8]:
# Use the "name_matching" alignment as the baseline; when the export has no
# such entry, next() raises StopIteration and "id_matching" is used instead.
# Only StopIteration is caught so that genuine errors (bad keys, interrupts)
# are not silently swallowed.
try:
    baseline = next(np.array(x["data"]) for x in data["alignments"] if x["id"] == "name_matching")
except StopIteration:
    baseline = next(np.array(x["data"]) for x in data["alignments"] if x["id"] == "id_matching")

# Gap between the baseline score and the scaled aggregated score per pair.
# NOTE(review): the divisor 10 is hard-coded; confirm it matches the number of
# techniques that were summed into `sums`.
sum_idx = [(idx[0], idx[1], score) for idx, score in np.ndenumerate(baseline - (sums / 10))]
# Largest gap first, keeping only pairs where the baseline is strictly higher.
sorted_idx = [x for x in sorted(sum_idx, key=lambda x: -x[2]) if x[2] > 0]

In [9]:
printed = 0

# Print up to 50 pairs, in sorted_idx order, skipping any pair in which one of
# the frames has no lexical units.
for x in sorted_idx:
    en_frm = data["frames"][indices[0][x[0]]]
    l2_frm = data["frames"][indices[1][x[1]]]

    if len(en_frm["LUs"]) == 0 or len(l2_frm["LUs"]) == 0:
        continue

    # The value shown is one minus the stored gap.
    score = f'{1-x[2]:.3f}'
    en_label = (en_frm["name"]+"("+en_frm["gid"]+")").ljust(40)
    print(f'{score}: {en_label} {l2_frm["name"]} ({l2_frm["gid"]})')
    printed += 1

    if printed == 50:
        break


0.187: Counterattack(2677.en)                   Digestão__Digestion (2677.pt)
0.246: Possibility(2120.en)                     Possibilidade__Possibility (2120.pt)
0.373: Responsibility(2657.en)                  Causar_ficar_molhado__Cause_to_be_wet (2657.pt)
0.377: Mass_motion(284.en)                      Massa_movimento   Mass_motion (284.pt)
0.397: Preference(1626.en)                      Preferência__Preference (1626.pt)
0.431: Adding_up(621.en)                        Adição__Adding_up (621.pt)
0.452: Process_end(232.en)                      Processo_fim [Process_end]# (232.pt)
0.471: Assessing(519.en)                        Avaliar__Assessing# (519.pt)
0.475: Body_description_part(235.en)            Descrição_parte_do_corpo__Body_description_part (235.pt)
0.478: Containing(397.en)                       Conter (397.pt)
0.479: Activity_pause(167.en)                   Atividade_pausar__Activity_pause (167.pt)
0.488: Correctness(1512.en)                     Correção    Correctness (151

In [10]:
import networkx as nx

def min_matching(scores):
    """Compute a minimum-weight full bipartite matching for one score vector.

    Relies on the notebook globals ``indices``, ``en_len`` and ``l2_len``.
    ``scores`` is reshaped to ``(en_len, l2_len)``; every cell becomes an edge
    between ``indices[0][row]`` and ``indices[1][col]`` weighted ``1 - score``,
    so higher scores are cheaper to match.

    Returns a dict holding only the matching entries whose key ends in ".en".
    """
    top_nodes, bottom_nodes = indices[0], indices[1]

    graph = nx.Graph()
    graph.add_nodes_from(top_nodes)
    graph.add_nodes_from(bottom_nodes)

    score_grid = scores.reshape(en_len, l2_len)
    graph.add_weighted_edges_from(
        (top_nodes[row], bottom_nodes[col], 1 - score)
        for (row, col), score in np.ndenumerate(score_grid)
    )

    matching = nx.bipartite.minimum_weight_full_matching(graph, top_nodes=top_nodes)
    # The matching lists every pair in both directions; keep one direction only.
    return {
        node: partner
        for node, partner in matching.items()
        if node.endswith(".en")
    }

In [11]:
import time

# One matching per technique, in scaled_rank order, with the wall-clock time
# each computation took.
matchings = []

for X, label in scaled_rank:
    start = time.time()
    matching = min_matching(X)
    matchings.append(matching)

    print(f'Computed matching for {label}')
    elapsed = time.time() - start
    print("--- %s seconds ---" % elapsed)

Computed matching for id_matching
--- 11.72362494468689 seconds ---
Computed matching for core_fe_matching
--- 47.562856912612915 seconds ---
Computed matching for lu_wordnet
--- 42.738417863845825 seconds ---
Computed matching for synset
--- 39.50200295448303 seconds ---
Computed matching for synset_inv
--- 37.680036783218384 seconds ---
Computed matching for muse_fe_match
--- 88.11011409759521 seconds ---
Computed matching for muse_exact_fe_match
--- 49.988282203674316 seconds ---
Computed matching for muse_mixed_fe_match
--- 46.5916748046875 seconds ---
Computed matching for lu_muse_10_0.3
--- 38.30361795425415 seconds ---
Computed matching for lu_muse_5_0.3
--- 36.21912717819214 seconds ---
Computed matching for lu_mean_muse
--- 42.37503123283386 seconds ---
Computed matching for frame_def_muse
--- 700.685142993927 seconds ---


In [31]:
# Tally how many matchings contain each (key, value) pair.
# The first matching in the list is left out of the count.
votes = {}

for m in matchings[1:]:
    for pair in m.items():
        votes[pair] = votes.get(pair, 0) + 1

In [34]:
# Number of matchings other than the first one.
len(matchings)-1

11

In [32]:
# Turn the vote tally into (key, value, count) triples, most-voted first.
vote_rows = [(pair[0], pair[1], count) for pair, count in votes.items()]
sorted_idx = sorted(vote_rows, key=lambda row: -row[2])

In [33]:
printed = 0

# Print up to 100 pairs that received more than 3 votes and whose two frames
# do not share the same name, in sorted_idx order.
for x in sorted_idx:
    if x[2] <= 3:
        continue

    en_frm = data["frames"][x[0]]
    l2_frm = data["frames"][x[1]]

    if en_frm["name"] == l2_frm["name"]:
        continue

    score = f'{x[2]:d}'
    en_label = (en_frm["name"]+"("+en_frm["gid"]+")").ljust(40)
    print(f'{score}: {en_label} {l2_frm["name"]} ({l2_frm["gid"]})')
    printed += 1

    if printed == 100:
        break


8: Assistance(391.en)                       Assistance# (391.pt)
8: Participation(1144.en)                   Participation# (1144.pt)
8: Abusing(482.en)                          Abusar__Abusing (482.pt)
8: Activity_ready_state(169.en)             Atividade_preparada__Activity_ready_state (169.pt)
7: Kinship(104.en)                          Kinship# (104.pt)
7: Abandonment(2031.en)                     Abandono__Abandonment (2031.pt)
6: Linguistic_meaning(406.en)               Linguistic_meaning# (406.pt)
6: Categorization(21.en)                    Categorization# (21.pt)
6: Cause_to_experience(288.en)              Emoções_de_atividade_mental__Emotions_of_mental_activity (2045.pt)
6: Feigning(12.en)                          Fingir__Feigning (12.pt)
6: Prevarication(287.en)                    Prevaricação__Prevarication (287.pt)
6: Activity_start(146.en)                   Atividade_iniciar__Activity_start (146.pt)
6: Money(1761.en)                           Dinheiro__Money (1761.pt)
6: St