In [1]:
import json

with open("../out/202006222159_spanishfn.json") as fp:
    data = json.load(fp)

In [2]:
from scipy.stats import rankdata

def rank_transform(orig):
    data = np.copy(orig)
    indices = [i for i, s in enumerate(data) if s > 0]
    norm = rankdata([data[i] for i in indices], "max") / len(indices)

    for i, s in zip(indices, norm):
        data[i] = s
    return data

In [3]:
import numpy as np
import pandas as pd

scaled_rank = [
    (rank_transform(np.array(x["data"]).flatten()), x["id"])
    for x in data["alignments"]
]

# Score techniques correlations

In [4]:
m = np.matrix([X for X, label in scaled_rank]).T
df = pd.DataFrame(m, columns=[label for X, label in scaled_rank])

df.corr()

Unnamed: 0,name_matching,core_fe_matching,lu_wordnet,synset,synset_inv,muse_fe_match,muse_exact_fe_match,lu_muse_10_0.3,lu_muse_5_0.3,lu_muse_3_0.3,lu_mean_muse,frame_def_muse
name_matching,1.0,0.167571,0.08419,0.090414,0.096833,0.045212,0.183034,0.087974,0.102747,0.112796,0.009915,0.012267
core_fe_matching,0.167571,1.0,0.046413,0.048231,0.048284,0.337887,0.993215,0.045758,0.043805,0.042065,0.034919,-0.002378
lu_wordnet,0.08419,0.046413,1.0,0.945605,0.772525,0.018468,0.048899,0.305959,0.316027,0.326958,0.206459,-0.001974
synset,0.090414,0.048231,0.945605,1.0,0.810133,0.020401,0.050916,0.330875,0.347481,0.363508,0.209317,-0.002052
synset_inv,0.096833,0.048284,0.772525,0.810133,1.0,0.019611,0.051208,0.293711,0.310127,0.323873,0.207649,0.004173
muse_fe_match,0.045212,0.337887,0.018468,0.020401,0.019611,1.0,0.332904,0.016484,0.014505,0.013328,0.043994,0.028583
muse_exact_fe_match,0.183034,0.993215,0.048899,0.050916,0.051208,0.332904,1.0,0.048362,0.046718,0.044893,0.035677,-0.00232
lu_muse_10_0.3,0.087974,0.045758,0.305959,0.330875,0.293711,0.016484,0.048362,1.0,0.823111,0.724372,0.193122,-0.006959
lu_muse_5_0.3,0.102747,0.043805,0.316027,0.347481,0.310127,0.014505,0.046718,0.823111,1.0,0.880212,0.157745,-0.005529
lu_muse_3_0.3,0.112796,0.042065,0.326958,0.363508,0.323873,0.013328,0.044893,0.724372,0.880212,1.0,0.137136,-0.004278


In [5]:
indices = data["indices"]
en_len = len(indices[0])
l2_len = len(indices[1])

sums = np.sum([X for X, label in scaled_rank], axis=0).reshape(en_len, l2_len)

# Best alignment pairs scored by all techniques

In [6]:
sum_idx = [(idx[0], idx[1], score) for idx, score in np.ndenumerate(sums)]
sorted_idx = sorted(sum_idx, key=lambda x: -x[2])

In [7]:
printed = 0

for x in sorted_idx:
    en_frm = data["frames"][indices[0][x[0]]]
    l2_frm = data["frames"][indices[1][x[1]]]
    
    if en_frm["name"] != l2_frm["name"] and (len(en_frm["LUs"]) > 0 and len(l2_frm["LUs"]) > 0):
        score = '{:.3f}'.format(x[2]/10)
        print(f'{score}: {(en_frm["name"]+"("+en_frm["gid"]+")").ljust(40)} {l2_frm["name"]} ({l2_frm["gid"]})')
        printed += 1
        printed += 1
        
        if printed == 50:
              break


1.063: Have_as_translation_equivalent(1654.en)  Translating (1645.es)
1.059: Be_translation_equivalent(1655.en)       Translating (1645.es)
1.051: Gradable_proximity(2900.en)              Locative_relation (199.es)
1.047: Hearsay(37.en)                           Hear (37.es)
1.030: Make_agreement_on_action(1342.en)        Be_in_agreement_on_action (1380.es)
1.025: Renting_out(1729.en)                     Renting (1728.es)
1.022: Regard(1400.en)                          Opinion (1692.es)
1.021: Hit_target(485.en)                       Use_firearm (290.es)
1.019: Hit_target(485.en)                       Shoot_projectiles (289.es)
1.019: Renting(1728.en)                         Renting_out (1729.es)
0.999: Questioning(40.en)                       Request (41.es)
0.997: Undergo_transformation(2973.en)          Cause_change (683.es)
0.986: Just_found_out(2050.en)                  Stimulus_focus (280.es)
0.984: Locale_by_ownership(2171.en)             Locale (192.es)
0.983: Being_at_risk(156

# Worst alignment pairs compared to baseline


The baseline in this case is name matching. This list sorts alignemnt pairs by the difference of their aggregated score and the name_matching score and only prints frame pairs with the same name (baseline). 

In [8]:
try:
    baseline = next(np.array(x["data"]) for x in data["alignments"] if x["id"] == "name_matching")
except:
    baseline = next(np.array(x["data"]) for x in data["alignments"] if x["id"] == "id_matching")
sum_idx = [(idx[0], idx[1], score) for idx, score in np.ndenumerate(baseline - (sums / 10))]
sorted_idx = [x for x in sorted(sum_idx, key=lambda x: -x[2]) if x[2] > 0]

In [9]:
printed = 0

for x in sorted_idx:
    en_frm = data["frames"][indices[0][x[0]]]
    l2_frm = data["frames"][indices[1][x[1]]]
    
    if len(en_frm["LUs"]) > 0 and len(l2_frm["LUs"]) > 0:
        score = '{:.3f}'.format(1-x[2])
        print(f'{score}: {(en_frm["name"]+"("+en_frm["gid"]+")").ljust(40)} {l2_frm["name"]} ({l2_frm["gid"]})')
        printed += 1

        if printed == 50:
              break


0.454: Possibility(2120.en)                     Possibility (2120.es)
0.464: Terrorism(1750.en)                       Terrorism (1750.es)
0.485: Activity_done_state(182.en)              Activity_done_state (182.es)
0.508: Turning_out(2191.en)                     Turning_out (2191.es)
0.554: Path_shape(61.en)                        Path_shape (61.es)
0.570: Distinctiveness(1133.en)                 Distinctiveness (1133.es)
0.575: Process_completed_state(234.en)          Process_completed_state (234.es)
0.650: Impact(311.en)                           Impact (311.es)
0.651: Buildings(173.en)                        Buildings (173.es)
0.712: Leadership(73.en)                        Leadership (73.es)
0.735: Activity_prepare(170.en)                 Activity_prepare (170.es)
0.760: Commerce_collect(210.en)                 Commerce_collect (210.es)
0.767: Estimating(1513.en)                      Estimating (1513.es)
0.774: Substance_by_phase(385.en)               Substance_by_phase (385.es)
0.

In [10]:
import networkx as nx

def min_matching(scores):
    G = nx.Graph()

    G.add_nodes_from(indices[0])
    G.add_nodes_from(indices[1])

    edge_matrix = scores.reshape(en_len, l2_len)
    G.add_edges_from([
        (indices[0][idx[0]], indices[1][idx[1]], { "weight": 1-score })
        for idx, score in np.ndenumerate(edge_matrix)
    ])
    
    m = nx.bipartite.minimum_weight_full_matching(G, top_nodes=indices[0])
    return {k:v for k, v in m.items() if k.endswith(".en")}

In [11]:
import time

matchings = []

for X, label in scaled_rank:
    start = time.time()
    matchings.append(min_matching(X))

    print(f'Computed matching for {label}')
    print("--- %s seconds ---" % (time.time() - start))

AttributeError: module 'networkx.algorithms.bipartite' has no attribute 'minimum_weight_full_matching'

In [None]:
votes = {}

for m in matchings[1:]:
    for k,v in m.items():
        if (k, v) in votes:
            votes[(k, v)] += 1
        else:
            votes[(k, v)] = 1

In [None]:
len(matchings)-1

In [None]:
sorted_idx = sorted([(k[0], k[1], v) for k, v in votes.items()], key=lambda x: -x[2])

In [None]:
printed = 0

for x in sorted_idx:
    if x[2] <= 3:
        continue
    
    en_frm = data["frames"][x[0]]
    l2_frm = data["frames"][x[1]]
    
    if en_frm["name"] != l2_frm["name"]:
        score = '{:d}'.format(x[2])
        print(f'{score}: {(en_frm["name"]+"("+en_frm["gid"]+")").ljust(40)} {l2_frm["name"]} ({l2_frm["gid"]})')
        printed += 1

        if printed == 100:
              break
