## Load classifier and alignment, gold and mapping data

In [1]:
import os
import json
from joblib import load

# classifier
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model artifacts that come from a trusted source.
clf = load('de_clf.joblib')

# alignment
# The JSON files are opened as UTF-8 explicitly so that reading them does not
# depend on the platform's default text encoding.
with open(os.path.join('..', 'out', '202112201417_salsa.json'), encoding='utf-8') as fp:
    alignment = json.load(fp)

# mapping
with open(os.path.join('..', 'data', 'salsa-mapping-2021-11-15.json'), encoding='utf-8') as fp:
    mapping_raw = json.load(fp)

In [2]:
# Keep only the SALSA proto-frames that carry a (truthy) MAPPING entry whose
# COMMENTSTATUS equals 2; unmapped and problematic proto-frames are dropped.
# NOTE(review): status 2 presumably marks a finished mapping -- confirm against the data schema.
mapping_finished = list(filter(
    lambda entry: entry["MAPPING"] and entry["MAPPING"]["COMMENTSTATUS"] == 2,
    mapping_raw,
))
len(mapping_finished)

314

In [3]:
# Drop "trivial" alignments, i.e. entries whose Berkeley FrameNet frame name
# already occurs as a substring of the SALSA frame name.
mapping = [
    entry
    for entry in mapping_finished
    if not (entry['MAPPING']['BERKELEYFN'] in entry['SALSA_NAME'])
]
len(mapping)

109

In [4]:
# Gold pairs as (Berkeley FrameNet frame name, SALSA frame name) tuples,
# one per remaining mapping entry and in the same order as `mapping`.
pairs = list(zip(
    (entry['MAPPING']['BERKELEYFN'] for entry in mapping),
    (entry['SALSA_NAME'] for entry in mapping),
))
pairs

[('Commerce_buy', 'Abnehmer1-salsa'),
 ('Presence', 'Abwesenheit1-salsa'),
 ('Distributed_position', 'Adorning'),
 ('Presence', 'Anwesenheit1-salsa'),
 ('Give_impression', 'Appearance'),
 ('Confronting_problem', 'Auge30-salsa'),
 ('Change_event_time', 'Bank6-salsa'),
 ('Giving_birth', 'Birth'),
 ('Cause_change_of_position_on_a_scale', 'Cause_change_of_scalar_position'),
 ('Undergo_change', 'Change_of_state_scenario'),
 ('Come_together', 'Congregating'),
 ('Process_start', 'Eingang4-salsa'),
 ('Conquering', 'Einnahme1-salsa'),
 ('Earnings_and_losses', 'Einnahme2-salsa'),
 ('Getting', 'Eroberung1-salsa'),
 ('Conquering', 'Eroberung3-salsa'),
 ('Stimulate_emotion', 'Experiencer_obj'),
 ('Experiencer_focused_emotion', 'Experiencer_subj'),
 ('Point_of_dispute', 'Frage3-salsa'),
 ('Appellations', 'Frau1-salsa'),
 ('Money', 'Geld1-salsa'),
 ('Money', 'Geld3-salsa'),
 ('Hearsay', 'Hear-fnsalsa'),
 ('Importing', 'Import_export'),
 ('Take_place_of', 'Newcomer'),
 ('Body_parts', 'Observable_bodyp

In [5]:
# Frame name -> frame id (the keys of alignment['frames']), one lookup per language.
en_name_gid = {v['name']:k for k, v in alignment['frames'].items() if v['language'] == 'en'}
de_name_gid = {v['name']:k for k, v in alignment['frames'].items() if v['language'] == 'de'}

# Frame id -> position within alignment['indices'][0] (EN) / alignment['indices'][1] (DE).
en_gid_idx = {gid:i for i, gid in enumerate(alignment['indices'][0])}
de_gid_idx = {gid:i for i, gid in enumerate(alignment['indices'][1])}

# hardcoding name change. TODO: update data
en_name_gid['Experiencer_focused_emotion'] = en_name_gid['Experiencer_focus']

# Parallel lists: en_idx[i] and de_idx[i] are the positions of the i-th usable gold pair.
en_idx = []
de_idx = []

for x, y in pairs:
    try:
        # Resolve BOTH sides before appending anything: if only the DE lookup
        # failed after the EN index had already been appended, the two lists
        # would get out of step and every later gold pair would be misaligned.
        en_pos = en_gid_idx[en_name_gid[x]]
        de_pos = de_gid_idx[de_name_gid[y]]
    except KeyError:
        # The gold pair names a frame that is missing from the alignment data:
        # report it and leave it out of the evaluation. Only KeyError is caught
        # so that interrupts and genuine bugs are not silently swallowed.
        print(x, y)
        continue
    en_idx.append(en_pos)
    de_idx.append(de_pos)

# Invariant relied upon by the evaluation cells below.
assert len(en_idx) == len(de_idx)

Stimulate_emotion Experiencer_obj


## Evaluate accuracy

In [34]:
import numpy as np

# Feature matrix: every alignment technique except 'attr_matching' contributes
# one column, holding that technique's score data flattened to 1-d.
# Resulting layout is (number of flattened score entries, number of techniques).
X = np.stack(
    [
        np.array(technique['data']).flatten()
        for technique in alignment['alignments']
        if technique['type'] != 'attr_matching'
    ],
    axis=0,
).transpose()
X.shape

(1249083, 13)

In [35]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column with statistics estimated from X itself.
# NOTE(review): the scaler is re-fitted on the evaluation data here; confirm
# this matches how the features were scaled when `clf` was trained.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [73]:
# Decision-function scores (not hard class labels), one per row of X_scaled,
# i.e. one per flattened (EN frame, DE frame) combination.
scores_flat = clf.decision_function(X_scaled)

# reshape to 2-d: rows follow alignment['indices'][0] (EN), columns alignment['indices'][1] (DE)
score_matrix = scores_flat.reshape((len(alignment['indices'][0]), len(alignment['indices'][1])))
# keep only the gold DE frames and transpose so DE frames are in 1st dim
y_pred = score_matrix[:, de_idx].T

# en_idx must hold exactly one gold EN index per selected DE frame. Without this
# check a length mismatch makes the comparison below degenerate (a single False,
# hence accuracy 0, or an error, depending on the NumPy version).
if len(en_idx) != y_pred.shape[0]:
    raise ValueError(
        'en_idx and de_idx are misaligned: %d gold EN indices for %d DE frames'
        % (len(en_idx), y_pred.shape[0])
    )

# Accuracy: share of DE frames whose best-scoring EN frame is the gold EN frame
acc = (y_pred.argmax(axis=1) == en_idx).sum() / y_pred.shape[0]

acc

0.5185185185185185

In [128]:
# Lax accuracy: a row counts as correct when its gold index is among the K best-scored candidates
def lax_acc(k, scores=None, targets=None):
    """Return the fraction of rows whose gold index is among the k top-scored columns.

    Parameters
    ----------
    k : int
        Number of best-scoring candidates considered per row; must be >= 1.
    scores : 2-d array, optional
        Score matrix with one row per evaluated item and one column per
        candidate. Defaults to the notebook-level ``y_pred``.
    targets : sequence of int, optional
        Gold column index for each row of ``scores``. Defaults to the
        notebook-level ``en_idx``.

    Returns
    -------
    float
        Number of rows whose gold index is in the row's k best, divided by
        the number of rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if ``targets`` and ``scores`` do not
        have the same number of entries/rows.
    """
    # Guard against k <= 0: the slice [:, -0:] would select *every* column and
    # the function would report a perfect score.
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))
    if scores is None:
        scores = y_pred
    if targets is None:
        targets = en_idx
    # zip() below would silently truncate to the shorter input and hide a misalignment.
    if len(targets) != scores.shape[0]:
        raise ValueError(
            'targets has %d entries but scores has %d rows'
            % (len(targets), scores.shape[0])
        )
    # argsort is ascending, so the last k positions of each row are its k best candidates.
    top_k = scores.argsort(axis=1)[:, -k:]
    hits = sum(target in candidates for target, candidates in zip(targets, top_k))
    return hits / scores.shape[0]

# Report lax accuracy for a few K-best cut-offs, one line per cut-off.
for label, k_best in (('K=3 ->', 3), ('K=5 ->', 5), ('K=10 ->', 10)):
    print(label, lax_acc(k_best))

K=3 -> 0.6481481481481481
K=5 -> 0.6574074074074074
K=10 -> 0.7314814814814815
