## Load classifier and alignment, gold and mapping data

In [1]:
import os
import json
from joblib import load

# classifier
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
clf = load('de_clf.joblib')

# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# below instead of relying on the platform's locale default.

# MLFN alignment result (automatic)
with open(os.path.join('..', 'out', '202112201417_salsa.json'), encoding='utf-8') as fp:
    alignment = json.load(fp)

# Salsa mapping result (manual)
with open(os.path.join('..', 'data', 'salsa-mapping-2022-01-26.json'), encoding='utf-8') as fp:
    mapping_raw = json.load(fp)

# Frame families
with open(os.path.join('..', 'data', 'framefamilies-2020-01-26.json'), encoding='utf-8') as fp:
    frame_families_raw = json.load(fp)

# number of entries in the manual mapping (displayed as the cell output)
len(mapping_raw)

1023

In [2]:
# number of proto-frames whose MAPPING entry is truthy
sum(1 for m in mapping_raw if m["MAPPING"])

852

In [3]:
# number of mapped proto-frames whose COMMENTSTATUS is 3 ('PROBLEMATIC')
sum(1 for m in mapping_raw if m["MAPPING"] and m["MAPPING"]["COMMENTSTATUS"] == 3)

468

In [4]:
# Keep only proto-frames that have a mapping with COMMENTSTATUS == 2; this
# drops unmapped entries and every other status, including 'PROBLEMATIC' (3).
mapping_finished = list(filter(
    lambda m: m["MAPPING"] and m["MAPPING"]["COMMENTSTATUS"] == 2,
    mapping_raw,
))
len(mapping_finished)

372

In [5]:
# Optional filter for "trivial" alignments (Berkeley FN name contained in the
# SALSA name). It is currently disabled, so every finished mapping is kept and
# `mapping` is just another name for `mapping_finished`.
# mapping = [m for m in mapping_finished if m['MAPPING']['BERKELEYFN'] not in m['SALSA_NAME']]
mapping = mapping_finished
len(mapping)

372

In [6]:
# (Berkeley FrameNet name, SALSA name) tuples, one per kept mapping entry
pairs = []
for m in mapping:
    pairs.append((m['MAPPING']['BERKELEYFN'], m['SALSA_NAME']))
pairs

[('Commerce_buy', 'Abnehmer1-salsa'),
 ('Abusing', 'Abusing'),
 ('Presence', 'Abwesenheit1-salsa'),
 ('Achieving_first', 'Achieving_first'),
 ('Activity_resume', 'Activity_resume'),
 ('Adding_up', 'Adding_up'),
 ('Distributed_position', 'Adorning'),
 ('Amalgamation', 'Amalgamation'),
 ('Ambient_temperature', 'Ambient_temperature'),
 ('Amounting_to', 'Amounting_to'),
 ('Presence', 'Anwesenheit1-salsa'),
 ('Give_impression', 'Appearance'),
 ('Arrest', 'Arrest'),
 ('Arriving', 'Arriving'),
 ('Assessing', 'Assessing'),
 ('Attack', 'Attack'),
 ('Attempt', 'Attempt'),
 ('Attempt_suasion', 'Attempt_suasion'),
 ('Attention_getting', 'Attention_getting'),
 ('Confronting_problem', 'Auge30-salsa'),
 ('Awareness', 'Awareness'),
 ('Change_event_time', 'Bank6-salsa'),
 ('Becoming_aware', 'Becoming_aware'),
 ('Behind_the_scenes', 'Behind_the_scenes'),
 ('Being_born', 'Being_born'),
 ('Being_located', 'Being_located'),
 ('Being_obligated', 'Being_obligated-fnsalsa'),
 ('Giving_birth', 'Birth'),
 ('Bod

In [7]:
# Frame name -> global frame id (the key used in alignment['frames']), per language.
en_name_gid = {v['name']:k for k, v in alignment['frames'].items() if v['language'] == 'en'}
de_name_gid = {v['name']:k for k, v in alignment['frames'].items() if v['language'] == 'de'}

# Global frame id -> position within the first / second index list of the alignment.
en_gid_idx = {gid:i for i, gid in enumerate(alignment['indices'][0])}
de_gid_idx = {gid:i for i, gid in enumerate(alignment['indices'][1])}

# hardcoding name change. TODO: update data
en_name_gid['Experiencer_focused_emotion'] = en_name_gid['Experiencer_focus']

# Parallel lists: en_idx[n] and de_idx[n] belong to the same gold pair.
en_idx = []
de_idx = []

for x, y in pairs:
    try:
        # Resolve BOTH sides before appending anything. Appending the EN index
        # first and then failing on the DE lookup would leave the two lists
        # with different lengths and silently shift every later gold pair.
        en_i = en_gid_idx[en_name_gid[x]]
        de_i = de_gid_idx[de_name_gid[y]]
    except KeyError:
        # Only a missing name / id is expected here; report the pair and skip
        # it. Any other exception is a real error and is allowed to surface.
        print(x, y)
    else:
        en_idx.append(en_i)
        de_idx.append(de_i)
        

Stimulate_emotion Experiencer_obj
Stimulate_emotion ueberraschen1-salsa
Stimulate_emotion wehtun1-salsa


In [8]:
import numpy as np
from collections import defaultdict

# Position -> global frame id; idx_to_family() indexes this list.
# presumably alignment['indices'][0] is the EN side (it also backs en_gid_idx) — verify
idx_to_gid = alignment['indices'][0]
# Global frame id -> frame record; each record is read for its 'name'.
frames = alignment['frames']

def idx_to_family(idx):
    """Look up frame-family titles for one index or for a sequence of indices.

    Reads the module-level ``idx_to_gid``, ``frames`` and ``frame_families``.

    Args:
        idx: a single integer position into ``idx_to_gid`` (a Python ``int``
            or any NumPy integer scalar), or an iterable of such positions.

    Returns:
        The ``frame_families`` entry for the frame's name when ``idx`` is a
        single integer; otherwise a list with one such entry per element.
    """
    def _lookup(i):
        return frame_families[frames[idx_to_gid[i]]['name']]

    # `np.issubdtype(type(idx), int)` only matches the platform's default
    # integer width, so other NumPy integer scalars (e.g. int32 where the
    # default is int64) fell through to the iteration branch and raised.
    # np.integer covers every NumPy integer scalar type.
    if isinstance(idx, (int, np.integer)):
        return _lookup(idx)

    return [_lookup(i) for i in idx]

# Frame name (TITLE_BFN) -> set of titles of every family the frame belongs to,
# either as the family's constituting frame or as one of its members.
frame_families = defaultdict(set)

for family in frame_families_raw:
    name = family["FAMILY_TITLE"]

    # constituting frame first, then the listed members
    for member in [family['FAMILY_CONSTITUTING_FRAME'], *family["FAMILY_MEMBER"]]:
        frame_families[member['TITLE_BFN']].add(name)


## Evaluate accuracy

In [9]:
import numpy as np

# One flattened score vector per alignment entry, skipping the entries whose
# type is 'attr_matching'.
feature_rows = [
    np.asarray(a['data']).ravel()
    for a in alignment['alignments']
    if a['type'] != 'attr_matching'
]

# Transpose so that each row is one candidate pair and each column one feature.
X = np.stack(feature_rows).T
X.shape

(1249083, 13)

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column of X with a freshly fitted StandardScaler.
# NOTE(review): the scaler is fitted on this evaluation matrix itself; if `clf`
# was trained on features scaled by a different fitted scaler, that scaler
# should presumably be loaded and reused here instead — TODO confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Classifier decision scores for every candidate pair, laid out as a grid with
# one row per entry of alignment['indices'][0] and one column per entry of
# alignment['indices'][1].
n_en = len(alignment['indices'][0])
n_de = len(alignment['indices'][1])
score_grid = clf.decision_function(X_scaled).reshape((n_en, n_de))

# Keep only the columns of the gold-mapped DE frames and transpose, so that
# the DE frames end up in the first dimension.
y_pred = score_grid[:, de_idx].T

In [12]:
# Lax-accuracy considering K-best 
def lax_acc(k):
    """Fraction of rows of ``y_pred`` whose gold index is among the k best scores.

    Reads the module-level ``y_pred`` (2-d score array, one row per evaluated
    item) and ``en_idx`` (gold column index per row).

    Args:
        k: number of highest-scoring columns to consider per row (k >= 0).

    Returns:
        Number of hits divided by ``y_pred.shape[0]``.
    """
    # Rank columns best-first and take the first k. The earlier `[:, -k:]`
    # slice degenerates to `[:, 0:]` for k == 0, i.e. it selected EVERY column
    # and reported a hit for every row instead of none.
    top_k = y_pred.argsort(axis=1)[:, ::-1][:, :k]

    hits = sum(t in k_best for t, k_best in zip(en_idx, top_k))
    return hits / y_pred.shape[0]

print('Accuracies: ')
# Labels are kept verbatim (including the lowercase 'k=1') so the printed
# report is unchanged.
for label, k in (('k=1 ->', 1), ('K=2 ->', 2), ('K=3 ->', 3), ('K=5 ->', 5),
                 ('K=10 ->', 10), ('K=20 ->', 20), ('K=50 ->', 50)):
    print(label, lax_acc(k))

Accuracies: 
k=1 -> 0.6883468834688347
K=2 -> 0.7533875338753387
K=3 -> 0.7750677506775068
K=5 -> 0.7859078590785907
K=10 -> 0.8211382113821138
K=20 -> 0.8373983739837398
K=50 -> 0.8563685636856369


In [13]:
def lax_family_acc(k):
    """Family-level lax accuracy over the k best predictions per row.

    A row counts as a hit when the families of its gold index share at least
    one family with the union of the families of its k best-scoring columns.
    Rows whose gold index has no family are left out of the average.

    Reads the module-level ``y_pred``, ``en_idx`` and ``idx_to_family``.

    Args:
        k: number of highest-scoring columns to consider per row (k >= 0).

    Returns:
        Hits divided by the number of rows whose gold index has a family.

    Raises:
        ZeroDivisionError: if no gold index has a family.
    """
    # Best-first ranking, first k columns. The earlier `[:, -k:]` slice
    # selected every column for k == 0 instead of none.
    top_k = y_pred.argsort(axis=1)[:, ::-1][:, :k]

    intersections = []
    for t, k_best in zip(en_idx, top_k):
        gold_families = idx_to_family(t)
        if len(gold_families) == 0:
            continue
        predicted_families = set().union(*idx_to_family(k_best))
        intersections.append(len(gold_families.intersection(predicted_families)) > 0)

    return sum(intersections) / len(intersections)

print('Family accuracies: ')
# Labels are kept verbatim (including the lowercase 'k=1') so the printed
# report is unchanged.
for label, k in (('k=1 ->', 1), ('K=2 ->', 2), ('K=3 ->', 3), ('K=5 ->', 5),
                 ('K=10 ->', 10), ('K=20 ->', 20), ('K=50 ->', 50)):
    print(label, lax_family_acc(k))

Family accuracies: 
k=1 -> 0.760932944606414
K=2 -> 0.8075801749271136
K=3 -> 0.8221574344023324
K=5 -> 0.8600583090379009
K=10 -> 0.880466472303207
K=20 -> 0.9067055393586005
K=50 -> 0.9475218658892128


In [14]:
def lax_jaccard_acc(k):
    """Mean best Jaccard overlap between gold families and the k best predictions.

    For every row whose gold index has at least one family, the Jaccard index
    (|intersection| / |union|) between the gold family set and the family set
    of each of the k best-scoring columns is computed; the row's score is the
    maximum of these. Rows whose gold index has no family are left out.

    Reads the module-level ``y_pred``, ``en_idx`` and ``idx_to_family``.

    Args:
        k: number of highest-scoring columns to consider per row (k >= 0).

    Returns:
        Mean of the per-row best Jaccard scores.

    Raises:
        ZeroDivisionError: if no gold index has a family.
    """
    # Best-first ranking, first k columns. The earlier `[:, -k:]` slice
    # selected every column for k == 0 instead of none.
    top_k = y_pred.argsort(axis=1)[:, ::-1][:, :k]

    jaccard = []
    for t, k_best in zip(en_idx, top_k):
        gold_families = idx_to_family(t)  # looked up once per row
        if len(gold_families) == 0:
            continue
        # gold_families is non-empty here, so every union below is non-empty
        # and the division is safe; `default` covers the k == 0 case where
        # there are no candidates at all.
        jaccard.append(max(
            (
                len(gold_families.intersection(idx_set)) / len(gold_families.union(idx_set))
                for idx_set in idx_to_family(k_best)
            ),
            default=0.0,
        ))

    return sum(jaccard) / len(jaccard)
    

print('Jaccard family accuracies: ')
# Labels are kept verbatim (including the lowercase 'k=1') so the printed
# report is unchanged.
for label, k in (('k=1 ->', 1), ('K=2 ->', 2), ('K=3 ->', 3), ('K=5 ->', 5),
                 ('K=10 ->', 10), ('K=20 ->', 20), ('K=50 ->', 50)):
    print(label, lax_jaccard_acc(k))

Jaccard family accuracies: 
k=1 -> 0.7152145473574045
K=2 -> 0.7721063516981884
K=3 -> 0.7912767828219722
K=5 -> 0.8114952103290296
K=10 -> 0.8538699153130639
K=20 -> 0.8717791198111897
K=50 -> 0.9103498542274054


## Checking wrong predictions

In [15]:
# Top-1 predicted column per row, and a boolean mask of rows where it differs
# from the gold index.
top1_pred = y_pred.argmax(axis=1)
wrong_idx = top1_pred != en_idx

# Gold EN / DE positions and predicted EN position of every wrong row.
wrong_en_idx = np.array(en_idx)[wrong_idx]
wrong_de_idx = np.array(de_idx)[wrong_idx]
wrong_pred_idx = top1_pred[wrong_idx]

# Translate positions into global frame ids.
first_side_gids = np.array(alignment['indices'][0])
second_side_gids = np.array(alignment['indices'][1])

wrong_en_gid = first_side_gids[wrong_en_idx]
wrong_de_gid = second_side_gids[wrong_de_idx]
wrong_pred_gid = first_side_gids[wrong_pred_idx]

frames = alignment['frames']

# (DE frame name, gold EN frame name, predicted EN frame name) per wrong row
wrong = []
for e, d, p in zip(wrong_en_gid, wrong_de_gid, wrong_pred_gid):
    wrong.append((frames[d]['name'], frames[e]['name'], frames[p]['name']))

wrong

[('Abnehmer1-salsa', 'Commerce_buy', 'Commercial_transaction'),
 ('Attack', 'Attack', 'Counterattack'),
 ('Attempt', 'Attempt', 'Accomplishment'),
 ('Bank6-salsa', 'Change_event_time', 'Means'),
 ('Being_obligated-fnsalsa', 'Being_obligated', 'Product_line'),
 ('Buildings-fnsalsa', 'Buildings', 'Physical_artworks'),
 ('Change_of_state_scenario', 'Undergo_change', 'Dynamic_situation_scenario'),
 ('Change_position_on_a_scale-fnsalsa', 'Change_position_on_a_scale', 'Trial'),
 ('Choosing-fnsalsa', 'Choosing', 'Alternatives'),
 ('Coming_to_be', 'Coming_to_be', 'Transition_to_a_situation'),
 ('Creating', 'Creating', 'Intentionally_create'),
 ('Desiring-fnsalsa', 'Desiring', 'Living_conditions'),
 ('Eingang4-salsa', 'Process_start', 'History_scenario'),
 ('Employing-fnsalsa', 'Employing', 'Attempting_and_resolving_scenario'),
 ('Eroberung1-salsa', 'Getting', 'Getting_scenario'),
 ('Eroberung3-salsa', 'Conquering', 'Precariousness'),
 ('Experiencer_subj', 'Experiencer_focus', 'Tolerating'),
 (

## Export predictions

In [16]:
# Decision scores for all candidate pairs.
# NOTE: this rebinds `y_pred` to the flat score vector; cells that expect the
# 2-d `y_pred` need it to be recomputed before they are re-run.
y_pred = clf.decision_function(X_scaled)

# 2-d grid, transposed so that the entries of alignment['indices'][1] are rows.
grid_shape = (len(alignment['indices'][0]), len(alignment['indices'][1]))
y_pred_reshaped = y_pred.reshape(grid_shape).T

# Column indices of the three highest scores per row, best first.
ranked_ascending = y_pred_reshaped.argsort(axis=1)
best_pred = ranked_ascending[:, -3:][:, ::-1]

In [17]:
# DE frame name -> names of its best-predicted EN frames (order of best_pred).
frm_pred = {}
frames = alignment['frames']
en_side_gids = alignment['indices'][0]
de_side_gids = alignment['indices'][1]

for i, preds in enumerate(best_pred):
    de_frm = frames[de_side_gids[i]]['name']
    en_frms = [frames[en_side_gids[x]]['name'] for x in preds]

    frm_pred[de_frm] = en_frms

In [18]:
# One export record per entry of the manual mapping file: the SALSA name, the
# manual mapping fields when a mapping exists, and the three predictions.
out_data = []

for m in mapping_raw:
    salsa_name = m['SALSA_NAME']
    manual = m['MAPPING']

    out = {'SALSA_NAME': salsa_name}

    if manual:
        out['COMMENTSTATUS'] = manual['COMMENTSTATUS']
        out['COMMENTSTATUSTEXT'] = manual['COMMENTSTATUSTEXT']
        # BERKELEYFN is optional inside a mapping
        if 'BERKELEYFN' in manual:
            out['BERKELEYFN'] = manual['BERKELEYFN']

    top_three = frm_pred[salsa_name]
    out['MAPPINGPREDICTION1'] = top_three[0]
    out['MAPPINGPREDICTION2'] = top_three[1]
    out['MAPPINGPREDICTION3'] = top_three[2]

    out_data.append(out)

In [19]:
import pandas as pd

# Write the export records to CSV in the working directory.
predictions_table = pd.DataFrame.from_records(out_data)
predictions_table.to_csv('salsa-predictions.csv')

## Find family representatives

In [20]:
# Recompute the decision scores and arrange them as a grid with one row per
# entry of alignment['indices'][0] and one column per entry of
# alignment['indices'][1].
grid_dims = tuple(len(gids) for gids in alignment['indices'][:2])
y_pred = clf.decision_function(X_scaled).reshape(grid_dims)
# Select the gold-mapped DE columns and transpose so DE frames are in 1st dim.
y_pred = y_pred[:, de_idx].T

In [40]:
import numpy as np

# (distinct indices, occurrence counts) for the top-1 predictions ...
top1_columns = y_pred.argmax(axis=1)
unique_counts = np.unique(top1_columns, return_counts=True)
# ... and for the gold indices.
unique_counts_true = np.unique(en_idx, return_counts=True)

In [50]:
from collections import defaultdict

idx_to_gid = alignment['indices'][0]
frames = alignment['frames']


def _counts_by_frame_name(values_and_counts):
    """Map frame name -> count for an ``np.unique(..., return_counts=True)`` pair.

    Names that do not occur read as 0 because the result is a defaultdict(int).
    """
    indices, counts = values_and_counts[0], values_and_counts[1]
    return defaultdict(int, {
        frames[idx_to_gid[idx]]['name']: count
        for idx, count in zip(indices, counts)
    })


# how often each frame is the top-1 prediction / the gold answer
frame_counts = _counts_by_frame_name(unique_counts)
frame_counts_true = _counts_by_frame_name(unique_counts_true)

# family title -> {frame name: top-1 prediction count}, with the constituting
# frame listed first and the family members after it
family_frame_count = dict()
for family in frame_families_raw:
    family_name = family["FAMILY_TITLE"]
    constituting_frame = family["FAMILY_CONSTITUTING_FRAME"]["TITLE_BFN"]

    family_titles = [constituting_frame]
    family_titles.extend(member["TITLE_BFN"] for member in family["FAMILY_MEMBER"])

    member_frames = {title: frame_counts[title] for title in family_titles}

    family_frame_count[family_name] = member_frames

In [52]:
# Families shared by the gold frame and a WRONG top-1 prediction, i.e. errors
# that still land inside one of the gold frame's families.
error_families = set()

for t, p in zip(en_idx, y_pred.argmax(axis=1)):
    true_fam, pred_fam = idx_to_family(t), idx_to_family(p)
    intersect = true_fam.intersection(pred_fam)

    if t == p or len(intersect) == 0:
        continue

    error_families.update(intersect)

In [54]:
# Print the per-frame prediction counts of every error family, skipping
# families with more than 10 frames and families whose counts sum to at most 1.
for f in error_families:
    family_counts = family_frame_count[f]

    too_large = len(family_counts.keys()) > 10
    too_few_predictions = sum(family_counts.values()) <= 1

    if too_large or too_few_predictions:
        continue

    print(family_counts)

{'Attack': 0, 'Besieging': 0, 'Invading': 0, 'Counterattack': 1, 'Suicide_attack': 0, 'Defending': 1}
{'Make_noise': 1, 'Impact': 0, 'Cause_impact': 1, 'Motion_noise': 1, 'Sound_movement': 0, 'Cause_to_make_noise': 0, 'Sounds': 0, 'Communication_noise': 0, 'Friction': 0}
{'Cause_change': 2, 'Undergo_change': 0, 'Change_event_duration': 0, 'Change_event_time': 0, 'Adjusting': 0, 'Reforming_a_system': 0, 'Exchange_currency': 0}
{'Locale_by_ownership': 1, 'Political_locales': 1, 'Relational_political_locales': 0, 'Foreign_or_domestic_country': 0}
{'Replacing': 2, 'Take_place_of': 1, 'Exchange': 0}
{'Cause_impact': 1, 'Impact': 0, 'Make_noise': 1, 'Hit_target': 1, 'Motion_noise': 1, 'Sound_movement': 0, 'Cause_to_make_noise': 0, 'Sounds': 0, 'Friction': 0}
{'Impact': 0, 'Make_noise': 1, 'Friction': 0, 'Cause_impact': 1, 'Motion_noise': 1, 'Sound_movement': 0, 'Cause_to_make_noise': 0, 'Sounds': 0}
{'Motion_noise': 1, 'Make_noise': 1, 'Sound_movement': 0, 'Cause_to_make_noise': 0}
{'Categor