In [132]:
from collections import defaultdict, Counter
from pathlib import Path
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from dep_tregex.conll import read_trees_conll
from dep_tregex.tree_script import parse_pattern
from dep_tregex.tree_pattern import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data prep

In [2]:
# Speaker identifier: selects the data_conllu/<speaker>/ input directory and is
# embedded in the names of the JSON and CSV files this notebook writes.
speaker = "MOT"

In [3]:
# Coarse POS tags accepted as verbs (presumably copula and main verb -- confirm
# against the corpus tag set).
vb_tags = ["cop", "v"]

# A node matches when both sub-patterns hold: IsTop() and a "cpostags" value
# found in vb_tags. NOTE(review): IsTop() presumably selects the root of the
# tree -- confirm in dep_tregex.tree_pattern.
vb_pattern = And([
    IsTop(),
    AttrMatches("cpostags", lambda tag: tag in vb_tags),
])

In [4]:
# Maps a verb key (the text of a matched node's form before the first "-") to a
# list of (source_name, children_desc) records; filled in by the corpus scan.
matches = defaultdict(list)

In [5]:
# Scan every CoNLL-U file for this speaker and, for each node matching
# vb_pattern, record the (deprel, form) of each of its children.
#
# Files are visited in sorted order: Path.glob() yields entries in an arbitrary,
# filesystem-dependent order, which would otherwise make the order of `matches`
# (and everything derived from it: the JSON dump, tie order in most_common(),
# row order of the count matrix) differ between machines and runs.
for path in sorted(Path(f"data_conllu/{speaker}").glob("*.conllu")):
    with path.open("r", encoding="utf-8") as f:
        for i, tree in enumerate(read_trees_conll(f)):
            # Identifies the tree by file path and position within that file.
            source_name = f"{path}:{i}"
            # Nodes are addressed by 1-based index.
            for node in range(1, len(tree) + 1):
                if not vb_pattern.match(tree, node, {}):
                    continue
                # Key the verb on the part of its form before the first "-".
                lemma = tree.forms(node).split("-")[0]
                # fetch immediate children
                children_desc = [(tree.deprels(child), tree.forms(child))
                                 for child in tree.children(node)]
                matches[lemma].append((source_name, children_desc))

In [6]:
# Per verb, count how often each ordered sequence of child dependency
# relations occurs (the child forms are ignored here).
matches_grouped = defaultdict(Counter)
for verb, v_matches in matches.items():
    for _source, children in v_matches:
        frame = tuple(rel for rel, _form in children)
        matches_grouped[verb][frame] += 1

In [7]:
# Spot-check: the ten most frequent child-deprel sequences recorded for "throw".
# (matches_grouped is a defaultdict, so a missing verb yields an empty Counter.)
matches_grouped["throw"].most_common(10)

[(('SUBJ', 'OBJ', 'JCT'), 3),
 (('SUBJ', 'OBJ'), 1),
 (('LINK', 'SUBJ', 'AUX', 'OBJ'), 1),
 (('COM', 'SUBJ', 'AUX', 'COMP'), 1),
 (('AUX', 'OBJ', 'JCT', 'JCT'), 1),
 (('AUX', 'OBJ', 'JCT'), 1),
 (('SUBJ', 'OBJ', 'JCT', 'JCT'), 1),
 (('COM', 'SUBJ', 'OBJ', 'JCT'), 1),
 (('SUBJ', 'JCT', 'JCT'), 1),
 (('SUBJ', 'AUX', 'OBJ'), 1)]

In [8]:
import json

# The grouped counts are keyed by tuples; each tuple is flattened into a
# space-separated string so it can serve as a JSON object key.
matches_grouped_json = {
    verb: {" ".join(frame): n for frame, n in frame_counter.items()}
    for verb, frame_counter in matches_grouped.items()
}

# Persist both the raw match records and the per-verb frame counts.
with open(f"verb_frames_{speaker}.json", "w") as out_f:
    json.dump(dict(matches=matches, matches_grouped=matches_grouped_json), out_f)

## Data analysis

In [9]:
# Verbs to keep for the analysis, grouped by scenario and argument structure.
# A verb may be listed under several groups; the set keeps one copy of each.
verbs_of_interest = {
    # ---- lineland ----
    # agent-agent-object
    "give", "find",
    # agent-agent
    "pull", "push", "take", "find",
    # agent-object
    "pull", "push", "find", "get", "have", "hit", "knock", "play",
    # object-object
    "push", "knock",
    # agent
    "go", "come", "shake",
    # object
    "fall", "go", "come",

    # ---- requires upright agent with legs ----
    # agent-object
    "sit",
    # agent
    "sit",

    # ---- requires agent with arms ----
    # agent-agent
    "hit",
    # agent-object
    "put", "set", "throw", "hold", "hit", "catch", "shake",
    # agent
    "knock", "shake",

    # ---- landscapes / obstructions ----
    "fit",

    # ---- affordances ----
    "open", "close",

    # ---- deformations ----
    "break", "tear",

    # ---- other ----
    "ride",
}

In [32]:
# One single-row DataFrame per verb of interest: the row is labelled with the
# verb and the columns are that verb's frame strings, holding their counts.
count_dfs = {
    verb: pd.DataFrame.from_dict(frame_counts, orient="index", columns=[verb]).T
    for verb, frame_counts in matches_grouped_json.items()
    if verb in verbs_of_interest
}

In [34]:
# Spot-check the one-row frame-count table built for "ride".
count_dfs["ride"]

Unnamed: 0,AUX SUBJ OBJ
ride,1


In [35]:
# Stack the per-verb rows into one verb x frame matrix. sort=True orders the
# frame columns; a frame a verb never occurred with becomes a count of 0.
count_df = (
    pd.concat(count_dfs.values(), sort=True)
    .fillna(0.)
)

In [36]:
# Drop extremely rare frames: columns whose total count over all verbs is
# below 10. The result is rebound rather than dropped with inplace=True, so the
# cell produces a new frame from its input instead of mutating it.
rare_frames = count_df.columns[count_df.sum(axis=0) < 10]
count_df = count_df.drop(columns=rare_frames)

In [37]:
# Save the verb x frame count matrix to a CSV file named after the speaker.
count_df.to_csv(f"verb_frames_{speaker}_matrix.csv")

In [39]:
from scipy.spatial import distance

In [116]:
# Normalise each verb's frame counts into relative frequencies (each row sums
# to 1), then compute the Euclidean distance between every pair of verbs.
row_totals = count_df.sum(axis=1)
# A verb left with no counts (e.g. all of its frames were removed as rare) has
# a zero total; dividing would produce an all-NaN row and a NaN distance for
# every pair involving it, so such verbs are excluded from the comparison.
has_frames = row_totals > 0
count_df_norm = count_df.loc[has_frames].div(row_totals[has_frames], axis=0)
# Condensed distance vector: one entry per unordered pair of rows.
distances = distance.pdist(count_df_norm, metric="euclidean")

In [117]:
def smallest_n(a, n):
    """Return the ``n`` smallest values of ``a`` in ascending order.

    Parameters
    ----------
    a : array_like
        Input values.
    n : int
        Number of values to return.

    Returns
    -------
    numpy.ndarray
        The ``n`` smallest values, sorted ascending. For 1-D input with
        ``n >= len(a)`` (including empty input) all values are returned sorted.
    """
    values = np.asarray(a)
    # np.partition requires kth < size and raises otherwise; when every element
    # is requested a plain sort is the answer.
    if values.ndim == 1 and n >= values.size:
        return np.sort(values)
    return np.sort(np.partition(values, n)[:n])

def argsmallest_n(a, n):
    """Return the indices of the ``n`` smallest values of ``a``, smallest first.

    Parameters
    ----------
    a : array_like
        Input values.
    n : int
        Number of indices to return.

    Returns
    -------
    numpy.ndarray
        Indices into ``a`` ordered by ascending value. For 1-D input with
        ``n >= len(a)`` (including empty input) the indices of all elements are
        returned.
    """
    values = np.asarray(a)
    # np.argpartition requires kth < size and raises otherwise; when every
    # element is requested a full argsort is the answer.
    if values.ndim == 1 and n >= values.size:
        return np.argsort(values)
    # Select the n smallest (unordered), then order just those by value.
    candidates = np.argpartition(values, n)[:n]
    candidate_values = np.take(values, candidates)
    return np.take(candidates, np.argsort(candidate_values))

In [135]:
# Rank every verb pair from closest to farthest (indices into `distances`).
# A full argsort is used instead of a partition with n = len(distances) - 1:
# that bound only sidestepped np.argpartition's kth < size requirement, silently
# omitted the farthest pair, and failed outright on an empty distance vector.
# kind="stable" keeps the order of tied distances deterministic.
closest = np.argsort(distances, kind="stable")

In [136]:
# Translate ranked positions in `distances` back into (row, row) positions of
# count_df_norm. This assumes the condensed pdist vector lists pairs in the same
# order as the strict upper triangle returned by np.triu_indices(n, 1).
tu = np.triu_indices(len(count_df_norm), 1)
pairs = np.stack([tu[0][closest], tu[1][closest]], axis=1)

In [137]:
# Take the two rows of the top-ranked pair, keep only the frames where at least
# one of them is non-zero, and show the dot product of the two rows.
subs = count_df_norm.iloc[pairs[0]].loc[:, lambda rows: (rows != 0).any(axis=0)]
subs.iloc[0].dot(subs.iloc[1])

0.06652641155398875

In [138]:
# For every ranked pair: the two verb labels and the sum of squared differences
# between their rows (i.e. the squared Euclidean distance, not the distance).
[
    (
        tuple(count_df_norm.iloc[pair].index),
        ((count_df_norm.iloc[pair[0]] - count_df_norm.iloc[pair[1]]) ** 2).sum(),
    )
    for pair in pairs
]

[(('have', 'find'), 0.035862728317655836),
 (('get', 'hold'), 0.04409430819720581),
 (('come', 'sit'), 0.044587315786083694),
 (('have', 'open'), 0.04515221202659484),
 (('find', 'open'), 0.045661828060165825),
 (('get', 'find'), 0.04875096405861477),
 (('have', 'get'), 0.05623534447883092),
 (('take', 'get'), 0.05958764792899409),
 (('take', 'push'), 0.06548816568047339),
 (('get', 'push'), 0.0671646942800789),
 (('take', 'open'), 0.06804079515297798),
 (('take', 'hold'), 0.06812083997808004),
 (('get', 'break'), 0.06917324128862591),
 (('get', 'open'), 0.07053411448872908),
 (('take', 'find'), 0.07139614088536582),
 (('push', 'hold'), 0.07220256677072492),
 (('put', 'set'), 0.07859789086127547),
 (('break', 'hold'), 0.08051797895710487),
 (('have', 'hold'), 0.08682131689647112),
 (('break', 'push'), 0.08888888888888889),
 (('find', 'hold'), 0.09115707283910249),
 (('come', 'pull'), 0.09257545186124509),
 (('sit', 'pull'), 0.09283744403681157),
 (('hold', 'open'), 0.09573361082206033)