In [1]:
from collections import defaultdict, Counter
from pathlib import Path
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from dep_tregex.conll import read_trees_conll
from dep_tregex.tree_script import parse_pattern
from dep_tregex.tree_pattern import *

## Data prep

In [2]:
# Speaker code to analyse: it selects the data_conllu/<speaker>/ input
# directory and is embedded in the output file names written by this notebook.
# NOTE(review): "MOT" is presumably the CHILDES code for the mother — confirm.
speaker = "MOT"

In [23]:
# Tag values that count as a verb for the purposes of this analysis.
# NOTE(review): "cop" / "v" look like copula and main-verb tags — confirm
# against the tag set used in the CoNLL-U files.
vb_tags = ["cop", "v"]
# Pattern applied to each tree node; presumably it tests the node's "cpostags"
# attribute with the given predicate (verify against dep_tregex).
# The lambda looks up the global vb_tags at call time, so rebinding vb_tags
# later changes what the pattern accepts.
# NOTE(review): And([...]) wraps a single condition — presumably kept so that
# further conditions can be added.
vb_pattern = And([AttrMatches("cpostags", lambda tag: tag in vb_tags)])

In [24]:
# Maps a verb key to a list of (source_name, children_desc) records; the
# extraction loop appends one record per matched verb node.
matches = defaultdict(list)

In [25]:
# Collect, for every node matched by vb_pattern in the speaker's files, the
# dependency relations and word forms of its immediate children.
#
# `matches` is (re)initialised in this cell so that re-running the cell does
# not append a second copy of every record to the existing lists.
matches = defaultdict(list)

# Sorted so the order of the collected records does not depend on the order in
# which the filesystem happens to list the directory.
for path in sorted(Path(f"data_conllu/{speaker}").glob("*.conllu")):
    with path.open("r", encoding="utf-8") as f:
        for i, tree in enumerate(read_trees_conll(f)):
            # Identifies the tree by its file path and position within the file.
            source_name = f"{path}:{i}"
            # Node indices run from 1 to len(tree) inclusive.
            for node in range(1, len(tree) + 1):
                if not vb_pattern.match(tree, node, {}):
                    continue
                # The verb key is the node's form up to the first "-".
                # NOTE(review): this is derived from the form, not from a
                # lemma field — confirm that is intended.
                lemma = tree.forms(node).split("-")[0]
                # One (deprel, form) pair per immediate child of the verb.
                children_desc = [(tree.deprels(child), tree.forms(child))
                                 for child in tree.children(node)]
                matches[lemma].append((source_name, children_desc))

In [26]:
# For each verb, count how often each ordered sequence of child deprels occurs.
matches_grouped = defaultdict(Counter)
for verb, occurrences in matches.items():
    # Reduce each record to its tuple of deprels; the source name and the
    # child word forms are not needed for counting.
    frames = [tuple(rel for rel, _form in children)
              for _source, children in occurrences]
    if frames:
        matches_grouped[verb].update(frames)

In [27]:
# Spot-check: the ten most frequent child-deprel sequences recorded for "throw".
# .get() is used instead of indexing because matches_grouped is a defaultdict:
# indexing a verb that is absent would insert an empty Counter, which would
# then be carried into everything derived from matches_grouped afterwards.
matches_grouped.get("throw", Counter()).most_common(10)

[(('SUBJ', 'OBJ', 'JCT'), 4),
 (('INF', 'OBJ', 'JCT'), 2),
 (('SUBJ', 'OBJ'), 1),
 (('LINK', 'SUBJ', 'AUX', 'OBJ'), 1),
 (('COM', 'SUBJ', 'AUX', 'COMP'), 1),
 (('SUBJ', 'JCT', 'SUBJ', 'OBJ', 'JCT'), 1),
 (('AUX', 'OBJ', 'JCT', 'JCT'), 1),
 (('AUX', 'OBJ', 'JCT'), 1),
 (('SUBJ', 'OBJ', 'JCT', 'JCT'), 1),
 (('LINK', 'SUBJ', 'OBJ', 'JCT'), 1)]

In [28]:
import json

# JSON object keys must be strings, so every deprel tuple is flattened into a
# single space-separated string before dumping.
matches_grouped_json = {}
for vb, v_grouped in matches_grouped.items():
    matches_grouped_json[vb] = {" ".join(seq): count
                                for seq, count in v_grouped.items()}

# Store the raw records together with the aggregated counts in one file.
payload = {"matches": matches, "matches_grouped": matches_grouped_json}
with open(f"verb_frames_{speaker}.json", "w") as out_f:
    json.dump(payload, out_f)

## Data analysis

In [29]:
# Verbs selected for the frame analysis. A verb is listed once under every
# category it belongs to; the set collapses the repeats.
verbs_of_interest = {
    ## lineland
    # agent-agent-object
    "give", "find",
    # agent-agent
    "pull", "push", "take", "find",
    # agent-object
    "pull", "push", "find", "get", "have", "hit", "knock", "play",
    # object-object
    "push", "knock",
    # agent
    "go", "come", "shake",
    # object
    "fall", "go", "come",

    ## requires upright agent with legs
    # agent-object
    "sit",
    # agent
    "sit",

    ## requires agent with arms
    # agent-agent
    "hit",
    # agent-object
    "put", "set", "throw", "hold", "hit", "catch", "shake",
    # agent
    "knock", "shake",

    ## landscapes / obstructions
    "fit",

    ## affordances
    "open", "close",

    ## deformations
    "break", "tear",

    ## other
    "ride",
}

In [30]:
# Build one single-row DataFrame per selected verb: the row is labelled with
# the verb and has one column per frame string.
count_dfs = {}
for verb, frame_counts in matches_grouped_json.items():
    if verb not in verbs_of_interest:
        continue
    # orient="index" gives one row per frame; transposing turns that into a
    # single row for the verb.
    per_frame = pd.DataFrame.from_dict(frame_counts, orient="index", columns=[verb])
    count_dfs[verb] = per_frame.T

In [31]:
# Spot-check the single-row frame-count table built for one verb.
# count_dfs is a plain dict, so this raises KeyError if "ride" is not in it.
count_dfs["ride"]

Unnamed: 0,SUBJ JCT CONJ,INF JCT,LINK SUBJ JCT,JCT OBJ,SUBJ JCT,LINK SUBJ OBJ,LINK SUBJ,AUX SUBJ OBJ,INF JCT JCT
ride,1,1,3,1,1,1,1,1,1


In [32]:
# Stack the per-verb rows into one verb-by-frame matrix. sort=True orders the
# frame columns; a frame a verb never took is missing from its row and is
# filled in as a zero count.
stacked_counts = pd.concat(count_dfs.values(), sort=True)
count_df = stacked_counts.fillna(0.)

In [33]:
# Discard frames whose total count over all selected verbs is below 10.
frame_totals = count_df.sum(axis=0)
rare_frames = count_df.columns[frame_totals < 10]
# Rebind rather than mutate in place, so the step reads as a plain transformation.
count_df = count_df.drop(columns=rare_frames)

In [34]:
# Write count_df to a CSV file whose name carries the speaker code.
count_df.to_csv(f"verb_frames_{speaker}_matrix.csv")

In [35]:
from scipy.spatial import distance

In [36]:
# Turn each verb's frame counts into proportions of that verb's row total.
# NOTE(review): a verb whose row total is 0 would become a row of NaN here and
# give NaN distances — confirm no selected verb ends up with an all-zero row.
row_totals = count_df.sum(axis=1)
count_df_norm = count_df.div(row_totals, axis=0)

# Condensed vector of Euclidean distances between every pair of verb rows.
distances = distance.pdist(count_df_norm, metric="euclidean")

In [37]:
def smallest_n(a, n):
    """Return the ``n`` smallest values of ``a`` in ascending order.

    Parameters
    ----------
    a : array-like
        Values to select from; intended for 1-D input.
    n : int
        Number of values wanted. When ``n`` is at least the number of
        elements, all values are returned sorted.

    Returns
    -------
    numpy.ndarray
        At most ``n`` values, sorted ascending.
    """
    a = np.asarray(a)
    # np.partition requires kth < size and raises ValueError otherwise, so a
    # request for "everything" (or more) falls back to a plain sort.
    if n >= a.shape[-1]:
        return np.sort(a)[:n]
    return np.sort(np.partition(a, n)[:n])

def argsmallest_n(a, n):
    """Return the indices of the ``n`` smallest values of ``a``, closest first.

    Parameters
    ----------
    a : array-like
        Values to rank; intended for 1-D input.
    n : int
        Number of indices wanted. When ``n`` is at least the number of
        elements, the indices of all values are returned in ascending order
        of value.

    Returns
    -------
    numpy.ndarray
        At most ``n`` indices into ``a``, ordered by ascending value.
    """
    a = np.asarray(a)
    # np.argpartition requires kth < size and raises ValueError otherwise, so
    # a request for "everything" (or more) falls back to a full argsort.
    if n >= a.shape[-1]:
        return np.argsort(a)[:n]
    # Select the n smallest in arbitrary order, then order just those n.
    ret = np.argpartition(a, n)[:n]
    b = np.take(a, ret)
    return np.take(ret, np.argsort(b))

In [38]:
# Positions in `distances`, ordered from the closest pair to the farthest.
# A full argsort ranks every pair. np.argpartition needs kth < len(a), so a
# partition-based selection of len(distances) - 1 entries would leave out the
# most distant pair.
closest = np.argsort(distances)

In [39]:
# Map positions in the condensed distance vector back to (row, row) pairs of
# count_df_norm: the strict upper-triangle indices enumerate i < j in the same
# order as the condensed vector.
tu = np.triu_indices(len(count_df_norm), k=1)
upper_rows, upper_cols = tu
pairs = np.column_stack((upper_rows[closest], upper_cols[closest]))

In [40]:
# Take the two rows of the top-ranked pair, keep only frames where at least
# one of them is non-zero, and show the dot product of the two rows.
top_pair_rows = count_df_norm.iloc[pairs[0]]
used_frames = top_pair_rows.ne(0).any(axis=0)
subs = top_pair_rows.loc[:, used_frames]
subs.iloc[0].dot(subs.iloc[1])

0.04594141649664821

In [41]:
# List every ranked pair as ((verb_a, verb_b), value), in ranking order.
# The value shown is the sum of squared differences (squared Euclidean
# distance), whereas the ranking came from unsquared distances; the order is
# the same either way.
[
    (
        tuple(count_df_norm.index[pair]),
        ((count_df_norm.iloc[pair[0]] - count_df_norm.iloc[pair[1]]) ** 2).sum(),
    )
    for pair in pairs
]

[(('get', 'hold'), 0.02697087971799631),
 (('get', 'find'), 0.030498791747886993),
 (('have', 'find'), 0.03139066839883203),
 (('sit', 'come'), 0.031516137385669725),
 (('have', 'get'), 0.0336465423929241),
 (('have', 'open'), 0.03648259002605198),
 (('break', 'hold'), 0.037956611066615596),
 (('take', 'hold'), 0.037993730633557765),
 (('get', 'break'), 0.0395658010595147),
 (('get', 'take'), 0.039586142489041035),
 (('take', 'open'), 0.03994598337950139),
 (('get', 'open'), 0.04234978419966222),
 (('open', 'hold'), 0.044876414667270256),
 (('open', 'find'), 0.04595918367346939),
 (('have', 'hold'), 0.04622918559748904),
 (('push', 'hold'), 0.04660090487360095),
 (('have', 'break'), 0.047909186723863316),
 (('find', 'hold'), 0.05289030958694024),
 (('take', 'find'), 0.05313824975973769),
 (('put', 'set'), 0.05483878643948982),
 (('take', 'push'), 0.05517551277956108),
 (('get', 'push'), 0.06446216993651387),
 (('have', 'take'), 0.06697555650389822),
 (('break', 'open'), 0.068),
 (('bre

In [45]:
# Compare two verbs side by side, showing only frames used by at least one.
query = count_df_norm.loc[["get", "hold"]]
frames_in_use = query.ne(0).any(axis=0)
query.loc[:, frames_in_use]

Unnamed: 0,Unnamed: 1,AUX OBJ,AUX OBJ JCT,AUX SUBJ JCT,AUX SUBJ JCT JCT,AUX SUBJ OBJ,AUX SUBJ OBJ JCT,COM JCT,COM JCT JCT,COM OBJ JCT,...,SUBJ JCT,SUBJ JCT CONJ,SUBJ JCT JCT,SUBJ JCT OBJ,SUBJ OBJ,SUBJ OBJ JCT,SUBJ OBJ JCT JCT,SUBJ OBJ OBJ,SUBJ XCOMP,XCOMP
get,0.006849,0.001712,0.003425,0.003425,0.003425,0.02911,0.02226,0.001712,0.003425,0.001712,...,0.011986,0.001712,0.006849,0.005137,0.131849,0.053082,0.003425,0.005137,0.063356,0.015411
hold,0.021277,0.0,0.0,0.0,0.0,0.021277,0.042553,0.0,0.0,0.0,...,0.0,0.0,0.0,0.021277,0.170213,0.06383,0.0,0.0,0.0,0.0
