In [37]:
from collections import defaultdict, Counter
from pathlib import Path
%load_ext autoreload
%autoreload 2

import pandas as pd

from dep_tregex.conll import read_trees_conll
from dep_tregex.tree_script import parse_pattern
from dep_tregex.tree_pattern import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data prep

In [28]:
speaker = "MOT"

In [29]:
vb_tags = ["cop", "v"]
vb_pattern = And([IsTop(), AttrMatches("cpostags", lambda tag: tag in vb_tags)])

In [30]:
matches = defaultdict(list)

In [68]:
for path in Path(f"data_conllu/{speaker}").glob("*.conllu"):
    with path.open("r", encoding="utf-8") as f:
        for i, tree in enumerate(read_trees_conll(f)):
            source_name = f"{path}:{i}" 
            for node in range(1, len(tree) + 1):
                if vb_pattern.match(tree, node, {}):
                    # fetch immediate children
                    lemma = tree.forms(node).split("-")[0]
                    children_desc = [(tree.deprels(child), tree.forms(child)) 
                                     for child in tree.children(node)]
                    matches[lemma].append((source_name, children_desc))

In [69]:
# accumulate by deprel sequence
matches_grouped = defaultdict(Counter)
for verb, v_matches in matches.items():
    for source_name, v_match in v_matches:
        deprel_seq = tuple(deprel for deprel, _ in v_match)
        matches_grouped[verb][deprel_seq] += 1

In [70]:
matches_grouped["throw"].most_common(10)

[(('SUBJ', 'OBJ', 'JCT'), 15),
 (('SUBJ', 'OBJ'), 5),
 (('LINK', 'SUBJ', 'AUX', 'OBJ'), 5),
 (('COM', 'SUBJ', 'AUX', 'COMP'), 5),
 (('AUX', 'OBJ', 'JCT', 'JCT'), 5),
 (('AUX', 'OBJ', 'JCT'), 5),
 (('SUBJ', 'OBJ', 'JCT', 'JCT'), 5),
 (('COM', 'SUBJ', 'OBJ', 'JCT'), 5),
 (('SUBJ', 'JCT', 'JCT'), 5),
 (('SUBJ', 'AUX', 'OBJ'), 5)]

In [71]:
import json
matches_grouped_json = {vb: {" ".join(seq): count for seq, count in v_grouped.items()}
                        for vb, v_grouped in matches_grouped.items()}
with open(f"verb_frames_{speaker}.json", "w") as out_f:
    json.dump({"matches": matches, "matches_grouped": matches_grouped_json}, out_f)

## Data analysis

In [72]:
verbs_of_interest = set([
    ## lineland
    # agent-agent-object
    "give", "find",
    # agent-agent
    "pull", "push", "take", "find",
    # agent-object
    "pull", "push", "find", "get", "have", "hit", "knock", "play",
    # object-object
    "push", "knock",
    # agent
    "go", "come", "shake",
    # object
    "fall", "go", "come",
    
    ## requires upright agent with legs
    # agent-object
    "sit",
    # agent
    "sit",
    
    ## requires agent with arms
    # agent-agent
    "hit",
    # agent-object
    "put", "set", "throw", "hold", "hit", "catch", "shake",
    # agent
    "knock", "shake",
    
    ## landscapes / obstructions
    "fit",
    
    ## affordances
    "open", "close",
    
    ## deformations
    "break", "tear",
    
    ## other
    "ride",
])

In [73]:
count_dfs = {verb: pd.DataFrame.from_dict(matches, orient="index", columns=[verb]).T
             for verb, matches in matches_grouped_json.items()}

In [74]:
count_dfs["see"]

Unnamed: 0,OBJ,SUBJ OBJ,Unnamed: 3,AUX SUBJ,OBJ JCT JCT,COM SUBJ AUX OBJ,COMP,AUX SUBJ OBJ OBJ JCT,LINK SUBJ OBJ,LINK AUX SUBJ,...,LINK SUBJ AUX SUBJ,LINK AUX SUBJ OBJ JCT,AUX SUBJ OBJ JCT JCT,AUX SUBJ COMP,SUBJ AUX OBJ JCT COM,SUBJ AUX OBJ OBJ,JCT SUBJ AUX OBJ,LINK LINK AUX SUBJ OBJ,JCT CJCT,SUBJ AUX OBJ JCT JCT
see,330,180,430,20,15,15,80,5,50,60,...,10,5,5,5,5,5,5,5,5,5


In [75]:
count_df = pd.concat(count_dfs.values(), sort=True).fillna(0.)

In [81]:
# Drop extremely rare frames.
rare_frames = count_df.columns[count_df.sum(axis=0) < 10]
count_df.drop(columns=rare_frames, inplace=True)

In [82]:
count_df.to_csv(f"verb_frames_{speaker}_matrix.csv")