In [1]:
import pickle
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
nltk.data.path.append('../data')  # noqa
from tqdm.notebook import tqdm


# Load the precomputed similarity data: the pickle holds a 3-tuple that is
# unpacked into `nominos`, `synsets` and `similarities`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only run this on a file you produced yourself or otherwise trust.
with open('../data/sim_matrix.p', 'rb') as f:
    nominos, synsets, similarities = pickle.load(f)


# Sort every row (dim=1) in descending order.
# NOTE(review): assumes `similarities` is a torch tensor, in which case
# `values` holds the sorted scores and `indices` the original column of each
# score; presumably rows are nominos and columns are synsets — confirm.
values, indices = similarities.sort(dim=1, descending=True)

In [2]:
# Pair every nomino with its best-matching WordNet synsets.
#
# `values` / `indices` are the row-wise descending sort of the similarity
# matrix, so for row `n_idx` column `k` is the k-th best candidate. A nomino
# keeps at most MAX_MATCHES candidates and only those scoring at least
# MIN_SIMILARITY; nominos without any such candidate are left out of `groups`.
MIN_SIMILARITY = 0.5
MAX_MATCHES = 11  # the original loop accepted k = 0..10, i.e. eleven candidates

groups = []
for n_idx, nomino in enumerate(tqdm(nominos)):
    matches = []
    # Bound the scan by the number of candidate columns (not the number of
    # rows, which is what len(indices) would give).
    for k in range(min(indices.shape[1], MAX_MATCHES)):
        score = values[n_idx][k].item()
        if score < MIN_SIMILARITY:
            # Scores are sorted in descending order: nothing better follows.
            break
        matches.append((score, wn.synset(synsets[indices[n_idx, k]])))
    if matches:
        groups.append((nomino, matches))

  0%|          | 0/6192 [00:00<?, ?it/s]

In [3]:
# Show the first group whose nomino entry is 'yahe'
# (raises StopIteration when no such group exists).
next((nomino, matches) for nomino, matches in groups if nomino.entry == 'yahe')

(Nomino(entry='yahe', definition='friend, comrade', subject_concord='a-/wa-'),
 [(0.7157182097434998, Synset('comrade.n.02')),
  (0.6393529176712036, Synset('amigo.n.01')),
  (0.590857744216919, Synset('tovarich.n.01')),
  (0.5628433227539062, Synset('fellow_traveler.n.01')),
  (0.5444111824035645, Synset('communist.n.01')),
  (0.541063129901886, Synset('communist.n.02')),
  (0.5316822528839111, Synset('soviets.n.01')),
  (0.5242171883583069, Synset('bolshevism.n.01')),
  (0.5164322853088379, Synset('stalinist.n.01'))])

In [3]:
from collections import Counter, defaultdict
from pprint import pprint

def get_hypers(xs: list[Synset], min_depth: int = 0, max_depth: int = -1) -> dict[Synset, float]:
    """Return the relative frequency of every synset on the hypernym paths of ``xs``.

    Every hypernym path of every synset in ``xs`` is walked and each synset
    met along the way is tallied once per occurrence (a synset shared by
    several paths is therefore counted several times). The tallies are then
    divided by the total number of occurrences, so the returned values sum
    to 1 whenever the result is non-empty.

    Args:
        xs: Synsets whose hypernym paths are aggregated.
        min_depth: Accepted for interface compatibility; currently unused.
        max_depth: Accepted for interface compatibility; currently unused.

    Returns:
        Mapping from synset to its share of all path occurrences, in order
        of first appearance. Empty when ``xs`` yields no paths.
    """
    tally = Counter()
    for synset in xs:
        for path in synset.hypernym_paths():
            tally.update(path)
    total = sum(tally.values())
    return {hyper: occurrences / total for hyper, occurrences in tally.items()}


# Two-level tallies (missing keys default to 0):
#   concord_to_synset_counts[concord][synset] -> accumulated hypernym weight
#   synset_to_concord_counts[synset][concord] -> the same weight, transposed
#   synset_global_counts[synset]              -> number of nominos contributing to the synset
concord_to_synset_counts = defaultdict(lambda: defaultdict(lambda: 0))
synset_to_concord_counts = defaultdict(lambda: defaultdict(lambda: 0))
synset_global_counts = defaultdict(lambda: 0)

for nomino, matches in tqdm(groups):
    concord = nomino.subject_concord
    matched_synsets = [synset for _, synset in matches]
    # Each nomino spreads a total weight of 1 over the hypernyms of its matches.
    for hyper, share in get_hypers(matched_synsets).items():
        concord_to_synset_counts[concord][hyper] += share
        synset_to_concord_counts[hyper][concord] += share
        synset_global_counts[hyper] += 1
   

  0%|          | 0/6032 [00:00<?, ?it/s]

In [5]:
# Display the accumulated hypernym weights for the 'u-/i-' subject concord.
# NOTE(review): this is a defaultdict, so indexing a concord that was never
# seen silently inserts an empty entry instead of raising KeyError.
concord_to_synset_counts['u-/i-']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {Synset('entity.n.01'): 79.63083692958558,
             Synset('physical_entity.n.01'): 41.67008750698048,
             Synset('causal_agent.n.01'): 2.939662823669523,
             Synset('person.n.01'): 5.0030593034491195,
             Synset('inhabitant.n.01'): 0.19750868751670675,
             Synset('resident.n.01'): 0.04663382594417077,
             Synset('townsman.n.02'): 0.027586206896551724,
             Synset('object.n.01'): 29.599225171677723,
             Synset('whole.n.02'): 27.041868011244734,
             Synset('living_thing.n.01'): 13.100676253165277,
             Synset('organism.n.01'): 13.06241190787914,
             Synset('abstraction.n.06'): 37.93835021902128,
             Synset('group.n.01'): 5.037000001778659,
             Synset('social_group.n.01'): 0.9722348060333994,
             Synset('gathering.n.01'): 0.6408250657020715,
             Synset('meeting.n.01'): 0.10512307545524882,

In [4]:
from math import log2 as log

# A synset is scored only if more than `threshold` nominos contributed to it.
threshold = 10

# Marginal masses per concord and per synset, and the grand total.
concord_norms = {
    concord: sum(counts.values())
    for concord, counts in concord_to_synset_counts.items()
}
synset_norms = {
    synset: sum(counts.values())
    for synset, counts in synset_to_concord_counts.items()
}
norm = sum(concord_norms.values())
log_norm = log(norm)

# joint[concord][synset] = log2 p(concord, synset), restricted to frequent synsets.
joint = {}
for concord, synset_counts in concord_to_synset_counts.items():
    joint[concord] = {
        synset: (log(mass) - log_norm)
        for synset, mass in synset_counts.items()
        if synset_global_counts[synset] > threshold
    }

# pmi[concord] = [(synset, score), ...] sorted by descending score, where
#   score = p(c, s) * (log2 p(c, s) - (log2 p(c) + log2 p(s)))
# i.e. the pointwise mutual information weighted by the joint probability.
# Concords without any frequent synset get no entry.
pmi = {}
for concord, log_joint in joint.items():
    scored = []
    for synset, value in log_joint.items():
        scored.append((
            synset,
            (concord_to_synset_counts[concord][synset]/norm) * (value - (log(concord_norms[concord]) + log(synset_norms[synset]) - 2 * log_norm)),
        ))
    if scored:
        pmi[concord] = sorted(scored, key=lambda pair: pair[1], reverse=True)

In [30]:
# Top-20 synsets (by score) of five concords, as sets, in this order:
# 'i-/zi-', 'ki-/vi-', 'li-/ya-', 'i-', 'ya-'.
ks1, ks2, ks3, ks4, ks5 = (
    {synset for synset, _ in pmi[name][:20]}
    for name in ('i-/zi-', 'ki-/vi-', 'li-/ya-', 'i-', 'ya-')
)

# Synsets shared by the top-20 of the first three concords.
print(ks1&ks2&ks3)

{Synset('artifact.n.01'), Synset('instrumentality.n.03')}


In [9]:
def bold(x: float) -> str:
    """Render ``x`` with one decimal place, in LaTeX bold unless ``x < 1``.

    Args:
        x: The number to format.

    Returns:
        ``x`` formatted as ``.1f``; wrapped in ``\\textbf{...}`` whenever the
        comparison ``x < 1`` is false.
    """
    text = f'{x:.1f}'
    return text if x < 1. else '\\textbf{' + text + '}'


def pp(k: Synset, ks: list[Synset], v: float) -> str:
    """Format one synset and its score as a LaTeX fragment.

    The fragment is ``<synset name>~(<score>)``, where the score is
    ``v * 100`` rendered by ``bold``. When any synset in ``ks`` is a direct
    hypernym of ``k``, the fragment is wrapped in a gray ``textcolor`` so
    that entries subsumed by an already-listed parent are visually muted.

    Args:
        k: The synset to print.
        ks: Synsets already listed before ``k``.
        v: Score of ``k``; multiplied by 100 for display.

    Returns:
        The LaTeX fragment (underscores are not escaped here).
    """
    label = k.name() + f'~({bold(v * 100)})'
    # Look the direct hypernyms up once instead of once per element of `ks`.
    parents = k.hypernyms()
    if any(k0 in parents for k0 in ks):
        return '\\textcolor{gray!80}{' + label + '}'
    return label
  

# print(f'{sum((v for concord in pmi.keys() for _, v in pmi[concord]))   :.3f}')

# For each subject concord print: its name, the sum of the scores of all its
# synsets, and a LaTeX-ready, comma-separated list of its 20 best synsets.
for concord in ['a-/wa-', 'i-/zi-', 'u-', 'ki-/vi-', 'u-/i-', 'li-/ya-', 'ya-', 'u-/zi-', 'i-']:
    ranked = pmi[concord]
    print(concord)
    # The total runs over every scored synset, not only the 20 listed below.
    print(f'{sum(score for _, score in ranked):.3f}')
    cells = [
        # Each entry is compared against the entries ranked above it, so pp
        # can gray out synsets whose direct hypernym was already listed.
        pp(synset, [earlier for earlier, _ in ranked[:idx]], score)
        for idx, (synset, score) in enumerate(ranked[:20])
    ]
    # Escape underscores for LaTeX ('\\_' is a backslash followed by '_').
    print(', '.join(cells).replace('_', '\\_'))

a-/wa-
0.102
person.n.01~(\textbf{1.6}), organism.n.01~(\textbf{1.1}), living\_thing.n.01~(\textbf{1.1}), causal\_agent.n.01~(0.8), physical\_entity.n.01~(0.6), \textcolor{gray!80}{animal.n.01~(0.6)}, \textcolor{gray!80}{chordate.n.01~(0.4)}, \textcolor{gray!80}{vertebrate.n.01~(0.4)}, whole.n.02~(0.4), \textcolor{gray!80}{object.n.01~(0.3)}, \textcolor{gray!80}{bird.n.01~(0.2)}, \textcolor{gray!80}{aquatic\_vertebrate.n.01~(0.1)}, \textcolor{gray!80}{fish.n.01~(0.1)}, taxonomic\_group.n.01~(0.1), biological\_group.n.01~(0.1), \textcolor{gray!80}{adult.n.01~(0.1)}, \textcolor{gray!80}{bad\_person.n.01~(0.1)}, \textcolor{gray!80}{mammal.n.01~(0.1)}, \textcolor{gray!80}{unwelcome\_person.n.01~(0.1)}, \textcolor{gray!80}{relative.n.01~(0.1)}
i-/zi-
0.018
artifact.n.01~(0.2), abstraction.n.06~(0.1), \textcolor{gray!80}{instrumentality.n.03~(0.1)}, matter.n.03~(0.1), \textcolor{gray!80}{device.n.01~(0.1)}, \textcolor{gray!80}{measure.n.02~(0.1)}, \textcolor{gray!80}{communication.n.02~(0.0)

In [11]:
# NOTE(review): leftover exploratory expression (looks like an unfinished
# attribute lookup on Synset); nothing in the analysis uses it, and the
# recorded output of this cell is an error. Consider deleting the cell.
Synset.h

NameError: name 'k0' is not defined