# Recursive Ekphrasis Gym

## Processing `(noun,verb)` pairs

In [None]:
! wget https://github.com/kbooten/ekphrasisgym/raw/main/noun2verb_tuples.json

--2022-07-12 23:03:00--  https://github.com/kbooten/ekphrasisgym/raw/main/noun2verb_tuples.json
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/kbooten/ekphrasisgym/main/noun2verb_tuples.json [following]
--2022-07-12 23:03:00--  https://media.githubusercontent.com/media/kbooten/ekphrasisgym/main/noun2verb_tuples.json
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51250034 (49M) [application/octet-stream]
Saving to: ‘noun2verb_tuples.json’


2022-07-12 23:03:04 (373 MB/s) - ‘noun2verb_tuples.json’ saved [51250034/51250034]



In [None]:
import json

# Load the downloaded [noun, verb, number] records.  The encoding is passed
# explicitly because JSON text is UTF-8, whereas open() would otherwise fall
# back to the platform's locale encoding.
with open('noun2verb_tuples.json', 'r', encoding='utf-8') as f:
  noun2verb_tuples = json.load(f)

In [None]:
# Peek at the first four [noun, verb, number] records.
noun2verb_tuples[:4]

[['education', 'demand', 1],
 ['diagram', 'show', 1],
 ['story', 'stir', 1],
 ['lamb', 'follow', 1]]

In [None]:
from collections import defaultdict

# Maps each noun to a list of (verb, number) tuples; a noun seen for the first
# time starts with an empty list.
noun2verb = defaultdict(list)

In [None]:
# Group the [noun, verb, number] records by noun.  The mapping is rebuilt from
# scratch in this cell so that re-running it does not append every pair a
# second time.
noun2verb = defaultdict(list)
for record in noun2verb_tuples:
  noun, verb, number = record[0], record[1], record[2]
  noun2verb[noun].append((verb, number))

In [None]:
# Spot-check: the first ten (verb, number) pairs stored for the noun "curate".
noun2verb["curate"][:10]

[('regard', 1),
 ('feel', 1),
 ('follow', 1),
 ('lead', 1),
 ('take', 1),
 ('nod', 1),
 ('regain', 1),
 ('run', 1),
 ('believe', 1),
 ('take', 1)]

Use WordNet to check that each token is a real word: a verb is kept only if WordNet lists at least one verb sense for it.

In [None]:
import nltk
# Fetch the two NLTK data packages used for the WordNet lookups below.
for corpus_name in ('wordnet', 'omw-1.4'):
  nltk.download(corpus_name)
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Example lookup: the noun synsets for "hat".  The spellcheck filter in this
# notebook treats an empty result from wn.synsets as "not a word".
wn.synsets("hat",pos=wn.NOUN)

[Synset('hat.n.01'), Synset('hat.n.02')]

In [None]:
# Spellcheck via WordNet: keep a (verb, number) pair only if WordNet lists at
# least one verb synset for the verb.  Verbs repeat within and across nouns,
# so the answer for each verb is remembered instead of querying WordNet again
# for every occurrence.
is_known_verb = {}
for key, values in noun2verb.items():
  kept = []
  for verb, number in values:
    if verb not in is_known_verb:
      is_known_verb[verb] = wn.synsets(verb, pos=wn.VERB) != []
    if is_known_verb[verb]:
      kept.append((verb, number))
  noun2verb[key] = kept  # replacing the value of an existing key is safe while iterating

Remove any verb that is identical to its noun key.

In [None]:
# Drop any verb spelled exactly like its noun key.  Each value is a
# (verb, number) tuple, so the verb itself has to be compared: a whole tuple
# never equals the string key, so comparing those would filter nothing.
for key, values in noun2verb.items():
  noun2verb[key] = [(verb, number) for (verb, number) in values if verb != key]

In [None]:
# Spot-check: the first three (verb, number) pairs currently stored for "pig".
noun2verb["pig"][:3]

[('present', 1), ('change', 1), ('get', 1)]

Sometimes a value is just a single letter. Filter out very short words (anything shorter than three characters).

In [None]:
# Keep only verbs that are at least three characters long.
for noun_key in noun2verb:
  long_enough = []
  for verb, number in noun2verb[noun_key]:
    if len(verb) >= 3:
      long_enough.append((verb, number))
  noun2verb[noun_key] = long_enough

### Rank by TF-IDF

In [None]:
# One list of (verb, number) tuples per noun; these are the "documents" for
# TF-IDF.  Despite the name they are lists (duplicates kept), not sets.
sets_of_words = list(noun2verb.values())

In [None]:
# Number of per-noun lists (documents); used as the numerator of idf when the
# weights are computed.
total_number_of_sets = len(sets_of_words)
total_number_of_sets

58514

In [None]:
from collections import defaultdict

# Document-frequency counter: a key that has not been counted yet reads as 0.
word2doc_count = defaultdict(int)

In [None]:
# Document frequency: for every distinct (verb, number) entry, count how many
# per-noun lists contain it.  The counter is reset in this cell so that
# re-running it does not double every count (which would halve idf later).
word2doc_count = defaultdict(int)
for verb_list in sets_of_words:
  for term in set(verb_list):  # each distinct entry counts once per list
    word2doc_count[term] += 1

In [None]:
# Total document count over every entry whose verb is "friend".
# word2doc_count is keyed by (verb, number) tuples, so indexing it with a bare
# string always returns the default 0 and, on a defaultdict, also inserts a
# stray 'friend' key; matching on the verb part avoids both.
sum(count for term, count in word2doc_count.items() if term[0] == 'friend')

0

In [None]:
# noun -> up to 20 ((verb, number), tfidf) pairs, highest score first.
noun2verbs_and_weights = {}

In [None]:
# Score each noun's entries by tf * idf and keep the 20 highest-scoring ones.
#   tf  = share of this noun's entries that are this (verb, number) pair
#   idf = number of per-noun lists / number of lists containing the pair
#         (a plain ratio; no logarithm is applied)
# Results have the shape ((verb, number), score).
for key, words in noun2verb.items():
  if len(words) != 0: ## no empty sets
    # Count occurrences in a single pass instead of calling words.count() for
    # every distinct entry, which rescans the whole list each time.  A dict
    # also keeps first-occurrence order, so entries with equal scores are
    # ordered reproducibly (iteration order of a set of strings is not).
    occurrences = {}
    for w in words:
      occurrences[w] = occurrences.get(w, 0) + 1
    verbs_and_weights = []
    for w, n_occurrences in occurrences.items():
      tf = n_occurrences / len(words)
      idf = total_number_of_sets / word2doc_count[w]
      verbs_and_weights.append((w, tf * idf))
    # list.sort is stable, so ties keep the first-occurrence order from above.
    verbs_and_weights.sort(key=lambda x: x[1], reverse=True)
    noun2verbs_and_weights[key] = verbs_and_weights[:20]

In [None]:
# Inspect the ranked result for "friend": ((verb, number), tfidf) pairs, highest
# score first.
noun2verbs_and_weights['friend']

[(('referee', 1), 9.16716277612408),
 (('expatiate', 1), 9.16716277612408),
 (('cuckold', 1), 9.16716277612408),
 (('prevaricate', 1), 4.58358138806204),
 (('orate', 1), 4.58358138806204),
 (('fawn', 1), 4.58358138806204),
 (('misappropriate', 1), 3.0557209253746933),
 (('needle', 1), 3.0557209253746933),
 (('shin', 1), 2.29179069403102),
 (('smart', 1), 2.29179069403102),
 (('clump', 1), 2.29179069403102),
 (('capitalize', 1), 2.037147283583129),
 (('patent', 1), 1.8334325552248159),
 (('palpitate', 1), 1.8334325552248159),
 (('skid', 1), 1.8334325552248159),
 (('twit', 1), 1.6667568683861962),
 (('joke', 1), 1.6667568683861962),
 (('converse', 1), 1.6667568683861962),
 (('book', 1), 1.6667568683861962),
 (('overdraw', 1), 1.5278604626873467)]

In [None]:
# Persist the ranked verbs as JSON (the tuples are written out as JSON arrays).
with open('noun2verbs_and_weights.json', mode='w') as out_file:
  json.dump(noun2verbs_and_weights, out_file)

In [None]:
from google.colab import files
# Offer the file written by the previous cell as a browser download.  The same
# relative path it was saved under is used here, rather than assuming the
# notebook's working directory is /content.
files.download('noun2verbs_and_weights.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

***