## Finding Synonyms and Antonyms of Words

This notebook reads in a csv of proposed words for an experiment that will require triplets of words in which two of the words in the triplet are synonyms and the third is an antonym of the other two.

The rough goal is to ensure that each word is correctly related to the other words within its triplet but (roughly) unrelated to words outside of its triplet.

The exact goal is to find four sets of ten words such that  ...

In [1]:
import wordfreq

import numpy as np
import pandas as pd

from nltk.corpus import wordnet as wn

### Import the words

Each row holds the candidate words for one potential triplet. The b columns are from Masson & MacLeod (19xx) and the a columns are synonyms for the words in the b columns (chosen by Melissa and me). Some cells contain NaN because there wasn't a synonym we liked.

In [2]:
## the proposed-words file is tab-separated, one candidate triplet per row
url = 'https://raw.githubusercontent.com/kleinmichaeldavid/phd/master/memory_for_synonyms/proposed_words'

words = pd.read_csv(url,sep='\t')

In [3]:
## size of the table, then its first ten rows
print(words.shape)
print(words.head(10))

(57, 4)
           a1          a2       b1      b2
0     spoiled       crisp    stale   fresh
1         NaN    profound  shallow    deep
2        roof      ground  ceiling   floor
3      fluffy       solid     soft    hard
4       leave      appear   depart  arrive
5  abstaining  inebriated    sober   drunk
6        dull      pointy    blunt   sharp
7       awful    positive      bad    good
8      ascent        drop     rise    fall
9     revenue      defeat   profit    loss


### Overlap in synonym/antonym lists method

In [4]:
def get_relateds(word):
    """Collect the WordNet antonyms and synonyms of a single word.

    Every lemma of every synset returned by ``wn.synsets(word)`` counts as a
    synonym, and every antonym attached to one of those lemmas counts as an
    antonym. The query word itself is removed from both results.

    Parameters
    ----------
    word : str
        The word to look up.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(antonyms, synonyms)`` -- note that antonyms come first. Each array
        holds sorted, de-duplicated lemma names and is empty when nothing
        was found.
    """
    synonyms = set()
    antonyms = set()

    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
            antonyms.update(ant.name() for ant in lemma.antonyms())

    # Drop the query word with plain set operations. Filtering through
    # `np.array(names) != word` misbehaves when `names` is empty, because an
    # empty array has a float dtype and cannot be compared element-wise with
    # a string.
    synonyms.discard(word)
    antonyms.discard(word)

    return(np.array(sorted(antonyms)), np.array(sorted(synonyms)))

In [5]:
def intersect(a, b):
    """Return the elements common to both iterables as a list (order not guaranteed)."""
    left, right = set(a), set(b)
    return list(left & right)

In [6]:
def count_oops(df):
    """Count, for every word in ``df``, how many words of ``df`` are related to it.

    Each non-null cell is looked up with ``get_relateds`` and its antonyms and
    synonyms are intersected with the full set of words in the table (the
    word's own row included).

    Parameters
    ----------
    df : pandas.DataFrame
        Table of words; null cells are allowed and stay NaN in the result.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(a, s)``: ``a`` holds the antonym-hit counts and ``s`` the
        synonym-hit counts. Both have the columns of ``df`` plus a ``'sum'``
        column with the row total (NaN cells are skipped by the sum).
    """
    flat = df.values.flatten()
    ants = []
    syns = []

    for word in flat:
        if pd.isnull(word):
            ants.append(np.nan)
            syns.append(np.nan)
        else:
            word_ants, word_syns = get_relateds(word)
            # Antonym hits go to the antonym list and synonym hits to the
            # synonym list (these were previously crossed over).
            ants.append(len(intersect(word_ants, flat)))
            syns.append(len(intersect(word_syns, flat)))

    def as_frame(counts):
        # Lay the flat list of counts back out in the shape of `df`.
        frame = pd.DataFrame(np.array(counts).reshape(df.shape[0],df.shape[1]),columns=df.columns.values)
        frame['sum'] = frame.sum(1)
        return frame

    return(as_frame(ants),as_frame(syns))

## tally, for every proposed word, how many words in the table WordNet relates to it
a,s = count_oops(words)

  from ipykernel import kernelapp as app
  


In [7]:
def list_oopses(df):
    # returns 2 dataframes containing tuples of original words and
    # either their synonyms or their antonyms
    
    oops_ants = []
    oops_syns = []
    for row in range(df.shape[0]):
        for col in range(df.shape[1]):
            w = df.iloc[row,col]
            if pd.isnull(w):
                oops_ants.append(np.nan)
                oops_syns.append(np.nan)
            else:
                a,s = get_relateds(w)
                flat = df.drop(row).values.flatten()
                oops_a = intersect(a, flat)
                oops_s = intersect(s, flat)
                oops_ants.append((w,oops_a))
                oops_syns.append((w,oops_s))
            
    a = pd.DataFrame(np.array(oops_ants,dtype=tuple).reshape(df.shape[0],df.shape[1]),columns=df.columns.values)
    s = pd.DataFrame(np.array(oops_syns,dtype=tuple).reshape(df.shape[0],df.shape[1]),columns=df.columns.values)
    
    a['sum'] = [np.sum([0 if pd.isnull(x) else len(x[1]) for x in a.iloc[i]]) for i in range(a.shape[0])]
    s['sum'] = [np.sum([0 if pd.isnull(x) else len(x[1]) for x in s.iloc[i]]) for i in range(s.shape[0])]
    
    return(a,s)

In [8]:
## a = antonym matches, s = synonym matches, each against the words of the other rows
a,s = list_oopses(words)

  from ipykernel import kernelapp as app
  


In [9]:
## the ten rows with the fewest synonym matches in other rows
s.sort_values(by='sum').head(10)

Unnamed: 0,a1,a2,b1,b2,sum
28,"(solitary, [])","(multiple, [])","(singular, [])","(plural, [])",0
24,"(ask, [])","(response, [])","(question, [])","(answer, [])",0
23,"(sob, [])","(giggle, [])","(cry, [])","(laugh, [])",0
21,"(morning, [])","(evening, [])","(day, [])","(night, [])",0
20,"(naïve, [])","(convicted, [])","(innocent, [])","(guilty, [])",0
19,,"(prior, [])","(future, [])","(past, [])",0
18,"(joy, [])","(discomfort, [])","(pleasure, [])","(pain, [])",0
37,"(purchase, [])",,"(buy, [])","(sell, [])",0
39,"(battle, [])","(truce, [])","(war, [])","(peace, [])",0
43,"(wife, [])","(husband, [])","(bride, [])","(groom, [])",0


In [10]:
## Rows with 0 matches are safe to use as they are.
## Rows with 1 match can be used once the word carrying the match is removed.
## A word with a match can still be used at encoding (but not at retrieval,
## unless its match is in the same category).

good_sets = s.loc[s['sum'] < 2]
good_sets.shape

(32, 5)

In [11]:
## inspect the rows with at most one synonym match
good_sets

Unnamed: 0,a1,a2,b1,b2,sum
4,"(leave, [forget])","(appear, [])","(depart, [])","(arrive, [])",1
5,"(abstaining, [])","(inebriated, [])","(sober, [])","(drunk, [])",0
9,"(revenue, [])","(defeat, [])","(profit, [])","(loss, [])",0
10,"(drift, [])","(capsize, [])","(float, [])","(sink, [drop])",1
11,"(forever, [])",,"(always, [])","(never, [])",0
12,"(shove, [])","(tow, [])","(push, [])","(pull, [])",0
13,"(sleek, [])","(coarse, [])","(smooth, [])","(rough, [])",0
14,"(coach, [])","(understand, [])","(teach, [])","(learn, [])",0
18,"(joy, [])","(discomfort, [])","(pleasure, [])","(pain, [])",0
19,,"(prior, [])","(future, [])","(past, [])",0


In [12]:
## rows with exactly one synonym match in another row
s[s['sum'] == 1]

Unnamed: 0,a1,a2,b1,b2,sum
4,"(leave, [forget])","(appear, [])","(depart, [])","(arrive, [])",1
10,"(drift, [])","(capsize, [])","(float, [])","(sink, [drop])",1
25,"(champion, [friend])","(deadbeat, [])","(winner, [])","(loser, [])",1
32,"(ajar, [])","(fastened, [])","(open, [loose])","(closed, [])",1
34,"(tidy, [])","(messy, [])","(clean, [fresh])","(dirty, [])",1
36,"(recall, [])",,"(remember, [])","(forget, [leave])",1
44,"(packed, [])","(vacant, [])","(full, [good])","(empty, [])",1
50,"(pal, [])","(rival, [])","(friend, [champion])","(enemy, [])",1
54,"(correct, [])","(wrong, [])","(true, [])","(false, [sour])",1
56,"(grin, [])","(pout, [])","(smile, [])","(frown, [lower])",1


In [13]:
## rows with exactly two synonym matches in other rows
s[s['sum'] == 2]

Unnamed: 0,a1,a2,b1,b2,sum
1,,"(profound, [])","(shallow, [])","(deep, [rich, late])",2
3,"(fluffy, [])","(solid, [strong])","(soft, [])","(hard, [strong])",2
6,"(dull, [slow])","(pointy, [])","(blunt, [])","(sharp, [crisp])",2
7,"(awful, [])","(positive, [])","(bad, [spoiled])","(good, [full])",2
8,"(ascent, [])","(drop, [sink])","(rise, [rear])","(fall, [])",2
15,"(wealthy, [])","(bankrupt, [])","(rich, [deep])","(poor, [short])",2
26,"(foremost, [])","(end, [death])","(first, [])","(last, [death])",2
38,"(belated, [])","(premature, [])","(late, [deep, later])","(early, [])",2
48,"(lengthy, [])",,"(long, [])","(short, [dead, poor])",2
55,"(sugary, [])","(bitter, [])","(sweet, [fresh])","(sour, [false])",2


In [14]:
## cleaning up the words a little:
## first blank out individual words, then drop whole rows and renumber.
## NOTE: run this cell once per load of `words` -- the row labels below refer
## to the freshly loaded table, so re-running it on the trimmed table either
## fails or removes different rows.

words = words.mask(words.isin(['ground','frozen','lethargic','strong']))
remove_rows = [0,17,22,27,35,40,42,45,52]
words = words.drop(index=remove_rows).reset_index(drop=True)

In [15]:
## repeat this with the smaller word set
## (a and s from the earlier run are overwritten)

a,s = list_oopses(words)

  from ipykernel import kernelapp as app
  


In [16]:
## number of rows left after the clean-up
s.shape[0]

48

In [18]:
## second, tab-separated list of proposed words (note: `url` is reused here)
url = 'https://raw.githubusercontent.com/kleinmichaeldavid/phd/master/memory_for_synonyms/more_proposed_words'

words2 = pd.read_csv(url,sep='\t')

In [19]:
## stack the cleaned first list on top of the second and renumber rows from 0
all_words = pd.concat([words, words2], ignore_index=True)

In [21]:
## check new words as well
## (a2 = antonym matches, s2 = synonym matches over the combined table)
a2,s2 = list_oopses(all_words)

  from ipykernel import kernelapp as app
  


In [22]:
## "great" rows have fewer than two synonym matches; "good" rows have exactly two
greats_idx = s2.index[s2['sum'] < 2]
goods_idx = s2.index[s2['sum'] == 2]

In [23]:
## s2's index is 0..n-1 row positions and all_words was renumbered from 0,
## so the positional lookup picks the matching rows
great_words = all_words.iloc[greats_idx]
good_words = all_words.iloc[goods_idx]

In [24]:
## the "good" rows: exactly two synonym matches in other rows
s2[s2['sum'] == 2]

Unnamed: 0,a1,a2,b1,b2,sum
0,,"(profound, [])","(shallow, [])","(deep, [rich, late])",2
3,"(leave, [give, forget])","(appear, [])","(depart, [])","(arrive, [])",2
12,"(sleek, [])","(coarse, [common])","(smooth, [quiet])","(rough, [])",2
14,"(wealthy, [])","(bankrupt, [])","(rich, [deep])","(poor, [short])",2
15,"(missing, [drop])",,"(lost, [])","(found, [recover])",2
37,"(packed, [take])","(vacant, [])","(full, [good])","(empty, [])",2
51,"(admit, [take])","(deny, [])","(accept, [take])","(reject, [])",2
54,"(unique, [singular])","(prevalent, [])","(rare, [])","(common, [coarse])",2
59,"(silent, [dumb])","(noisy, [])","(quiet, [smooth])","(loud, [])",2
68,"(damage, [wrong])","(fix, [secure])","(destroy, [])","(repair, [])",2


### Frequencies

In [25]:
## spot-check the frequency lookup on a single word (English, 'large' wordlist)
wordfreq.zipf_frequency('stale','en','large')

3.42

In [26]:
## Zipf frequency (English, 'large' wordlist) for every word in the "great" rows.
## Missing words stay NaN. freq_df gets a fresh 0..n-1 index, so its rows are
## positions within great_words, not the row labels of all_words.
df = great_words
flat = df.values.flatten()
freqs = [
    wordfreq.zipf_frequency(x, 'en', 'large') if not pd.isnull(x) else np.nan
    for x in flat
]
freq_df = pd.DataFrame(np.array(freqs).reshape(df.shape), columns=df.columns.values)

In [27]:
## frequency table for the "great" rows (NaN where no word was proposed)
freq_df

Unnamed: 0,a1,a2,b1,b2
0,4.5,,4.17,4.93
1,3.47,4.68,4.62,5.52
2,2.63,2.62,3.97,4.67
3,3.45,4.97,4.84,5.16
4,4.55,4.51,4.72,5.06
5,3.83,2.37,3.96,4.16
6,4.75,,5.77,5.91
7,3.64,3.64,4.78,4.8
8,4.86,5.38,4.72,5.17
9,4.53,3.7,4.58,5.05
