In [1]:
import json
from random import sample

In [3]:
# Some utility funcs that will come in handy

def saveJson(dic, path):
    with open(path, 'w') as f:
        json.dump(dic, f)
    print(path + " has been saved!")
    

def readJson(path):
    return json.load(open(path))


def average_syn(dic):
    return sum([len(v) for v in dic.values()]) / len(dic)


def random_inspect(dic, out_num):
    for k, v in sample(list(dic.items()), out_num):
        print(k, v)

In [2]:
# Import two dics. revised_syn is a manually trimmed dic 
# from synonyms.json in the previous folder 
wn_syn = readJson('wordnetSyn.json')
rev_syn = readJson('revised_syn.json')
len(wn_syn), len(rev_syn)

NameError: name 'json' is not defined

In [4]:
# Co
syn = wn_syn.copy()

for k, v in rev_syn.items():
    if k in syn:
        syn[k] = list(set(v + syn[k]))
    else:
        syn[k] = v

len(syn)

46798

In [5]:
for k, v in sample(list(syn.items()), 10):
    print(k, v)

重压 ['不安', '压力', '紧张']
攫取 ['夺过', '抓住', '抓紧', '抱紧', '攫', '猛抓', '突然或有力地抓住', '紧抓', '紧握', '占用', '抢']
保持某种状态 ['保持', '继续', '维持']
强烈的感情 ['激情', '狂热']
火箭炮 ['喀秋莎', '火箭筒', '巴祖卡']
捉贼见赃 ['捉奸见双']
上工 ['上班', '出勤', '出工']
如梦初醒 ['茅塞顿开', '豁然开朗']
可笑 ['好笑', '喷饭', '捧腹', '洋相']
正十二面体 ['十二面体']


In [6]:
for k, v in syn.items():
    if not v:
        print(k, v)

## Make a narrowly expanded version
If x is the synonym for y, then y is the synonym for x too.

In [7]:
expand_narrow = syn.copy()

for k in syn.keys():
    values = syn[k]
    for v in values:
        if v in expand_narrow:
            if k not in expand_narrow[v]:
                expand_narrow[v] = [k] + expand_narrow[v]
        else:
            expand_narrow[v] = [k]
            
print(len(expand_narrow))

70599


## Trimming

In [8]:
def trim_dic(syn_dic):
    syn_dic = syn_dic.copy()
    # if words, four chars or longer, and have more than 5 syns,
    # delete syns that have only one char, but not all if only 5 syns left
    for k, v in list(syn_dic.items()):
        if len(k) >= 4:
            syn_num = len(v)
            if syn_num > 5:
                for w in v:
                    if len(w) == 1:
                        syn_dic[k].remove(w)
                        syn_num -= 1
                    if syn_num == 5:
                        break
    
    # if words, one char long, and have more than 5 syns, 
    # delete syns that have more than 4 chars, but not all if only 5 syns left
    for k, v in list(syn_dic.items()):
        if len(k) == 1:
            syn_num = len(v)
            if syn_num > 5:
                for w in v:
                    if len(w) >= 4:
                        syn_dic[k].remove(w)
                        syn_num -= 1
                    if syn_num == 5:
                        break
    
    # if words, two chars, has more than 5 syns, 
    # delete syns that have more than 4 chars, but not all if only 5 syns left
    for k, v in list(syn_dic.items()):
        if len(k) == 2:
            syn_num = len(v)
            if syn_num > 5:
                for w in v:
                    if len(w) >= 4:
                        syn_dic[k].remove(w)
                        syn_num -= 1
                    if syn_num == 5:
                        break
                        
    # if words, two chars long, has more than 5 syns,
    # delete syns that have only 1 char if that char not in the words, 
    # but not all if only 5 syns left
    for k, v in list(syn_dic.items()):
        if len(k) == 2:
            syn_num = len(v)
            if syn_num > 5:
                for w in v:
                    if len(w) == 1 and w not in k:
                        syn_dic[k].remove(w)
                        syn_num -= 1
                    if syn_num == 5:
                        break
    
    # if words, two chars long, have more than 5 syns, 
    # delete those who has no shared characters, but not all if only 5 syns left
    for k, v in list(syn_dic.items()):
        if len(k) == 2:
            syn_num = len(v)
            if len(v) > 5:
                for w in v:
                    if k[0] not in w and k[-1] not in w:
                        syn_dic[k].remove(w)
                        syn_num -= 1
                    if syn_num == 5:
                        break
                        
    # if words, one char long, have more than 5 syns, 
    # delete those who has no shared characters, but not all if only 5 syns left
    for k, v in list(syn_dic.items()):
        if len(k) == 1:
            syn_num = len(v)
            if syn_num > 5:
                for w in v:
                    if k[0] not in w and k[-1] not in w:
                        syn_dic[k].remove(w)
                        syn_num -= 1
                    if syn_num == 5:
                        break
    
    # finally if a word, has more than 5 syns, randomly sample 5 out of them;
    # if a word, has no syn at all following the trimming, pop that word
    for k, v in list(syn_dic.items()):
        if len(v) > 5:
            syn_dic[k] = sample(v, 5)
        elif len(v) <= 1:
            syn_dic.pop(k)
    
    return syn_dic

In [9]:
expand_narrow_trim = trim_dic(expand_narrow)
len(expand_narrow), len(expand_narrow_trim)

(70599, 34408)

In [10]:
syn_trim = trim_dic(syn)
len(syn), len(syn_trim)

(46798, 29163)

In [11]:
average_syn(expand_narrow), average_syn(expand_narrow_trim)

(2.4101757815266507, 3.332916763543362)

In [12]:
average_syn(syn), average_syn(syn_trim)

(2.7679815376725503, 3.2817268456605975)

## Debugging

In [13]:
def debug(dic):
    # No words should have over 5 syn
    for k, v in dic.items():
        if len(v) > 5:
            print(k, 'has more than 5 syns!')
            
    # No words should have no syns or one syn
    for k, v in dic.items():
        if len(v) <= 1:
            print(k, 'has only one syn or no syn:', v)
                    
    # if words, one char long, have 5 syns, they should not have syns 
    # that have no shared char with them
    for k, v in list(dic.items()):
        if len(k) == 1:
            for w in k:
                if k not in w:
                    print(k, 'has 5 syns but, ', w, 'has no shared char')
                    
    # if words, two chars long, have 5 syns, they should not have syns 
    # that have no shared char with them
    for k, v in list(dic.items()):
        if len(k) == 2:
            for w in k:
                if k[0] not in w and k[-1] not in w:
                    print(k, 'has 5 syns but, ', w, 'has no shared char')
    

In [14]:
debug(expand_narrow_trim)

In [15]:
debug(syn_trim)

In [16]:
for k, v in sample(list(expand_narrow_trim.items()), 20):
    print(k, v)

燃料量计 ['燃料指示器', '燃料液位指示计', '燃油表', '燃油量表']
过游民生活 ['游荡', '闲逛']
磨损率 ['损耗率', '磨耗率']
透孔织品 ['网眼织物', '蕾丝']
使阴沉 ['使变成阴天', '使布满云层', '使阴暗', '变阴暗']
钟 ['时钟', '钟表', '钟琴']
一念之差 ['一眨眼', '一差二错', '阴差阳错', '差之毫厘']
受害 ['被害', '遇害', '蒙难', '遇难', '落难']
停工 ['停止做', '停止执行', '歇工', '罢工', '罢工']
独身 ['孤单', '单身的', '未婚的', '没有配偶的']
三人成虎 ['以讹传讹', '讹以滋讹', '众口铄金', '道听途说']
专员 ['专人', '参赞', '大使随员']
书局 ['书店', '出版社']
木桶架 ['构台', '跨轨信号杆']
变紧 ['绷紧', '变得更紧', '拉紧']
无实用价值的鱼 ['没有经济价值的鱼', '非食用鱼']
盘旋 ['卷旋', '徘徊', '使升空', '高飞', '旋转']
豆蓉 ['木豆', '柳豆', '树豆', '树黄豆', '鸽豆']
熏肉 ['咸肉', '培根', '熏咸肉']
吉利 ['吉庆', '吉祥']


## Save whichever trimmed syn you want to save

In [17]:
saveJson(expand_narrow_trim, 'synonyms.json')

../data/synonyms.json has been saved!
