In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so local changes to imported packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from CwnGraph import CwnBase, CwnAnnotator
from CwnGraph import CwnSense, CwnFacet, CwnSynset
# Shared CwnBase handle; later cells query it through cwn.find_lemma().
cwn = CwnBase()

In [3]:
# Look up lemmas with the pattern "^..$" (presumably a regex matching exactly
# two characters -- confirm against CwnBase.find_lemma), then keep only the
# entries whose `senses` attribute is truthy.
lemmas = cwn.find_lemma("^..$")
lemmas_with_senses = [entry for entry in lemmas if entry.senses]

In [4]:
##### lemmas_with_senses[0].senses

def all_senses_has_example_sentences(senses, at_least=2):
    """Tell whether every sense carries enough example sentences.

    senses: iterable of objects exposing an ``all_examples()`` method
    at_least: minimum number of examples each sense must have (default 2)
    output: boolean. True when no sense has fewer than ``at_least`` examples
            (this includes an empty ``senses``); otherwise False.
    """
    return all(len(sense.all_examples()) >= at_least for sense in senses)

In [5]:
#lemmas_with_senses[0].senses
def lemma_is_valid(senses, least_example_sentences=2):
    """
    senses: cwn senses obj (each needs a `pos` string and `all_examples()`)
    least_example_sentences: minimum number of example sentences required
    output: boolean. True as soon as one sense whose pos starts with 'N' or
            'V' has at least `least_example_sentences` examples; False when
            no sense qualifies (this includes an empty `senses`).

    NOTE(review): a single qualifying N/V sense is enough -- the remaining
    senses are not inspected by this function.
    """
    return any(
        sense.pos.startswith(('V', 'N'))
        and len(sense.all_examples()) >= least_example_sentences
        for sense in senses
    )
#has_N_V_senses(lemmas_with_senses[1005].senses)
#lemmas_with_senses[1005].senses[0].pos

In [6]:
#lemmas_with_senses[0].lemma
def lemma_has_valid_components(lemma):
    """
    lemma: str
    output: boolean. True only if, for every character of `lemma`, the CWN
            lookup '^<char>$' yields at least one entry with senses and each
            of those entries passes all_senses_has_example_sentences (called
            with its default threshold). False otherwise.
    """
    for component in lemma:
        matches = cwn.find_lemma('^' + component + '$')
        entries_with_senses = [entry for entry in matches if entry.senses]

        # The character itself must exist in CWN with at least one
        # sense-bearing entry.
        if not entries_with_senses:
            return False

        # Every sense of every matching entry needs enough example sentences.
        if not all(all_senses_has_example_sentences(entry.senses)
                   for entry in entries_with_senses):
            return False

    return True

In [7]:
# Keep the two-character lemmas that pass both filters:
#   1. lemma_is_valid: checked on the lemma's own senses.
#   2. lemma_has_valid_components: checked on each component character.
lemmas_with_components_with_senses = []

for candidate in lemmas_with_senses:
    # Run the check on the already-loaded senses first; a lemma rejected here
    # never triggers the per-character cwn.find_lemma queries below.
    if not lemma_is_valid(candidate.senses):
        continue

    # Do the component characters exist in CWN with enough example sentences?
    if lemma_has_valid_components(candidate.lemma):
        lemmas_with_components_with_senses.append(candidate)

In [8]:
# Number of lemmas that passed both the lemma-level and component-level filters.
len(lemmas_with_components_with_senses)

1955

In [9]:
# Plain lemma strings of the retained entries, in the same order.
# NOTE(review): the displayed output repeats some strings, so the list is not
# guaranteed to be duplicate-free -- confirm whether repeats are intended.
lemma_lst = [entry.lemma for entry in lemmas_with_components_with_senses]
lemma_lst

['發出',
 '油船',
 '油污',
 '水質',
 '微粒',
 '暖季',
 '微小',
 '播遷',
 '錯亂',
 '簡介',
 '能量',
 '基因',
 '做出',
 '指標',
 '那時',
 '那時',
 '典型',
 '定律',
 '定值',
 '定論',
 '入手',
 '力爭',
 '大氣',
 '大氣',
 '子孫',
 '可行',
 '外殼',
 '途徑',
 '排入',
 '排出',
 '排出',
 '有毒',
 '有害',
 '身處',
 '位處',
 '住房',
 '低溫',
 '分隔',
 '分解',
 '包裝',
 '去除',
 '引入',
 '推移',
 '推進',
 '深遠',
 '陷入',
 '減至',
 '種種',
 '照射',
 '移入',
 '污水',
 '沙泥',
 '發掘',
 '舒適',
 '冰雪',
 '龍蝦',
 '鳥類',
 '判定',
 '所及',
 '所得',
 '更少',
 '妥善',
 '作用',
 '建設',
 '發佈',
 '傾卸',
 '溶入',
 '減低',
 '減弱',
 '準線',
 '溫水',
 '秦王',
 '秦王',
 '作法',
 '國際',
 '氣候',
 '越界',
 '超載',
 '家園',
 '永續',
 '用量',
 '苔原',
 '凍原',
 '雨林',
 '健全',
 '崩解',
 '淡水',
 '淡水',
 '暖化',
 '演化',
 '負債',
 '旱林',
 '樹林',
 '樹林',
 '東西',
 '物種',
 '物種',
 '負面',
 '開球',
 '後果',
 '作為',
 '不錯',
 '水準',
 '為妙',
 '愛好',
 '半身',
 '報紙',
 '比喻',
 '敝國',
 '不等',
 '不惜',
 '不宜',
 '不予',
 '才藝',
 '慘遭',
 '纏身',
 '超出',
 '出嫁',
 '大早',
 '盜版',
 '等等',
 '弟兄',
 '電影',
 '抖出',
 '多半',
 '多少',
 '兒時',
 '而外',
 '發揮',
 '仿照',
 '分鐘',
 '豐厚',
 '鋼琴',
 '歌手',
 '給以',
 '給予',
 '更多',
 '故意',
 '罐子',
 '過後',
 '過人',
 '好比',
 '好幾',

## Write CSV

In [10]:
## all.csv
# One row per lemma: the lemma, its first character, its second character, the
# statement "<lemma> 是一種 <second character>", and an empty `response` column.
# The encoding is pinned to UTF-8 because the rows contain Chinese text and the
# platform default encoding is not guaranteed to be able to encode it.
with open('all.csv', 'w', encoding='utf-8') as f:
    f.write('lemma,c1,c2,statement,response\n')

    for lemma in lemma_lst:
        statement = lemma + ' 是一種 ' + lemma[1]
        out_str = lemma + ',' + lemma[0] + ',' + lemma[1] + ',' + statement + ',\n'
        f.write(out_str)

In [11]:
## v1.csv, v2, v3
# Deal the lemmas round-robin into three lists: positions 0, 3, 6, ... go to
# lst1, positions 1, 4, 7, ... to lst2, and positions 2, 5, 8, ... to lst3.
lst1 = lemma_lst[0::3]
lst2 = lemma_lst[1::3]
lst3 = lemma_lst[2::3]

In [12]:
def _write_statement_csv(path, lemmas):
    """Write one statement row per lemma to `path`.

    path: output file name
    lemmas: iterable of lemma strings; each must have at least two characters
            because the second character (index 1) is used in the statement
    Each row holds the lemma, its first character, its second character, the
    statement "<lemma> 是一種 <second character>", and an empty `response` column.
    """
    # UTF-8 is set explicitly: the rows contain Chinese text and the platform
    # default encoding is not guaranteed to be able to encode it.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('lemma,c1,c2,statement,response\n')

        for lemma in lemmas:
            statement = lemma + ' 是一種 ' + lemma[1]
            # The trailing empty field leaves the `response` column blank.
            f.write(','.join([lemma, lemma[0], lemma[1], statement, '']) + '\n')


# One file per sub-list, all with the same layout.
_write_statement_csv('v1.csv', lst1)
_write_statement_csv('v2.csv', lst2)
_write_statement_csv('v3.csv', lst3)