## Initializations

In [1]:
from utils import ngramGenerator, readJson, lcqmcLoader
from reda import REDA
from ngramLM import NgramLM
from random import sample, shuffle
import jieba

In [2]:
# Module-level objects shared by every cell below.
lm = NgramLM()
reda = REDA()
# Keep a copy of REDA's synonym table; `reda._syn` itself is reassigned in a later cell.
syn_dic = reda._syn.copy()
# Presumably the LCQMC training split, as a sequence of records whose first
# field is the question text — TODO confirm against lcqmcLoader.
train = lcqmcLoader('train')

In [3]:
# Useful utils funcs

def random_sample(out_num=10000):
    """Return the first field of `out_num` records drawn from the global `train`.

    Records are drawn without replacement via `random.sample`, so `out_num`
    must not exceed `len(train)` (otherwise `ValueError` is raised).
    """
    picked_records = sample(train, out_num)
    first_fields = []
    for record in picked_records:
        first_fields.append(record[0])
    return first_fields

def evaluate(eval_func, num_edits, num_choices, 
             num_run=5, num_sample=10000):
    """Run `eval_func` several times on fresh random samples and average the scores.

    Args:
        eval_func: callable taking (samples, num_edits, num_choices) and
            returning a dict with the keys 'base_accu' and 'ngram_accu'.
        num_edits: forwarded to `eval_func` as its second argument.
        num_choices: forwarded to `eval_func` as its third argument.
        num_run: number of independent runs to average over.
        num_sample: number of texts drawn with `random_sample` for each run.

    Returns:
        dict with 'base_avg' and 'ngram_avg', each rounded to 2 decimals.

    Each run's raw result is printed as it is produced.
    """
    per_run = []
    for _ in range(num_run):
        outcome = eval_func(random_sample(num_sample), num_edits, num_choices)
        print(outcome)
        per_run.append((outcome['base_accu'], outcome['ngram_accu']))

    base_total = sum(pair[0] for pair in per_run)
    ngram_total = sum(pair[1] for pair in per_run)
    return {
        'base_avg': round(base_total / num_run, 2),
        'ngram_avg': round(ngram_total / num_run, 2),
    }

### More about num_choices

Because the REDA model generates novel text at random, we need enough output sentences for both models to have a chance of seeing the original text among the candidates produced by these text editing operations. The average sentence length of LCQMC is about 6~7 words per question. Suppose that, on average, 3 words per sentence have synonyms in this experiment. Then there are (4 + 4 + 4 = 12) possible replacements when `num_edits=1`, (4 * 8 + 4 * 4 = 48) possible replacements when `num_edits=2`, and (4 * 4 * 4 = 64) possible replacements when `num_edits=3`. 

For random swap, this is: 15 possible swaps when `num_edits=1`, 15 * 15 = 225 possible swaps when `num_edits=2`, and (6! = 720) possible swaps when `num_edits=3`.

For random deletion (the sentence is first extended by as many extra tokens as will be deleted), assuming a 6-word sentence, this is: $\binom{7}{1} = 7$ possible scenarios when `num_edits=1`, $\binom{8}{2} = 28$ possible scenarios when `num_edits=2`, and $\binom{9}{3} = 84$ possible scenarios when `num_edits=3`. This is a simple combination problem. 

## Synonym Replacement Evaluation

In [4]:
def pseudo_syn_dic(syn_dic, unigram_dic, 
                   min_freq=1000, max_freq=2000, distractor_num=3):
    """Build a distractor dictionary: each word maps to itself plus fake synonyms.

    Only keys of `syn_dic` that fall inside the slice
    `list(unigram_dic)[min_freq:max_freq]` are kept. For each such word,
    `distractor_num` other words from the same slice are chosen at random,
    excluding the word itself and its real synonyms, and the resulting list
    (word + distractors) is shuffled.

    Args:
        syn_dic: mapping word -> iterable of its real synonyms.
        unigram_dic: mapping whose key order defines the slice positions
            (presumably sorted by frequency — TODO confirm against NgramLM).
        min_freq: start position of the slice (inclusive).
        max_freq: end position of the slice (exclusive).
        distractor_num: number of fake synonyms per word.

    Returns:
        dict mapping each retained word to a shuffled list of length
        `distractor_num + 1`.

    Raises:
        ValueError: if the slice does not contain `distractor_num` words that
            are neither the word itself nor one of its real synonyms.
    """
    freq_words = list(unigram_dic.keys())[min_freq: max_freq]
    # Set lookup avoids scanning the whole list once per dictionary key.
    freq_set = set(freq_words)
    # Never ask `sample` for more words than the slice holds.
    pool_size = min(20, len(freq_words))

    pseudo_dic = {}
    for word, true_syns in syn_dic.items():
        if word not in freq_set:
            continue
        excluded = set(true_syns)
        excluded.add(word)

        candidates = [w for w in sample(freq_words, pool_size) if w not in excluded]
        if len(candidates) < distractor_num:
            # The small random pool was unlucky; fall back to the whole slice.
            candidates = [w for w in freq_words if w not in excluded]

        entry = [word] + sample(candidates, distractor_num)
        shuffle(entry)
        pseudo_dic[word] = entry
    return pseudo_dic

In [5]:
# Distractor dictionary over the words at positions 1000-10000 of `lm._unigram`
# (presumably ordered by frequency — TODO confirm), with 3 fake synonyms each.
qseudo_dic = pseudo_syn_dic(syn_dic, lm._unigram, 1000, 10001, distractor_num=3)

In [6]:
# NOTE: this replaces REDA's synonym table on the shared `reda` object, so all
# later cells that use `reda` see the distractor dictionary instead.
# The original table is only available through `syn_dic` from here on.
reda._syn = qseudo_dic

In [7]:
def syn_rpl_test(samples, num_changes, num_choices):
    """Score how often each picker returns the original text after synonym replacement.

    A text is counted only when `reda._replaceable_idx` reports at least
    `num_changes` positions for its tokens. Candidates come from
    `reda.replace_syn(tokens, num_changes, num_choices)`. The baseline takes
    one candidate at random; the n-gram model takes `lm.pickBestSent(candidates)`.
    A pick is correct when its pieces, joined with no separator, equal the
    input text.

    Returns:
        dict with 'base_accu' and 'ngram_accu', each rounded to 2 decimals.

    Raises:
        ZeroDivisionError: if no text in `samples` qualifies.
    """
    counted = 0
    base_hits = 0
    ngram_hits = 0
    for text in samples:
        tokens = reda.tokenize(text)
        if len(reda._replaceable_idx(tokens)) < num_changes:
            continue
        counted += 1

        candidates = reda.replace_syn(tokens, num_changes, num_choices)
        # Random baseline is drawn first, then the language model picks.
        random_pick = sample(candidates, 1)[0]
        lm_pick = lm.pickBestSent(candidates)

        base_hits += int(''.join(random_pick) == text)
        ngram_hits += int(''.join(lm_pick) == text)

    return {
        'base_accu': round(base_hits / counted, 2),
        'ngram_accu': round(ngram_hits / counted, 2),
    }

In [8]:
# Synonym replacement: num_edits=1, num_choices=20 (defaults: 5 runs of 10,000 samples).
evaluate(syn_rpl_test, 1, 20)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w9/d_nplhzj4qx35xxlgljgdtjh0000gn/T/jieba.cache
Loading model cost 0.679 seconds.
Prefix dict has been built successfully.


{'base_accu': 0.22, 'ngram_accu': 0.88}
{'base_accu': 0.22, 'ngram_accu': 0.88}
{'base_accu': 0.22, 'ngram_accu': 0.88}
{'base_accu': 0.21, 'ngram_accu': 0.89}
{'base_accu': 0.22, 'ngram_accu': 0.88}


{'base_avg': 0.22, 'ngram_avg': 0.88}

In [9]:
# Synonym replacement: num_edits=2, num_choices=50 (defaults: 5 runs of 10,000 samples).
evaluate(syn_rpl_test, 2, 50)

{'base_accu': 0.06, 'ngram_accu': 0.8}
{'base_accu': 0.06, 'ngram_accu': 0.79}
{'base_accu': 0.07, 'ngram_accu': 0.78}
{'base_accu': 0.05, 'ngram_accu': 0.79}
{'base_accu': 0.06, 'ngram_accu': 0.8}


{'base_avg': 0.06, 'ngram_avg': 0.79}

In [10]:
# Synonym replacement: num_edits=3, num_choices=100 (defaults: 5 runs of 10,000 samples).
evaluate(syn_rpl_test, 3, 100)

{'base_accu': 0.01, 'ngram_accu': 0.64}
{'base_accu': 0.01, 'ngram_accu': 0.62}
{'base_accu': 0.02, 'ngram_accu': 0.64}
{'base_accu': 0.02, 'ngram_accu': 0.64}
{'base_accu': 0.02, 'ngram_accu': 0.64}


{'base_avg': 0.02, 'ngram_avg': 0.64}

## Random swap Evaluation

In [11]:
def ran_swap_test(samples, num_changes, num_choices):
    """Score how often each picker returns the original text after random swaps.

    Each text is tokenized with `jieba.lcut`; texts with fewer than 3 tokens
    are skipped. The tokens are first passed through
    `reda.swap_words(tokens, num_changes)`, and candidates are then requested
    with `reda.swap_words(tokens, num_changes, num_choices)` on that result.
    The baseline takes one candidate at random; the n-gram model takes
    `lm.pickBestSent(candidates)`. A pick is correct when its pieces, joined
    with no separator, equal the input text.

    Returns:
        dict with 'base_accu' and 'ngram_accu', each rounded to 2 decimals.

    Raises:
        ZeroDivisionError: if no text in `samples` has at least 3 tokens.
    """
    counted = 0
    base_hits = 0
    ngram_hits = 0
    for text in samples:
        tokens = jieba.lcut(text)
        # Sentences with fewer than 3 words are not informative for swapping.
        if len(tokens) <= 2:
            continue
        counted += 1

        tokens = reda.swap_words(tokens, num_changes)
        candidates = reda.swap_words(tokens, num_changes, num_choices)
        # Random baseline is drawn first, then the language model picks.
        random_pick = sample(candidates, 1)[0]
        lm_pick = lm.pickBestSent(candidates)

        base_hits += int(''.join(random_pick) == text)
        ngram_hits += int(''.join(lm_pick) == text)

    return {
        'base_accu': round(base_hits / counted, 2),
        'ngram_accu': round(ngram_hits / counted, 2),
    }

In [12]:
# Random swap: num_edits=1, num_choices=50 (defaults: 5 runs of 10,000 samples).
evaluate(ran_swap_test, 1, 50)

{'base_accu': 0.09, 'ngram_accu': 0.7}
{'base_accu': 0.09, 'ngram_accu': 0.69}
{'base_accu': 0.1, 'ngram_accu': 0.69}
{'base_accu': 0.09, 'ngram_accu': 0.7}
{'base_accu': 0.09, 'ngram_accu': 0.69}


{'base_avg': 0.09, 'ngram_avg': 0.69}

In [13]:
# Random swap: num_edits=2, num_choices=100 (defaults: 5 runs of 10,000 samples).
evaluate(ran_swap_test, 2, 100)

{'base_accu': 0.04, 'ngram_accu': 0.4}
{'base_accu': 0.04, 'ngram_accu': 0.41}
{'base_accu': 0.04, 'ngram_accu': 0.41}
{'base_accu': 0.04, 'ngram_accu': 0.41}
{'base_accu': 0.04, 'ngram_accu': 0.41}


{'base_avg': 0.04, 'ngram_avg': 0.41}

In [14]:
# Random swap: num_edits=3, num_choices=200 (defaults: 5 runs of 10,000 samples).
evaluate(ran_swap_test, 3, 200)

{'base_accu': 0.04, 'ngram_accu': 0.34}
{'base_accu': 0.04, 'ngram_accu': 0.34}
{'base_accu': 0.04, 'ngram_accu': 0.33}
{'base_accu': 0.04, 'ngram_accu': 0.34}
{'base_accu': 0.04, 'ngram_accu': 0.34}


{'base_avg': 0.04, 'ngram_avg': 0.34}

## Random delete Evaluation

In [15]:
def ran_delete_test(samples, num_changes, num_choices):
    """Score how often each picker returns the original text after random deletion.

    Each text is tokenized with `jieba.lcut`; texts with no more than
    `num_changes` tokens are skipped. `num_changes` tokens sampled from the
    sentence itself are appended to its end, and candidates are requested with
    `reda.delete_words(tokens, num_changes, num_choices)`. The baseline takes
    one candidate at random; the n-gram model takes
    `lm.pickBestSent(candidates)`. A pick is correct when its pieces, joined
    with no separator, equal the input text.

    Returns:
        dict with 'base_accu' and 'ngram_accu', each rounded to 2 decimals.

    Raises:
        ZeroDivisionError: if no text in `samples` qualifies.
    """
    counted = 0
    base_hits = 0
    ngram_hits = 0
    for text in samples:
        tokens = jieba.lcut(text)
        # The sentence must have more tokens than will be deleted.
        if len(tokens) <= num_changes:
            continue
        counted += 1

        # Pad with tokens drawn from the sentence itself before deleting.
        tokens.extend(sample(tokens, num_changes))
        candidates = reda.delete_words(tokens, num_changes, num_choices)
        # Random baseline is drawn first, then the language model picks.
        random_pick = sample(candidates, 1)[0]
        lm_pick = lm.pickBestSent(candidates)

        base_hits += int(''.join(random_pick) == text)
        ngram_hits += int(''.join(lm_pick) == text)

    return {
        'base_accu': round(base_hits / counted, 2),
        'ngram_accu': round(ngram_hits / counted, 2),
    }

In [16]:
# Random deletion: num_edits=1, num_choices=20 (defaults: 5 runs of 10,000 samples).
evaluate(ran_delete_test, 1, 20)

{'base_accu': 0.15, 'ngram_accu': 0.39}
{'base_accu': 0.16, 'ngram_accu': 0.39}
{'base_accu': 0.16, 'ngram_accu': 0.39}
{'base_accu': 0.16, 'ngram_accu': 0.39}
{'base_accu': 0.16, 'ngram_accu': 0.39}


{'base_avg': 0.16, 'ngram_avg': 0.39}

In [17]:
# Random deletion: num_edits=2, num_choices=50 (defaults: 5 runs of 10,000 samples).
evaluate(ran_delete_test, 2, 50)

{'base_accu': 0.05, 'ngram_accu': 0.23}
{'base_accu': 0.05, 'ngram_accu': 0.22}
{'base_accu': 0.05, 'ngram_accu': 0.22}
{'base_accu': 0.05, 'ngram_accu': 0.22}
{'base_accu': 0.05, 'ngram_accu': 0.22}


{'base_avg': 0.05, 'ngram_avg': 0.22}

In [18]:
# Random deletion: num_edits=3, num_choices=100 (defaults: 5 runs of 10,000 samples).
evaluate(ran_delete_test, 3, 100)

{'base_accu': 0.02, 'ngram_accu': 0.15}
{'base_accu': 0.02, 'ngram_accu': 0.15}
{'base_accu': 0.02, 'ngram_accu': 0.15}
{'base_accu': 0.02, 'ngram_accu': 0.14}
{'base_accu': 0.02, 'ngram_accu': 0.14}


{'base_avg': 0.02, 'ngram_avg': 0.15}

## Additional evaluation methods for random swap

Do not change anything. Simply randomly swap the word order two times and check which output text is closer to the original one, as measured by n-gram overlap and edit distance.

## Ngramoverlap

In [19]:
def ngramOverlapCoef(tokens1, tokens2, ngram, digits=2):
    """Return the n-gram overlap between two token sequences.

    Both inputs are expanded with `ngramGenerator(tokens, ngram)`. Every n-gram
    of the first sequence that still has an unmatched counterpart in the second
    counts as shared (each counterpart is consumed once). The shared count is
    divided by the mean number of n-grams of the two sequences.

    Args:
        tokens1: first token sequence.
        tokens2: second token sequence.
        ngram: n-gram order passed to `ngramGenerator`.
        digits: number of decimals the result is rounded to.

    Raises:
        ZeroDivisionError: if neither sequence yields any n-gram.
    """
    grams_a = ngramGenerator(tokens1, ngram)
    grams_b = ngramGenerator(tokens2, ngram)
    # Measured before the loop, because matched items are removed from grams_b.
    mean_len = (len(grams_a) + len(grams_b)) / 2

    matched = 0
    for gram in grams_a:
        if gram not in grams_b:
            continue
        grams_b.remove(gram)
        matched += 1
    return round(matched / mean_len, digits)

In [20]:
def ran_swap_test2(samples, num_changes, num_choices, ngram=2):
    """Compare both pickers by n-gram overlap with the original after random swaps.

    Each text is tokenized with `jieba.lcut`; texts with fewer than 3 tokens
    are skipped. Candidates come from
    `reda.swap_words(tokens, num_changes, num_choices)`. The baseline takes one
    candidate at random; the n-gram model takes the first element of
    `lm.pickBestSent(candidates, out_str=False)`. Each pick is scored against
    the original tokens with `ngramOverlapCoef(..., ngram)`.

    Returns:
        dict with the unrounded mean scores under 'base_avg' and 'ngram_avg'.

    Raises:
        AssertionError: if `num_changes` is not greater than 1.
        ZeroDivisionError: if no text in `samples` has at least 3 tokens.
    """
    assert num_changes > 1, 'num_changes should be at least greater than 1'
    base_scores, lm_scores = [], []
    for text in samples:
        tokens = jieba.lcut(text)
        # Sentences with fewer than 3 words are not informative for swapping.
        if len(tokens) <= 2:
            continue
        candidates = reda.swap_words(tokens, num_changes, num_choices)
        # Random baseline is drawn first, then the language model picks.
        random_pick = sample(candidates, 1)[0]
        lm_pick = lm.pickBestSent(candidates, out_str=False)[0]
        base_scores.append(ngramOverlapCoef(tokens, random_pick, ngram))
        lm_scores.append(ngramOverlapCoef(tokens, lm_pick, ngram))

    return {
        'base_avg': sum(base_scores) / len(base_scores),
        'ngram_avg': sum(lm_scores) / len(lm_scores),
    }

In [21]:
# Bigram overlap (default ngram=2) on 10,000 random samples: num_changes=2, num_choices=50.
ran_swap_test2(random_sample(), 2, 50)

{'base_avg': 0.2890173177607697, 'ngram_avg': 0.7676409585179205}

## Edit Distance

In [22]:
def editDistDP(str1, str2, digits=2):
    """Return the edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1. The inputs only need
    `len()` and indexing, so strings and token lists both work.

    Args:
        str1: first sequence.
        str2: second sequence.
        digits: accepted but not used by the computation.

    Returns:
        int: the minimum number of edits turning `str1` into `str2`.
    """
    rows, cols = len(str1) + 1, len(str2) + 1
    table = [[0] * cols for _ in range(rows)]

    # Turning a prefix into the empty sequence (or back) costs its length.
    for r in range(rows):
        table[r][0] = r
    for c in range(cols):
        table[0][c] = c

    for r in range(1, rows):
        for c in range(1, cols):
            if str1[r - 1] == str2[c - 1]:
                table[r][c] = table[r - 1][c - 1]
                continue
            insert_cost = table[r][c - 1]
            remove_cost = table[r - 1][c]
            replace_cost = table[r - 1][c - 1]
            table[r][c] = 1 + min(insert_cost, remove_cost, replace_cost)

    return table[-1][-1]

In [23]:
def ran_swap_test3(samples, num_changes, num_choices, ngram=2):
    """Compare both pickers by edit distance to the original after random swaps.

    Each text is tokenized with `jieba.lcut`; texts with fewer than 3 tokens
    are skipped. Candidates come from
    `reda.swap_words(tokens, num_changes, num_choices)`. The baseline takes one
    candidate at random; the n-gram model takes the first element of
    `lm.pickBestSent(candidates, out_str=False)`. Each pick is scored against
    the original tokens with `editDistDP` (lower is closer).

    Args:
        samples: iterable of raw texts.
        num_changes: number of swaps per candidate; must be greater than 1.
        num_choices: number of candidates requested from `reda.swap_words`.
        ngram: unused; kept so existing calls that pass it keep working.
            It used to be forwarded into `editDistDP`'s `digits` slot, where
            it had no effect on the distance.

    Returns:
        dict with the mean distances under 'base_avg' and 'ngram_avg'.

    Raises:
        AssertionError: if `num_changes` is not greater than 1.
        ZeroDivisionError: if no text in `samples` has at least 3 tokens.
    """
    assert num_changes > 1, 'num_changes should be at least greater than 1'
    res_base = []
    res_ngram = []
    for text in samples:
        tokens = jieba.lcut(text)
        # Sentences with fewer than 3 words are not informative for swapping.
        if len(tokens) > 2:
            choices = reda.swap_words(tokens, num_changes, num_choices)
            # Random baseline is drawn first, then the language model picks.
            reda_text = sample(choices, 1)[0]
            ngram_text = lm.pickBestSent(choices, out_str=False)[0]
            res_base.append(editDistDP(tokens, reda_text))
            res_ngram.append(editDistDP(tokens, ngram_text))

    base_avg = sum(res_base) / len(res_base)
    ngram_avg = sum(res_ngram) / len(res_ngram)
    return {'base_avg': base_avg, 'ngram_avg': ngram_avg}

In [24]:
# Token-level edit distance on 10,000 random samples: num_changes=2, num_choices=20.
ran_swap_test3(random_sample(), 2, 20)

{'base_avg': 2.9825020112630733, 'ngram_avg': 1.3791230893000805}