In [2]:
# Input files for the comparison. These are absolute paths on the original
# author's machine; adjust them before re-running elsewhere.
(dev_file, bidaf_file, prod_file) = (
    # JSON dev set consumed by ParseJson
    '/home/t-honli/data/EQnA/dev-v1.1.json',
    # JSON with BiDAF answers keyed by id, plus a 'scores' mapping
    '/home/t-honli/bi-att-flow/out/EQnA/no_sent_token/01-08-2017/answer/test-020000.json',
    # header-less TSV produced by the production model
    '/home/t-honli/data/EQnA/EQnA_Highlighting_Test_ProdModel.tsv',
)
    
# Flatten the dev-set JSON into four parallel lists
def ParseJson(data_json):
    """Extract ids, contexts, questions and gold answers from the dev JSON.

    Only ``data_json['data'][0]`` is read, and for every paragraph only the
    first item of its ``qas`` list is used.

    Returns:
        A 4-tuple of equally long lists ``(ids, contexts, questions, answers)``.
        Each element of ``answers`` is the paragraph's answer texts joined
        with ``'|||'``.
    """
    records = []
    for paragraph in data_json['data'][0]['paragraphs']:
        passage = paragraph['context']
        first_qa = paragraph['qas'][0]
        qa_id = first_qa['id']
        question = first_qa['question']
        gold = '|||'.join(answer['text'] for answer in first_qa['answers'])
        records.append((qa_id, passage, question, gold))

    if not records:
        return [], [], [], []

    # Transpose the per-paragraph records into one list per field.
    ids, contexts, questions, golds = (list(column) for column in zip(*records))
    return ids, contexts, questions, golds

# dev data
import json

import pandas as pd

# Read the dev set inside a context manager so the file handle is closed as
# soon as parsing finishes (a bare open() inside json.load() leaks it), and
# decode explicitly as UTF-8 rather than relying on the locale default.
with open(dev_file, "r", encoding="utf-8") as dev_fp:
    dev_data = json.load(dev_fp)

idx, context, ques, ans = ParseJson(dev_data)

# One row per paragraph; the column order is fixed explicitly.
dev_pd = pd.DataFrame(
    {'id': idx, 'context': context, 'question': ques, 'ground_truth': ans},
    columns=['id', 'context', 'question', 'ground_truth'],
)

# bidaf ans
import json
import os

# Load the BiDAF predictions inside a context manager so the file handle is
# closed deterministically (a bare open() inside json.load() leaks it).
with open(bidaf_file, "r", encoding="utf-8") as bidaf_fp:
    bidaf = json.load(bidaf_fp)


def _bidaf_spans(row):
    """Build the 'phrase:::score' list for one dev row, joined by '|||'.

    Phrases come from ``bidaf[id]`` and scores from ``bidaf['scores'][id]``;
    both are '|||'-separated and are paired by position.
    """
    phrases = str(bidaf[row['id']]).split('|||')
    scores = str(bidaf['scores'][row['id']]).split('|||')
    # zip() stops at the shorter list, so unpaired items are dropped.
    return '|||'.join(phrase + ':::' + score for phrase, score in zip(phrases, scores))


dev_pd['bidaf'] = dev_pd.apply(_bidaf_spans, axis=1)

# prod model
import pandas as pd

# Column layout of the header-less production-model TSV.
col_names = [
    'id',
    'Query',
    'Url',
    'Answer',
    'AnswerTokenList',
    'ParaseSpan',
    'phrase',
    'Label',
    'Probability',
]

# Every column is read as text; missing cells become empty strings.
prod = pd.read_csv(
    prod_file,
    header=None,
    sep='\t',
    names=col_names,
    dtype=str,
)
prod = prod.fillna('')
# Generate hash_id from the answer text and the query (the URL is not part of the hash)
import re
def getAnswerByTokenList(s):
    """Rebuild a plain answer string from a serialized token list.

    The surrounding square brackets are stripped, the text is split on the
    ``","`` separators, each piece is whitespace-trimmed and the pieces are
    joined with single spaces. Escaped quotes (backslash + double quote,
    then backslash + single quote) are un-escaped, and finally any double
    quotes left at either end of the result are stripped.
    """
    text = ' '.join(piece.strip() for piece in s.strip('[]').split('\",\"'))
    # Un-escape double quotes first, then single quotes.
    for pattern, plain in ((r'\\\"', '\"'), (r'\\\'', '\'')):
        text = re.sub(pattern, plain, text)
    return text.strip('\"')
import hashlib
def GetHashCode(context):
    """Return the hexadecimal MD5 digest of ``context`` encoded as UTF-8."""
    return hashlib.md5(context.encode('utf-8')).hexdigest()
def _row_hash_id(row):
    """Hash the reconstructed answer text plus the query for one prod row."""
    answer_text = getAnswerByTokenList(row['AnswerTokenList'].strip())
    return GetHashCode(answer_text + ' ' + row['Query'].strip())


prod['hash_id'] = prod.apply(_row_hash_id, axis=1)

# Drop phrases whose length is >= 32 characters (keep only len < 32).
keep_mask = prod.apply(lambda row: len(row['phrase']) < 32, axis=1)
prod = prod[keep_mask]
# Collect, per hash_id, the list of 'phrase:::score' strings
multi_phrase_dict = {}


def getPhrase(row):
    """Append one row's 'phrase:::probability' entry to multi_phrase_dict.

    The entry is stored under ``row['hash_id']``. ``row['Probability']`` is
    rewritten in place with four decimal places before being used, so the
    passed-in row is modified. Returns None.
    """
    bucket = multi_phrase_dict.setdefault(row['hash_id'], [])
    row['Probability'] = '%.4f' % float(row['Probability'])
    bucket.append(row['phrase'] + ":::" + row['Probability'])
# Run getPhrase over every prod row to fill multi_phrase_dict.
prod.apply(getPhrase, axis=1)

# Order each id's entries by the numeric score after the last ':' (highest first).
for idx in list(multi_phrase_dict.keys()):
    entries = multi_phrase_dict[idx]
    entries.sort(key=lambda entry: float(entry.rpartition(':')[2]), reverse=True)


def _joined_prod_spans(row):
    """Join the sorted prod entries of a dev row with '|||'.

    Raises KeyError when the row's id has no entry in multi_phrase_dict.
    """
    return '|||'.join(multi_phrase_dict[row['id']])


dev_pd['prod'] = dev_pd.apply(_joined_prod_spans, axis=1)
    

# Compute P/R between the ground-truth answer set and the bidaf answer set, macro-averaged
# P = intersection/bidaf
# R = intersection/ground_truth
# Matching uses the "cover" approach: a prediction counts when it contains, or is contained in, a ground truth

# Threshold that decides whether multiple answers are selected
# choose_multi_answer_threshold = 0.4
# Threshold for the second and later answers
# multi_answer_threshold = 0.5
# Threshold for selecting multiple answers from prod
# prod_answer_threshold=0.6

choose_multi_answer_threshold = 0.4
multi_answer_threshold = 0.5
prod_answer_threshold=0.6

import string
import re
def normalize_answer(s):
    """Normalize an answer string for comparison.

    Steps, in order: lowercase, drop every character found in
    ``string.punctuation``, replace the whole words a/an/the with a space,
    and collapse runs of whitespace into single spaces (trimming the ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punc = ''.join(char for char in lowered if char not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)
    return ' '.join(without_articles.split())

# with tricks
# The score of the first slot decides whether the row is treated as
# multi-answer (choose_multi_answer_threshold = 0.4). Open question from the
# author: should later answers be selected with score > 0.36 instead?
def getPandR(row):
    """Per-row precision/recall of the BiDAF spans, using the first-slot trick.

    Args:
        row: mapping with 'bidaf' ('phrase:::score' items joined by '|||')
            and 'ground_truth' (answers joined by '|||').

    Returns:
        ``[precision, recall]`` where precision = hits / selected predictions
        and recall = hits / ground-truth answers. A hit is a selected
        prediction that contains, or is contained in, some ground truth
        after normalize_answer().

    Strings that normalize to '' never count as a match: the empty string
    is a substring of every string, so without this guard a prediction such
    as "the" would be scored as correct against any ground truth.

    NOTE(review): recall divides the number of matching predictions by the
    number of ground truths, so it can exceed 1.0 when several predictions
    match the same ground truth.
    """
    spans = row['bidaf'].split('|||')
    grounds = row['ground_truth'].split('|||')

    first_parts = spans[0].split(':::')
    bidafs = [first_parts[0]]
    first_is_confident = float(first_parts[1]) > choose_multi_answer_threshold
    if not first_is_confident:
        # Unsure about the first slot: keep following spans while their
        # score stays above multi_answer_threshold, stop at the first miss.
        for span in spans[1:]:
            ans, score = span.split(':::')
            if float(score) > multi_answer_threshold:
                bidafs.append(ans)
            else:
                break

    # Normalize each string once instead of on every inner-loop iteration.
    norm_grounds = [normalize_answer(ground) for ground in grounds]
    inter = 0
    for bidaf in bidafs:
        norm_bidaf = normalize_answer(bidaf)
        if not norm_bidaf:
            continue
        if any(g and (norm_bidaf in g or g in norm_bidaf) for g in norm_grounds):
            inter += 1
    return [1.0*inter/len(bidafs), 1.0*inter/len(grounds)]

# without tricks: at least one answer is always selected
def getPandR_withoutTricks(row):
    """Per-row precision/recall of the BiDAF spans, without the first-slot trick.

    The first span is always selected; following spans are added while their
    score stays above multi_answer_threshold, stopping at the first miss.

    Args:
        row: mapping with 'bidaf' ('phrase:::score' items joined by '|||')
            and 'ground_truth' (answers joined by '|||').

    Returns:
        ``[precision, recall]`` = [hits / selected predictions,
        hits / ground-truth answers]. A hit is a selected prediction that
        contains, or is contained in, some ground truth after
        normalize_answer().

    Strings that normalize to '' never count as a match: the empty string
    is a substring of every string, so without this guard a prediction such
    as "the" would be scored as correct against any ground truth.

    NOTE(review): recall can exceed 1.0 when several predictions match the
    same ground truth, because hits are counted per prediction.
    """
    grounds = row['ground_truth'].split('|||')
    bidafs = []
    for idx, span in enumerate(row['bidaf'].split('|||')):
        ans, score = span.split(':::')
        # The score is only consulted from the second span onward.
        if idx > 0 and not float(score) > multi_answer_threshold:
            break
        bidafs.append(ans)

    # Normalize each string once instead of on every inner-loop iteration.
    norm_grounds = [normalize_answer(ground) for ground in grounds]
    inter = 0
    for bidaf in bidafs:
        norm_bidaf = normalize_answer(bidaf)
        if not norm_bidaf:
            continue
        if any(g and (norm_bidaf in g or g in norm_bidaf) for g in norm_grounds):
            inter += 1
    return [1.0*inter/len(bidafs), 1.0*inter/len(grounds)]

# threshold = 0.6 (prod_answer_threshold)
def getPandR_prod(row):
    """Per-row precision/recall of the production-model phrases.

    The first phrase is always selected; following phrases are added while
    their score stays above prod_answer_threshold, stopping at the first miss.

    Args:
        row: mapping with 'prod' ('phrase:::score' items joined by '|||')
            and 'ground_truth' (answers joined by '|||').

    Returns:
        ``[precision, recall]`` = [hits / selected phrases,
        hits / ground-truth answers]. A hit is a selected phrase that
        contains, or is contained in, some ground truth after
        normalize_answer().

    Strings that normalize to '' never count as a match: the empty string
    is a substring of every string, so without this guard a phrase such as
    "the" would be scored as correct against any ground truth.

    NOTE(review): recall can exceed 1.0 when several phrases match the same
    ground truth, because hits are counted per phrase.
    """
    grounds = row['ground_truth'].split('|||')
    prods = []
    for idx, span in enumerate(row['prod'].split('|||')):
        ans, score = span.split(':::')
        # The score is only consulted from the second phrase onward.
        if idx > 0 and not float(score) > prod_answer_threshold:
            break
        prods.append(ans)

    # Normalize each string once instead of on every inner-loop iteration.
    norm_grounds = [normalize_answer(ground) for ground in grounds]
    inter = 0
    for prod in prods:
        norm_prod = normalize_answer(prod)
        if not norm_prod:
            continue
        if any(g and (norm_prod in g or g in norm_prod) for g in norm_grounds):
            inter += 1
    return [1.0*inter/len(prods), 1.0*inter/len(grounds)]

# Macro average: mean of the per-row precision / recall values.
# F1 is reported as 0.0 when P + R == 0 instead of raising ZeroDivisionError.

# BiDAF, with the first-slot trick
bidaf_P, bidaf_R = zip(*dev_pd.apply(getPandR, axis=1).values)
P = sum(bidaf_P)/len(dev_pd)
R = sum(bidaf_R)/len(dev_pd)
print('Macro Average:')
print('Tricks on choose multi answer: choose_multi_answer_threshold = {}, multi_answer_threshold = {}'.format(choose_multi_answer_threshold, multi_answer_threshold))
print('Bidaf P: ', P)
print('Bidaf R: ', R)
print('Bidaf F1: ', 2*P*R/(P+R) if (P+R) else 0.0)
print()

# BiDAF, without the trick
bidaf_P, bidaf_R = zip(*dev_pd.apply(getPandR_withoutTricks, axis=1).values)
P = sum(bidaf_P)/len(dev_pd)
R = sum(bidaf_R)/len(dev_pd)
print('Normal: choose first answer, multi_answer_threshold = {}'.format(multi_answer_threshold))
print('Bidaf P: ', P)
print('Bidaf R: ', R)
print('Bidaf F1: ', 2*P*R/(P+R) if (P+R) else 0.0)
print()

# Production model. The labels say "Prod": these lines previously printed
# "Bidaf ..." although the numbers come from getPandR_prod.
prod_P, prod_R = zip(*dev_pd.apply(getPandR_prod, axis=1).values)
P = sum(prod_P)/len(dev_pd)
R = sum(prod_R)/len(dev_pd)
print('Prod: choose first answer, multi_answer_threshold = {}'.format(prod_answer_threshold))
print('Prod P: ', P)
print('Prod R: ', R)
print('Prod F1: ', 2*P*R/(P+R) if (P+R) else 0.0)
print()


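# Compute P/R between the ground-truth answer set and the bidaf answer set, micro-averaged
# P = intersection/bidaf
# R = intersection/ground_truth
# Matching uses the "cover" approach: a prediction counts when it contains, or is contained in, a ground truth

# Threshold that decides whether multiple answers are selected
# choose_multi_answer_threshold = 0.4
# Threshold for the second and later answers
# multi_answer_threshold = 0.5
# Threshold for selecting multiple answers from prod
# prod_answer_threshold=0.6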

import string
import re
def normalize_answer(s):
    """Normalize an answer string for comparison.

    Behaves the same as the normalize_answer defined earlier in this cell:
    lowercase, drop every character found in ``string.punctuation``, replace
    the whole words a/an/the with a space, then collapse runs of whitespace
    into single spaces (trimming the ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punc = ''.join(char for char in lowered if char not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc)
    return ' '.join(without_articles.split())

# with tricks
# The score of the first slot decides whether the row is treated as
# multi-answer (choose_multi_answer_threshold = 0.4). The original note also
# says "select at most 3 answers"; NOTE(review): no such cap is enforced
# here - presumably the input never holds more than 3 spans, verify.
# Open question from the author: should later answers be selected with
# score > 0.36 instead?

def getPandR(row):
    """Per-row precision/recall of the BiDAF spans (first-slot trick), also
    feeding the micro-average accumulators.

    Side effects: adds this row's hit count, number of selected predictions
    and number of ground truths to the module-level ``inter_num``,
    ``bidaf_num`` and ``ground_num``; these must be initialised before the
    first call.

    Args:
        row: mapping with 'bidaf' ('phrase:::score' items joined by '|||')
            and 'ground_truth' (answers joined by '|||').

    Returns:
        ``[precision, recall]`` = [hits / selected predictions,
        hits / ground-truth answers]. A hit is a selected prediction that
        contains, or is contained in, some ground truth after
        normalize_answer().

    Strings that normalize to '' never count as a match: the empty string
    is a substring of every string, so without this guard a prediction such
    as "the" would be scored as correct against any ground truth.

    NOTE(review): hits are counted per prediction, so recall can exceed 1.0
    when several predictions match the same ground truth.
    """
    global inter_num, bidaf_num, ground_num
    spans = row['bidaf'].split('|||')
    grounds = row['ground_truth'].split('|||')

    first_parts = spans[0].split(':::')
    bidafs = [first_parts[0]]
    first_is_confident = float(first_parts[1]) > choose_multi_answer_threshold
    if not first_is_confident:
        # Unsure about the first slot: keep following spans while their
        # score stays above multi_answer_threshold, stop at the first miss.
        for span in spans[1:]:
            ans, score = span.split(':::')
            if float(score) > multi_answer_threshold:
                bidafs.append(ans)
            else:
                break

    # Normalize each string once instead of on every inner-loop iteration.
    norm_grounds = [normalize_answer(ground) for ground in grounds]
    inter = 0
    for bidaf in bidafs:
        norm_bidaf = normalize_answer(bidaf)
        if not norm_bidaf:
            continue
        if any(g and (norm_bidaf in g or g in norm_bidaf) for g in norm_grounds):
            inter += 1

    inter_num += inter
    bidaf_num += len(bidafs)
    ground_num += len(grounds)

    return [1.0*inter/len(bidafs), 1.0*inter/len(grounds)]

# without tricks: at least one answer is always selected
def getPandR_withoutTricks(row):
    """Per-row precision/recall of the BiDAF spans (no first-slot trick),
    also feeding the micro-average accumulators.

    The first span is always selected; following spans are added while their
    score stays above multi_answer_threshold, stopping at the first miss.

    Side effects: adds this row's hit count, number of selected predictions
    and number of ground truths to the module-level ``inter_num``,
    ``bidaf_num`` and ``ground_num``; these must be initialised before the
    first call.

    Args:
        row: mapping with 'bidaf' ('phrase:::score' items joined by '|||')
            and 'ground_truth' (answers joined by '|||').

    Returns:
        ``[precision, recall]`` = [hits / selected predictions,
        hits / ground-truth answers].

    Strings that normalize to '' never count as a match: the empty string
    is a substring of every string, so without this guard a prediction such
    as "the" would be scored as correct against any ground truth.

    NOTE(review): hits are counted per prediction, so recall can exceed 1.0
    when several predictions match the same ground truth.
    """
    global inter_num, bidaf_num, ground_num
    grounds = row['ground_truth'].split('|||')
    bidafs = []
    for idx, span in enumerate(row['bidaf'].split('|||')):
        ans, score = span.split(':::')
        # The score is only consulted from the second span onward.
        if idx > 0 and not float(score) > multi_answer_threshold:
            break
        bidafs.append(ans)

    # Normalize each string once instead of on every inner-loop iteration.
    norm_grounds = [normalize_answer(ground) for ground in grounds]
    inter = 0
    for bidaf in bidafs:
        norm_bidaf = normalize_answer(bidaf)
        if not norm_bidaf:
            continue
        if any(g and (norm_bidaf in g or g in norm_bidaf) for g in norm_grounds):
            inter += 1

    inter_num += inter
    bidaf_num += len(bidafs)
    ground_num += len(grounds)
    return [1.0*inter/len(bidafs), 1.0*inter/len(grounds)]

def getPandR_prod(row):
    """Per-row precision/recall of the production-model phrases, also
    feeding the micro-average accumulators.

    The first phrase is always selected; following phrases are added while
    their score stays above prod_answer_threshold, stopping at the first miss.

    Side effects: adds this row's hit count, number of selected phrases and
    number of ground truths to the module-level ``inter_num``, ``bidaf_num``
    (shared counter name, here counting prod phrases) and ``ground_num``;
    these must be initialised before the first call.

    Args:
        row: mapping with 'prod' ('phrase:::score' items joined by '|||')
            and 'ground_truth' (answers joined by '|||').

    Returns:
        ``[precision, recall]`` = [hits / selected phrases,
        hits / ground-truth answers].

    Strings that normalize to '' never count as a match: the empty string
    is a substring of every string, so without this guard a phrase such as
    "the" would be scored as correct against any ground truth.

    NOTE(review): hits are counted per phrase, so recall can exceed 1.0 when
    several phrases match the same ground truth.
    """
    global inter_num, bidaf_num, ground_num
    grounds = row['ground_truth'].split('|||')
    prods = []
    for idx, span in enumerate(row['prod'].split('|||')):
        ans, score = span.split(':::')
        # The score is only consulted from the second phrase onward.
        if idx > 0 and not float(score) > prod_answer_threshold:
            break
        prods.append(ans)

    # Normalize each string once instead of on every inner-loop iteration.
    norm_grounds = [normalize_answer(ground) for ground in grounds]
    inter = 0
    for prod in prods:
        norm_prod = normalize_answer(prod)
        if not norm_prod:
            continue
        if any(g and (norm_prod in g or g in norm_prod) for g in norm_grounds):
            inter += 1

    inter_num += inter
    bidaf_num += len(prods)
    ground_num += len(grounds)
    return [1.0*inter/len(prods), 1.0*inter/len(grounds)]

# Micro average: totals are accumulated across all rows by the getPandR*
# functions (through their `global` counters), so the counters are reset
# before each pass. F1 is reported as 0.0 when P + R == 0 instead of raising
# ZeroDivisionError.

# BiDAF, with the first-slot trick
inter_num = 0
ground_num = 0
bidaf_num = 0
bidaf_P, bidaf_R = zip(*dev_pd.apply(getPandR, axis=1).values)
P = 1.0*inter_num/bidaf_num
R = 1.0*inter_num/ground_num

print('Micro Average:')
print('Tricks on choose multi answer: choose_multi_answer_threshold = {}, multi_answer_threshold = {}'.format(choose_multi_answer_threshold, multi_answer_threshold))
print('Bidaf P: ', P)
print('Bidaf R: ', R)
print('Bidaf F1: ', 2*P*R/(P+R) if (P+R) else 0.0)
print()

# BiDAF, without the trick
inter_num = 0
ground_num = 0
bidaf_num = 0
bidaf_P, bidaf_R = zip(*dev_pd.apply(getPandR_withoutTricks, axis=1).values)
P = 1.0*inter_num/bidaf_num
R = 1.0*inter_num/ground_num
print('Normal: choose first answer, multi_answer_threshold = {}'.format(multi_answer_threshold))
print('Bidaf P: ', P)
print('Bidaf R: ', R)
print('Bidaf F1: ', 2*P*R/(P+R) if (P+R) else 0.0)
print()

# Production model. The labels say "Prod": these lines previously printed
# "Bidaf ..." although the numbers come from getPandR_prod.
inter_num = 0
ground_num = 0
bidaf_num = 0
prod_P, prod_R = zip(*dev_pd.apply(getPandR_prod, axis=1).values)
P = 1.0*inter_num/bidaf_num
R = 1.0*inter_num/ground_num
print('Prod: choose first answer, multi_answer_threshold = {}'.format(prod_answer_threshold))
print('Prod P: ', P)
print('Prod R: ', R)
print('Prod F1: ', 2*P*R/(P+R) if (P+R) else 0.0)



Macro Average:
Tricks on choose multi answer: choose_multi_answer_threshold = 0.4, multi_answer_threshold = 0.5
Bidaf P:  0.8565891472868218
Bidaf R:  0.7233102070644822
Bidaf F1:  0.7843280292340071

Normal: choose first answer, multi_answer_threshold = 0.5
Bidaf P:  0.7977898730001648
Bidaf R:  0.7713062486280633
Bidaf F1:  0.7843245619633379

Prod: choose first answer, multi_answer_threshold = 0.6
Bidaf P:  0.8475931292853612
Bidaf R:  0.8547015854203428
Bidaf F1:  0.8511325155783384

Micro Average:
Tricks on choose multi answer: choose_multi_answer_threshold = 0.4, multi_answer_threshold = 0.5
Bidaf P:  0.83737760749255
Bidaf R:  0.5558067250635773
Bidaf F1:  0.6681385869565217

Normal: choose first answer, multi_answer_threshold = 0.5
Bidaf P:  0.7507173601147776
Bidaf R:  0.5914100028256569
Bidaf F1:  0.661608977398451

Prod: choose first answer, multi_answer_threshold = 0.6
Bidaf P:  0.8365356192728347
Bidaf R:  0.7996609211641706
Bidaf F1:  0.8176827506501011


In [3]:
# Preview the first rows of the comparison table (dev columns plus the 'bidaf' and 'prod' span strings).
dev_pd.head()

Unnamed: 0,id,context,question,ground_truth,bidaf,prod
0,2abdda92eac7f6b73d9b238f503e4219,Start Up & Rev of a 2006 Acura RL SH - AWD.Thi...,2006 acura rl horsepower,290 HP,290 HP:::0.3702|||3.5L:::0.6210|||V6:::0.5607,290 HP:::0.8149|||V6:::0.4651|||AWD.This:::0.3...
1,f26402275d1f070d15e951a3ec04dacb,The income tax withholding rate remains at 4.2...,2015 michigan withholding tax rate,4.25 %,"4.25 %:::0.9451|||$4,000:::0.9262|||for tax:::...","4.25 %:::0.8581|||$4,000:::0.6614|||2014:::0.0..."
2,125674965d00b4087f076223745f1ad5,The experts at Vidal Sassoon bring you salon c...,5rr hair color,Red,gray:::0.5498|||5RR:::0.3446|||100 percent:::0...,Red:::0.9268|||gray:::0.9099|||Vidal Sassoon::...
3,b153122979cdb753b1466ac8145758c9,Effects depend on the substance : hydrogen per...,acid that burns through skin,nitric acid|||hydrogen peroxide,hydrogen peroxide:::0.4090|||nitric acid:::0.9...,hydrogen peroxide:::0.4230|||nitric acid:::0.2...
4,f5d4a08ddf0aba6cab1ea9ae6bd1f26b,"Actor Nikolaj Coster - Waldau , the man who pl...",actor who plays jamie lannister,Nikolaj Coster|||Waldau,Waldau:::0.3298|||Nikolaj Coster:::0.9909|||Ac...,Jaime:::0.7372|||Nikolaj Coster:::0.7275|||Wal...
