In [2]:
from tqdm import tqdm
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def cosine_sim(text1, text2):
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0][1]

def find_similar_x(str1, strlist):
    scores = []
    for i in strlist:
        score = cosine_sim(str1, i)
        scores.append(score)
    if max(scores)>0.5:
        idx = scores.index(max(scores))
    else:
        idx = -1
    return idx

def extract_spans_extraction(seq, label_len=2):
    extractions = []
    extractions_with_aspect = []
    all_pt = seq.split('; ')

    # print ("<<< all_pt", all_pt)
    for pt in all_pt:
        if label_len == 4:
            try:
                a, b, c, d = pt[1:-1].split(', ')
            except ValueError:
                continue
        extractions.append((a))
        extractions_with_aspect.append((a, b, c, d, a + ' ' + c))
    return extractions_with_aspect


def extract(preds, labels):
    same = []
    same_2 = []
    compre_str = [i[4] for i in preds]
    for i in range(len(labels)):
        target = labels[i]
        target_str = labels[i][4]
        similar_idx = find_similar_x(target_str, compre_str)
        if similar_idx == -1:
            same.append('none')
            same_2.append('none')
        else:
            same.append(preds[similar_idx][1])
            same_2.append(preds[similar_idx])
    return same, same_2

def main(df):
    ### 标签混淆性分析
    df_all = pd.DataFrame({'sent': [],
                           'label_all': [],
                           'label': [],
                           'pred_cate': [],
                           'pred': []})

    for i in tqdm(range(len(df['outputs']))):
        preds = extract_spans_extraction(df['outputs'][i], 4)
        labels = extract_spans_extraction(df['targets'][i], 4)
        same_tag, same_pred = extract(preds, labels)
        str_sent = ' '.join(df['sents'][i])
        sents = [str_sent] * len(same_tag)

        df_res = pd.DataFrame({'sent': sents,
                               'label_all': labels,
                               'label': [i[1] for i in labels],
                               'pred_cate': same_tag,
                               'pred': same_pred})

        df_all = pd.concat([df_all, df_res])
    return df_all

In [3]:
df = pd.read_pickle('results-tasd-huabao1023-extraction-allres.pickle')
res = main(df)

100%|██████████| 37/37 [00:00<00:00, 69.27it/s]


In [4]:
res.head()

Unnamed: 0,sent,label_all,label,pred_cate,pred
0,I purchased two of these units due to the adve...,"(Jackery Connector, Compatibility/adaptability...",Compatibility/adaptability,Compatibility/adaptability,"(Jackery Connector, Compatibility/adaptability..."
0,This is a well designed and built portable Li+...,"(power station, Appearance, well designed and ...",Appearance,Portability,"(portable Li+ power station, Portability, well..."
1,This is a well designed and built portable Li+...,"(AC output loads, Output performance, 110 volt...",Output performance,Output performance,"(110 volt AC output loads, Output performance,..."
2,This is a well designed and built portable Li+...,"(cords and power brick, Charging accessories, ...",Charging accessories,Charging accessories,"(power brick, Charging accessories, power bric..."
3,This is a well designed and built portable Li+...,"(AC or 12 volt DC sources, Charging accessorie...",Charging accessories,Output performance,"(DC sources, Output performance, 12 volt DC so..."


In [6]:
def get_summary(df_all):
    df_all['pred_cate'] = df_all['pred_cate'].map(lambda x: x.replace('\"',''))
    sizes = df_all.groupby(['label', 'pred_cate']).size().reset_index()
    sizes_2 = df_all.groupby(['label']).size().reset_index()
    sizes_2.columns = ['label','pred_num_total']
    sizes.columns = ['label','pred_cate','pred_num']
    size_all = pd.merge(sizes,sizes_2,on='label')
    size_all['percent'] = size_all['pred_num']/size_all['pred_num_total']
    return size_all

summary = get_summary(res)

In [7]:
summary.head()

Unnamed: 0,label,pred_cate,pred_num,pred_num_total,percent
0,Appearance,Portability,1,1,1.0
1,Appearance design,Portability,2,3,0.666667
2,Appearance design,none,1,3,0.333333
3,Battery capacity,Battery capacity,3,14,0.214286
4,Battery capacity,Battery cell,2,14,0.142857


In [8]:
pred_seqs = df['outputs']
gold_seqs = df['targets']
all_labels, all_predictions = [], []
all_labels_with_tag = []
all_predictions_with_tag = []
num_samples = len(gold_seqs)

def compute_f1_scores(pred_pt, gold_pt):
    """
    Function to compute F1 scores with pred and gold pairs/triplets
    The input needs to be already processed
    """
    # number of true postive, gold standard, predicted aspect terms
    res = {}

    n_tp, n_gold, n_pred = 0, 0, 0

    for i in range(len(pred_pt)):
        gold_pt[i] = list(set(gold_pt[i]))
        pred_pt[i] = list(set(pred_pt[i]))

        n_gold += len(gold_pt[i])
        n_pred += len(pred_pt[i])

        for t in pred_pt[i]:
            if t in gold_pt[i]:
                n_tp += 1

    precision = float(n_tp) / float(n_pred) if n_pred != 0 else 0
    recall = float(n_tp) / float(n_gold) if n_gold != 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision != 0 or recall != 0 else 0
    scores = {'total precision': precision, 'total recall': recall, 'total f1': f1}

    return scores

def extract_spans_extraction(seq,label_len=2):
    extractions = []
    extractions_with_aspect = []
    all_pt = seq.split('; ')
    
    #print ("<<< all_pt", all_pt)
    for pt in all_pt:
        if label_len==4:
            try:
                a, b, c, d = pt[1:-1].split(', ')
            except ValueError:
                continue
        extractions.append((b,d))
        extractions_with_aspect.append((a,b,c,d))
    return extractions,extractions_with_aspect

for i in range(num_samples):
    gold_list,gold_extractions_with_aspect = extract_spans_extraction(gold_seqs[i],4)
    pred_list,pred_extractions_with_aspect = extract_spans_extraction(pred_seqs[i],4)

    all_labels.append(gold_list)
    all_predictions.append(pred_list)

    all_labels_with_tag.append(gold_extractions_with_aspect)
    all_predictions_with_tag.append(pred_extractions_with_aspect)
    
def get_subset(original_list, element):
    return [sublist for sublist in original_list if element in sublist[0]]

def calc_tag(element, all_predictions, all_labels):
    pred_pt_sub = [get_subset(original_list, element) for original_list in all_predictions]
    gold_pt_sub = [get_subset(original_list, element) for original_list in all_labels]
    score = compute_f1_scores(pred_pt_sub, gold_pt_sub)
    return score

def get_tags(x):
    y = [i[0] for i in x]
    return y 

def get_tag_list(all_labels):
    merged = [get_tags(x) for x in all_labels]
    flat_list = [item for sublist in merged for item in sublist]
    unique_elements = list(set(flat_list))
    return unique_elements

def get_res(tags):
    precisons = []
    recall = []
    f1 = []
    for i in tags:
        score = calc_tag(i, all_predictions, all_labels)
        precisons.append(score['total precision'])
        recall.append(score['total recall'])
        f1.append(score['total f1'])

    res = pd.DataFrame({'tag':tags, 'precision':precisons, 'recall':recall,'f1':f1})
    return res


tags = get_tag_list(all_labels)
res = get_res(tags)

In [9]:
res2 = pd.merge(res,summary[['label','pred_num_total']].drop_duplicates(),left_on='tag',right_on='label')

In [10]:
res2.head()

Unnamed: 0,tag,precision,recall,f1,label,pred_num_total
0,Relatives/friends/personal recommendation,0.0,0.0,0.0,Relatives/friends/personal recommendation,1
1,Portability,0.428571,0.75,0.545455,Portability,7
2,Charging quality,0.0,0.0,0.0,Charging quality,5
3,Battery cell,0.142857,0.5,0.222222,Battery cell,2
4,Compatibility/Adaptability,0.0,0.0,0.0,Compatibility/Adaptability,5
