In [1]:
import ast
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import sklearn

In [2]:
# file path
# Both CSV files live in the same directory, relative to this notebook.
data_path = '../data/attn_weight'
single_file = f'{data_path}/v1.0_single_test_weight.csv'
multi_file = f'{data_path}/v1.1_test_weight.csv'

# read csv
# Load the two files in one pass; the frames are kept in separate names
# because every later cell refers to them individually.
single_df, multi_df = (
    pd.read_csv(csv_file) for csv_file in (single_file, multi_file)
)

In [3]:
# change data type
# These columns come back from ``pd.read_csv`` as strings and are parsed into
# Python objects here (presumably lists -- verify against the CSV writer).
columns = ['claim_tokenize', 'target_encode', 'claim_encode', 'claim_decode',
           'claim_lexicon', 'task_weight', 'shared_weight']


def _parse_literal(cell):
    """Parse a Python-literal string into its value; pass other values through.

    ``ast.literal_eval`` only accepts literals (lists, numbers, strings, ...),
    so unlike ``eval`` it cannot execute arbitrary code found in the file.
    Values that are not strings are returned unchanged, which makes re-running
    this cell on already-parsed frames a no-op instead of an error.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


for column in columns:
    # The original wrapped the single_df conversion in a bare ``except``,
    # presumably because some of these columns are absent from that file.
    # Skip only that case explicitly so real parsing errors are not hidden.
    if column in single_df.columns:
        single_df[column] = single_df[column].apply(_parse_literal)
    multi_df[column] = multi_df[column].apply(_parse_literal)

In [4]:
# define compare function
def compare_result(single_df, multi_df,
                   single_correct=True, multi_correct=True):
    """Return the index labels on which both models show the requested outcome.

    Args:
        single_df: frame with 'label_encode' and 'label_pred' columns.
        multi_df: frame with 'label_encode' and 'label_pred' columns.
        single_correct: if truthy keep rows of ``single_df`` whose prediction
            equals the label, otherwise keep rows where they differ.
        multi_correct: same selection rule, applied to ``multi_df``.

    Returns:
        The intersection of the two selected row indexes.
    """
    def _rows_with_outcome(frame, want_correct):
        # Index of the rows whose prediction does / does not match the label.
        gold, guess = frame['label_encode'], frame['label_pred']
        mask = (gold == guess) if want_correct else (gold != guess)
        return frame[mask].index

    single_rows = _rows_with_outcome(single_df, single_correct)
    multi_rows = _rows_with_outcome(multi_df, multi_correct)
    return single_rows.intersection(multi_rows)

In [5]:
# get compare index
# Naming: first letter is the single model, second letter is the multi model;
# T = prediction equals the label, F = prediction differs from the label.
TT_index, TF_index, FT_index, FF_index = (
    compare_result(single_df, multi_df, single_ok, multi_ok)
    for single_ok in (True, False)
    for multi_ok in (True, False)
)

# The four groups must account for every row of single_df.
group_sizes = [
    len(group) for group in (TT_index, TF_index, FT_index, FF_index)
]
assert len(single_df) == sum(group_sizes)

print(*('TT_index', 'TF_index', 'FT_index', 'FF_index'), sep='\t')
print(*group_sizes, sep='\t\t')

TT_index	TF_index	FT_index	FF_index
518		162		287		282


In [267]:
# define sample function
def _peak_inner_weight(row):
    """Return the largest 'task_weight' of ``row``, skipping both end positions.

    The first and last positions (within the length of 'claim_decode') are
    excluded -- presumably the begin/end markers; confirm against the
    tokenizer. Returns ``-inf`` when nothing is left after excluding them, so
    such rows never pass a threshold instead of crashing ``max``.
    """
    seq_len = len(row['claim_decode'])
    inner_weights = row['task_weight'][1:seq_len - 1]
    return max(inner_weights, default=float('-inf'))


def sample_data(single_df, multi_df, all_index, target=None, min_weight=0.1):
    """Print one randomly chosen example with both models' attention weights.

    A row is drawn at random from ``all_index`` (optionally restricted to one
    target) among the rows whose single-model task weight exceeds
    ``min_weight`` on at least one non-boundary token.

    Args:
        single_df: single-model results; needs the columns 'ID', 'target',
            'claim', 'claim_decode', 'task_weight', 'label_encode' and
            'label_pred'.
        multi_df: multi-model results; needs 'claim_decode', 'task_weight',
            'shared_weight' and 'label_pred'. It is looked up with the same
            index labels as ``single_df``.
        all_index: pandas Index of row labels to sample from (for example the
            result of ``compare_result``).
        target: if given, only rows whose 'target' equals this value are used.
        min_weight: a row qualifies only if its peak inner task weight is
            strictly greater than this value.

    Returns:
        None. The example is written to stdout.

    Raises:
        ValueError: if no candidate row qualifies. The previous implementation
            looped forever in that situation.
    """
    # Rows are addressed by label (.loc): ``all_index`` holds index labels,
    # which only coincide with positions for a default RangeIndex.
    if target is None:
        candidates = all_index.tolist()
    else:
        # Build the mask from the subset itself so that mask and frame are
        # aligned (masking with the full-frame Series forces a reindex).
        subset = single_df.loc[all_index]
        candidates = subset[subset['target'] == target].index.tolist()

    # Visit the candidates in random order and stop at the first qualifying
    # row: same uniform choice as rejection sampling, but always terminates.
    for index in random.sample(candidates, len(candidates)):
        if _peak_inner_weight(single_df.loc[index]) > min_weight:
            break
    else:
        raise ValueError(
            'no candidate row has a non-boundary task weight above '
            f'{min_weight}')

    # get sample data, restricted to the columns we need
    single_data = single_df.loc[index][['ID', 'target', 'claim',
                                        'claim_decode', 'task_weight',
                                        'label_encode', 'label_pred']]
    multi_data = multi_df.loc[index][['claim_decode', 'task_weight',
                                      'shared_weight', 'label_pred']]

    # print single result
    print(f'Target: {single_data["target"]}')
    print(f'ID: {single_data["ID"]}, Claim:  {single_data["claim"]}\n')

    print(f'Single model - label: {single_data["label_encode"]}, '
          f'predict: {single_data["label_pred"]}\n')
    print("{0:15} {1}".format('token', 'task_weight'))
    for token, weight in zip(single_data['claim_decode'],
                             single_data['task_weight']):
        # NOTE(review): hard-coded special case kept from the original; this
        # one token is displayed as '[unk]' in the single-model listing only.
        token = '[unk]' if token == 'chasingice' else token
        print("{0:15} {1:.10f}".format(token, weight))

    # print multi result (the gold label is taken from the single-model row)
    print(f'\nTarget: {single_data["target"]}')
    print(f'ID: {single_data["ID"]}, Claim:  {single_data["claim"]}\n')

    print(f'Multi model - label: {single_data["label_encode"]}, '
          f'predict: {multi_data["label_pred"]}\n')
    print("{0:15} {1:12} {2}".format('token', 'task_weight', 'shared_weight'))
    for token, task_weight, shared_weight in zip(multi_data['claim_decode'],
                                                 multi_data['task_weight'],
                                                 multi_data['shared_weight']):
        print("{0:15} {1:.10f} {2:.10f}".format(token, task_weight, shared_weight))

In [268]:
# specify all the targets
# One entry per line; these are values that can be passed as the ``target``
# argument of ``sample_data``.
targets = [
    'Atheism',
    'Climate Change is a Real Concern',
    'Feminist Movement',
    'Hillary Clinton',
    'Legalization of Abortion',
]

In [333]:
# sample single correct, multi incorrect data (TF_index)
sample_data(single_df, multi_df, TF_index)

Target: Feminist Movement
ID: 10582, Claim:  I hate it when ignorant losers say "another feminist cured." I don't need to be cured for wanting to be treated like a person. #SemST

Single model - label: 0, predict: 0

token           task_weight
[bos]           0.0751995221
i               0.3992257118
hate            0.2179879248
it              0.0693770871
when            0.0318396874
ignorant        0.0164023079
losers          0.0087089632
say             0.0267525520
[unk]           0.0053545325
feminist        0.0149391228
[unk]           0.0022944261
i               0.0050446549
[unk]           0.0029496253
need            0.0265847184
to              0.0051019341
be              0.0011549945
[unk]           0.0002179704
for             0.0008856706
wanting         0.0002687991
to              0.0001517127
be              0.0000688030
treated         0.0000219532
like            0.0000079527
a               0.0000417886
[unk]           0.0000167918
[unk]           0.0000443778
[