In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.color_palette()
%config InlineBackend.figure_format='retina'

In [3]:
# Per-split labels (train, valid, test) and per-split lookup dicts, each stored
# as a pickled 3-tuple. The lookup values are later used as positions into the
# label arrays (see get_apn).
# NOTE(review): pickle.load can execute arbitrary code -- only load these files
# from a trusted source.
with open('../datasets/bm/labels.pkl', 'rb') as labels_file:
    y_train, y_valid, y_test = pickle.load(labels_file)
ytvs = y_train, y_valid, y_test

with open('../datasets/bm/img2idx.pkl', 'rb') as lookup_file:
    p2i_train, p2i_valid, p2i_test = pickle.load(lookup_file)

# Inverse lookups (value -> key) for each split.
i2p_train = {v: k for k, v in p2i_train.items()}
i2p_valid = {v: k for k, v in p2i_valid.items()}
i2p_test = {v: k for k, v in p2i_test.items()}

In [4]:
# Load the study results; the stored index column is read and then replaced by
# a fresh 0..n-1 index.
results_csv = '../datasets/bm_prolific_triplets/ab1-3.results.csv'
df = pd.read_csv(results_csv, index_col=0).reset_index(drop=True)


def _train_triplet_indices(row):
    """Translate every element of every triplet through the train lookup."""
    return np.array([[p2i_train[key] for key in trip] for trip in row])


def _anchor_triplet_indices(anchor_lookup):
    """Build a converter whose anchor uses `anchor_lookup`; the other two use train."""
    def convert(row):
        return np.array([
            [anchor_lookup[trip[0]], p2i_train[trip[1]], p2i_train[trip[2]]]
            for trip in row
        ])
    return convert


# NOTE(review): `eval` executes whatever text is stored in these CSV cells.
# It is only acceptable while the CSV is trusted; ast.literal_eval would be
# safer if the cells are plain literals -- confirm before switching.
df['ttt'] = df['trips_train'].apply(eval).apply(_train_triplet_indices)
df['vtt'] = df['trips_validtt'].apply(eval).apply(_anchor_triplet_indices(p2i_valid))
df['stt'] = df['trips_testtt'].apply(eval).apply(_anchor_triplet_indices(p2i_test))

In [5]:
def get_apn(trips, ytvs, fold):
    """Look up the labels of the anchor, positive and negative of each triplet.

    Parameters
    ----------
    trips : 2-D integer array indexed as ``trips[:, 0..2]``
        Column 0 holds anchor positions, column 1 positive positions and
        column 2 negative positions.
    ytvs : tuple
        ``(train_labels, valid_labels, test_labels)``.
    fold : str
        ``'validtt'``/``'vtt'`` take anchor labels from the validation labels,
        ``'testtt'``/``'stt'`` from the test labels; any other value falls
        back to the train labels.

    Returns
    -------
    tuple
        ``(a, p, n)`` label arrays. Positives and negatives are always looked
        up in the train labels.
    """
    train_labels, valid_labels, test_labels = ytvs

    if fold in ('validtt', 'vtt'):
        anchor_labels = valid_labels
    elif fold in ('testtt', 'stt'):
        anchor_labels = test_labels
    else:
        anchor_labels = train_labels

    anchors = np.take(anchor_labels, trips[:, 0])
    positives = np.take(train_labels, trips[:, 1])
    negatives = np.take(train_labels, trips[:, 2])
    return anchors, positives, negatives

def _safe_ratio(count, total):
    """Return ``count / total``, or NaN (without a NumPy warning) when ``total`` is 0."""
    if total == 0:
        return np.float64('nan')
    return count / total


def get_apn_detailed(a, p, n):
    """Classify triplets by how anchor/positive/negative labels agree.

    Parameters
    ----------
    a, p, n : array-like of equal length
        Labels of the anchor, positive and negative of each triplet
        (element-wise comparable, e.g. NumPy arrays).

    Returns
    -------
    dict
        ``'correct'``        mask: anchor matches positive but not negative.
        ``'correct_ratio'``  share of all triplets that are ``correct``.
        ``'clf'``            mask: anchor matches exactly one of positive/negative.
        ``'clf_acc'``        share of ``clf`` triplets that are ``correct``.
        ``'clf_ratio'``      share of all triplets that are ``clf``.
        ``'filtered'``       mask: all three equal, or ``correct``, or anchor
                             differs from a matching positive/negative pair.
        ``'filtered_ratio'`` share of all triplets that are ``filtered``.

        A ratio whose denominator is zero (no triplets, or no ``clf``
        triplets) is NaN; no RuntimeWarning is emitted for it.
    """
    apn = (a == p) & (a == n)    # all three labels agree
    ap_n = (a == p) & (a != n)   # anchor agrees with the positive only
    an_p = (a == n) & (a != p)   # anchor agrees with the negative only
    a_pn = (a != p) & (p == n)   # positive and negative agree, anchor differs

    clf = ap_n | an_p
    filtered = apn | ap_n | a_pn

    total = len(a)
    n_correct = ap_n.sum()
    n_clf = clf.sum()

    results = {
        'correct': ap_n,
        'correct_ratio': _safe_ratio(n_correct, total),
        'clf': clf,
        'clf_acc': _safe_ratio(n_correct, n_clf),
        'clf_ratio': _safe_ratio(n_clf, total),
        'filtered': filtered,
        'filtered_ratio': _safe_ratio(filtered.sum(), total),
    }
    return results

In [6]:
# Resolve (anchor, positive, negative) labels for every participant and fold.
# The triplet column name doubles as the fold identifier passed to get_apn.
for trip_col, apn_col in (('ttt', 'tapn'), ('vtt', 'vapn'), ('stt', 'sapn')):
    df[apn_col] = df[trip_col].apply(
        lambda trips, fold=trip_col: get_apn(trips, ytvs, fold=fold))

In [7]:
# Run get_apn_detailed once per participant and fold, then fan each entry of
# the resulting dicts out into its own '<fold>_<key>' column. (Previously the
# full breakdown was recomputed for every row once per key.)
apn_cols = ('tapn', 'vapn', 'sapn')
apn_details = {col: df[col].apply(lambda r: get_apn_detailed(*r)) for col in apn_cols}

results_keys = list(apn_details['tapn'].loc[0].keys())
for key in results_keys:
    # Inner loop order keeps the original column layout: t/v/s for each key.
    for col in apn_cols:
        df[col + '_' + key] = apn_details[col].apply(lambda detail: detail[key])

In [8]:
# Write the augmented frame to the working directory (row index omitted), then
# display its index and column names.
# NOTE(review): several columns hold NumPy arrays / tuples; to_csv presumably
# stores them as their string form, so they may not round-trip -- confirm
# before relying on this file as an input.
df.to_csv('bm_triplets.csv', index=False)
df.index, df.columns

(RangeIndex(start=0, stop=80, step=1),
 Index(['username', 'study_name', 'total_study_time', 'total_prediction_time',
        'median_prediction_time', 'prediction_time', 'attention_answers',
        'answers', 'question_images', 'trips_train', 'trips_validtt',
        'trips_testtt', 'ttt', 'vtt', 'stt', 'tapn', 'vapn', 'sapn',
        'tapn_correct', 'vapn_correct', 'sapn_correct', 'tapn_correct_ratio',
        'vapn_correct_ratio', 'sapn_correct_ratio', 'tapn_clf', 'vapn_clf',
        'sapn_clf', 'tapn_clf_acc', 'vapn_clf_acc', 'sapn_clf_acc',
        'tapn_clf_ratio', 'vapn_clf_ratio', 'sapn_clf_ratio', 'tapn_filtered',
        'vapn_filtered', 'sapn_filtered', 'tapn_filtered_ratio',
        'vapn_filtered_ratio', 'sapn_filtered_ratio'],
       dtype='object'))

In [9]:
from itertools import chain  # still referenced by later cells

# Stack the per-participant index arrays row-wise into one array per fold.
ttt = np.vstack(df['ttt'].tolist())
vtt = np.vstack(df['vtt'].tolist())
stt = np.vstack(df['stt'].tolist())

In [10]:
# Flat boolean mask per fold built from the 'filtered' entry of
# get_apn_detailed, in the same participant order as the stacked triplets.
# np.concatenate (instead of vstack + ravel) yields the same flat mask and also
# works when participants answered different numbers of triplets.
ttt_filter_msk, vtt_filter_msk, stt_filter_msk = (
    np.concatenate([get_apn_detailed(*apn)['filtered'] for apn in df[col]])
    for col in ('tapn', 'vapn', 'sapn')
)

In [11]:
# Keep only the triplets selected by the current masks and show the new shapes.
ttt_filtered = ttt[ttt_filter_msk]
vtt_filtered = vtt[vtt_filter_msk]
stt_filtered = stt[stt_filter_msk]
ttt_filtered.shape, vtt_filtered.shape, stt_filtered.shape

((2037, 3), (659, 3), (662, 3))

In [12]:
# Disabled export of the filtered triplets; uncomment to write them to disk.
# pickle.dump(ttt_filtered.tolist(),
#     open('../datasets/bm_prolific_triplets/train_triplets_filtered.pkl', 'wb'))
# pickle.dump(vtt_filtered.tolist(),
#     open('../datasets/bm_prolific_triplets/valid_triplets_filtered.pkl', 'wb'))
# pickle.dump(stt_filtered.tolist(),
#     open('../datasets/bm_prolific_triplets/test_triplets_filtered.pkl', 'wb'))

In [13]:
# Flat boolean mask per fold built from the 'correct' entry of
# get_apn_detailed (anchor label equals the positive's but not the negative's).
# np.concatenate (instead of vstack + ravel) yields the same flat mask and also
# works when participants answered different numbers of triplets.
ttt_filter_msk, vtt_filter_msk, stt_filter_msk = (
    np.concatenate([get_apn_detailed(*apn)['correct'] for apn in df[col]])
    for col in ('tapn', 'vapn', 'sapn')
)

In [14]:
# Re-select the triplets with the masks computed in the previous cell.
ttt_filtered = ttt[ttt_filter_msk]
vtt_filtered = vtt[vtt_filter_msk]
stt_filtered = stt[stt_filter_msk]
ttt_filtered.shape, vtt_filtered.shape, stt_filtered.shape

((845, 3), (275, 3), (278, 3))

In [15]:
# Flat boolean mask per fold built from the 'clf' entry of get_apn_detailed
# (anchor label equals exactly one of the positive's / negative's labels).
# np.concatenate (instead of vstack + ravel) yields the same flat mask and also
# works when participants answered different numbers of triplets.
ttt_filter_msk, vtt_filter_msk, stt_filter_msk = (
    np.concatenate([get_apn_detailed(*apn)['clf'] for apn in df[col]])
    for col in ('tapn', 'vapn', 'sapn')
)

In [16]:
# Re-select the triplets with the masks computed in the previous cell.
ttt_filtered = ttt[ttt_filter_msk]
vtt_filtered = vtt[vtt_filter_msk]
stt_filtered = stt[stt_filter_msk]
ttt_filtered.shape, vtt_filtered.shape, stt_filtered.shape

((1208, 3), (416, 3), (416, 3))