# Investigate Atlas predictions

Rudimentary check: If any of the answer options has been generated by the unconstrained decoding, that should be the model answer. 

In [1]:
from pararel.consistency.utils import read_jsonl_file
import re

In [2]:
atlas_preds_folder = "/cephyr/users/lovhag/Alvis/projects/atlas/pararel-eval-zero-shot-base-no-space-likelihood-no-eos-with-3/"
pararel_data_folder = "/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas_no_space"

In [3]:
atlas_preds_file = "/cephyr/users/lovhag/Alvis/projects/atlas/data/experiments/pararel-eval-baseline-t5-no-space-likelihood-no-eos-with-3/P36-1116812/P36-step-None.jsonl"
pararel_options_file = "/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas_no_space/P36_options.txt"

data = read_jsonl_file(atlas_preds_file)

with open(pararel_options_file) as f:
    options = [line.strip() for line in f.readlines()]

In [66]:
# given greedy and decoded predictions
error_ixs = []
for ix, entry in enumerate(data):
    option_in_generation = False
    option_also_in_constrained_generation = False
    # do not investigate non-erroneous cases further
    if entry["generation_by_choice"] == entry["answers"][0]:
        continue
    # do not investigate cases for which the free generation doesn't contain the right answer
    if not entry["answers"][0] in re.split(' |\.|!|\?', entry["generation"]):
        continue
    for option in options:
        if option in re.split(' |\.|!|\?', entry["generation"]):
            option_in_generation = True
            if option in [entry["generation_by_choice"]]:
                option_also_in_constrained_generation = True
    if option_in_generation and not option_also_in_constrained_generation:
        error_ixs += [ix]
print(f"Found {len(error_ixs)} mismatching and erroneous generations out of {len(data)} predictions")

Found 9 mismatching and erroneous generations out of 6398 predictions


In [61]:
options.sort()
options

['Aberdeen',
 'Adelaide',
 'Aden',
 'Albany',
 'Albuquerque',
 'Aleppo',
 'Alexandria',
 'Amsterdam',
 'Ankara',
 'Arnold',
 'Athens',
 'Atlanta',
 'Auburn',
 'Auckland',
 'Augusta',
 'Austin',
 'Babylon',
 'Baghdad',
 'Bangalore',
 'Bangkok',
 'Barcelona',
 'Batman',
 'Beijing',
 'Beirut',
 'Belfast',
 'Berlin',
 'Bern',
 'Boise',
 'Bolton',
 'Boone',
 'Boston',
 'Boulder',
 'Brisbane',
 'Bristol',
 'Brunswick',
 'Brussels',
 'Bryan',
 'Budapest',
 'Burlington',
 'Burns',
 'Butler',
 'Cairo',
 'Cambridge',
 'Camden',
 'Canadian',
 'Canberra',
 'Cardiff',
 'Center',
 'Charleston',
 'Chennai',
 'Cherokee',
 'Chester',
 'Chicago',
 'Circle',
 'Clayton',
 'Cleveland',
 'Columbia',
 'Columbus',
 'Concord',
 'Constantine',
 'Constantinople',
 'Conway',
 'Copenhagen',
 'Cork',
 'Craig',
 'Crosby',
 'Crowley',
 'Damascus',
 'Darwin',
 'Davis',
 'Delhi',
 'Denver',
 'Douglas',
 'Dover',
 'Dresden',
 'Dubai',
 'Dublin',
 'Dudley',
 'Durham',
 'Eagle',
 'Eaton',
 'Edinburgh',
 'Edmonton',
 'Eliz

Check the error cases

In [67]:
for error_ix in error_ixs:
    print(f"query: {data[error_ix]['query']}")
    print(f"answers: {data[error_ix]['answers']}")
    print(f"generation: {data[error_ix]['generation']}")
    print(f"generation_by_choice: {data[error_ix]['generation_by_choice']}")
    print("--------------------------------------------")

query: The capital city of Kingdom of Scotland is<extra_id_0>.
answers: ['Edinburgh']
generation: The capital city of Kingdom of Scotland is located in the city of Edinburgh. The capital city of Kingdom of Scotland is located in the city of Edinburgh. The
generation_by_choice: Glasgow
--------------------------------------------
query: The capital city of Syria is<extra_id_0>.
answers: ['Damascus']
generation: The capital city of Syria is located in the city of Damascus. The capital city of Syria is located in the city of Damas
generation_by_choice: Istanbul
--------------------------------------------
query: The capital of Ankara Province is<extra_id_0> .
answers: ['Ankara']
generation: The capital of Ankara Province is located in the province of Ankara. The capital of Ankara Province is located in the province of An
generation_by_choice: Istanbul
--------------------------------------------
query: The capital city of Second Polish Republic is<extra_id_0>.
answers: ['Warsaw']
generati

## Check passage retrieval

In [125]:
import pandas as pd
import pickle
import numpy as np
import os
from pararel.consistency.utils import read_jsonl_file
import torch
from torch import nn

from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
from nltk import word_tokenize

In [2]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /cephyr/users/lovhag/Alvis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /cephyr/users/lovhag/Alvis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Elementwise analysis

In [3]:
def get_options(options_file):
    options = []
    with open(options_file, "r") as f:
        for line in f.readlines():
            options.append(line.strip())
    return options

In [63]:
#lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def get_num_passages_w_obj(passages, obj):
    obj_stem = stemmer.stem(obj.lower())
    #print(obj_stem)
    #print(passages[0]['text'])
    return sum([obj_stem in passage['text'].lower() for passage in passages])

def get_passage_obj_freq(passages, obj):
    text = (" ").join([passage['text'].lower() for passage in passages])
    return text.count(stemmer.stem(obj.lower()))

def get_title_dispersion(passages):
    titles = [passage["title"].split(":")[0] for passage in passages]
    return len(set(titles))

def get_freq_answer(passages, options):
    opt_count = []
    for opt in options:
        opt_count += [get_passage_obj_freq(passages, opt)] # stemmer cannot handle e.g. 'physicist'
    return options[np.argmax(opt_count)]

def get_answer_freq_rank(passages, answer, options):
    ans_ix = options.index(answer)
    opt_count = []
    for opt in options:
        opt_count += [get_passage_obj_freq(passages, opt)] # stemmer cannot handle e.g. 'physicist'
    return len(options) - np.where(np.argsort(opt_count)==ans_ix)[0][0]

**Similarity per answer frequencies**

* What if we had a model that simply predicted the most frequently occurring answer alternative in the retrieval?
* Consistency of this?
* Count answer alternatives in passages!

* Or, for more granularity, as rank?

In [37]:
atlas_preds_file = "/cephyr/users/lovhag/Alvis/projects/atlas/data/experiments/pararel-eval-zero-shot-base-no-space-likelihood-no-eos-with-3/P101-base-2017-1115964/P101-step-0.jsonl"
#atlas_preds_file = "/cephyr/users/lovhag/Alvis/projects/atlas/data/experiments/pararel-eval-zero-shot-base-no-space-likelihood-no-eos-with-3/P276-base-2017-1115948/P276-step-0.jsonl"

r_file = "/cephyr/users/lovhag/Alvis/projects/atlas/data/experiments/pararel-compute-r-embeddings-base/P101-2017-1143962/P101-step-0-r-embedding"
r_embeddings_file = r_file + ".pt"
r_info_file = r_file + ".jsonl"

options_file = "/cephyr/users/lovhag/Alvis/projects/pararel/data/all_n1_atlas_no_space/P101_options.txt"

In [62]:
options = get_options(options_file)
data = read_jsonl_file(atlas_preds_file)

# for retriever embedding analysis
r_embeddings = torch.load(r_embeddings_file)
r_info = pd.DataFrame(read_jsonl_file(r_info_file))

assert r_embeddings.shape[0]==len(r_info)
r_info.head()

Unnamed: 0,query,answers,metadata,sub_label,pattern
0,The expertise of John Vincent Atanasoff is<ext...,[mathematics],{},John Vincent Atanasoff,The expertise of [X] is [Y].
1,<extra_id_0> is the domain of activity of John...,[mathematics],{},John Vincent Atanasoff,[Y] is the domain of activity of [X].
2,John Vincent Atanasoff's expertise is<extra_id...,[mathematics],{},John Vincent Atanasoff,[X]'s expertise is [Y].
3,<extra_id_0> is the specialization of Edward B...,[anthropology],{},Edward Burnett Tylor,[Y] is the specialization of [X].
4,The domain of work of Edward Burnett Tylor is<...,[anthropology],{},Edward Burnett Tylor,The domain of work of [X] is [Y].


In [64]:
pd_data = pd.DataFrame(data)
pd_data["title_dispersion"] = pd_data.passages.apply(get_title_dispersion)
pd_data["num_psgs_w_obj"] = pd_data.apply(lambda row: get_num_passages_w_obj(row.passages, row.answers[0]), axis=1)
pd_data["num_psgs_w_pred"] = pd_data.apply(lambda row: get_num_passages_w_obj(row.passages, row.generation_by_choice), axis=1)
pd_data["correct"] = pd_data.apply(lambda row: row.answers[0]==row.generation_by_choice, axis=1)
pd_data["found_obj"] = pd_data.num_psgs_w_obj.apply(lambda val: val>0)
pd_data["gold_freq_rank"] = pd_data.apply(lambda row: get_answer_freq_rank(row.passages, row.answers[0], options), axis=1)
pd_data["freq_answer"] = pd_data.passages.apply(lambda val: get_freq_answer(val, options))
pd_data["freq_answer_correct"] = pd_data.apply(lambda row: row.answers[0]==row.freq_answer, axis=1)
pd_data["picked_freq_answer"] = pd_data.apply(lambda row: row.generation_by_choice==row.freq_answer, axis=1)
pd_data["generation_freq_rank"] = pd_data.apply(lambda row: get_answer_freq_rank(row.passages, row.generation_by_choice, options), axis=1)

pd_data.head()

Unnamed: 0,query,answers,generation,generation_by_choice,passages,metadata,sub_label,pattern,title_dispersion,num_psgs_w_obj,num_psgs_w_pred,correct,found_obj,gold_freq_rank,freq_answer,freq_answer_correct,picked_freq_answer,generation_freq_rank
0,The expertise of John Vincent Atanasoff is<ext...,[mathematics],reflected in his work,astronomy,"[{'id': 'infobox-3114414', 'title': 'John Vinc...",{},John Vincent Atanasoff,The expertise of [X] is [Y].,9,4,1,False,True,3,physics,False,False,19
1,<extra_id_0> is the domain of activity of John...,[mathematics],physics of the computer. It physics he founded...,physics,"[{'id': 'infobox-3114414', 'title': 'John Vinc...",{},John Vincent Atanasoff,[Y] is the domain of activity of [X].,11,4,7,False,True,4,science,False,False,1
2,John Vincent Atanasoff's expertise is<extra_id...,[mathematics],in computer science computer science in comput...,physics,"[{'id': 'infobox-3114414', 'title': 'John Vinc...",{},John Vincent Atanasoff,[X]'s expertise is [Y].,9,3,5,False,True,3,science,False,False,2
3,<extra_id_0> is the specialization of Edward B...,[anthropology],anthropology of James George Frazer of,anthropology,"[{'id': 'infobox-3003644', 'title': 'Edward Bu...",{},Edward Burnett Tylor,[Y] is the specialization of [X].,12,11,11,True,True,1,anthropology,True,True,1
4,The domain of work of Edward Burnett Tylor is<...,[anthropology],anthropology anthropology anthropology anthrop...,anthropology,"[{'id': 'infobox-3003644', 'title': 'Edward Bu...",{},Edward Burnett Tylor,The domain of work of [X] is [Y].,10,15,15,True,True,1,anthropology,True,True,1


In [65]:
pd_data.correct.value_counts()

True     3734
False    3013
Name: correct, dtype: int64

In [66]:
pd_data.groupby(["correct"]).mean()

Unnamed: 0_level_0,title_dispersion,num_psgs_w_obj,num_psgs_w_pred,found_obj,gold_freq_rank,freq_answer_correct,picked_freq_answer,generation_freq_rank
correct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,15.575838,3.346499,2.547959,0.757385,18.269167,0.151012,0.114172,20.396615
True,15.312801,5.089448,5.089448,0.861007,8.662292,0.307445,0.307445,8.662292


Get a consistency measure by checking how often the same prediction is made for other paraphrases

In [67]:
pred_rates = {}
for subject in pd_data.sub_label.unique():
    pred_counts = pd_data[pd_data.sub_label==subject].generation_by_choice.value_counts(normalize=True)
    for pred, count in pred_counts.items():
        pred_rates[(subject, pred)] = count
        
pd_data["pred_rate"] = pd_data.apply(lambda row: pred_rates[(row.sub_label, row.generation_by_choice)], axis=1)
pd_data.head()

Unnamed: 0,query,answers,generation,generation_by_choice,passages,metadata,sub_label,pattern,title_dispersion,num_psgs_w_obj,num_psgs_w_pred,correct,found_obj,gold_freq_rank,freq_answer,freq_answer_correct,picked_freq_answer,generation_freq_rank,pred_rate
0,The expertise of John Vincent Atanasoff is<ext...,[mathematics],reflected in his work,astronomy,"[{'id': 'infobox-3114414', 'title': 'John Vinc...",{},John Vincent Atanasoff,The expertise of [X] is [Y].,9,4,1,False,True,3,physics,False,False,19,0.076923
1,<extra_id_0> is the domain of activity of John...,[mathematics],physics of the computer. It physics he founded...,physics,"[{'id': 'infobox-3114414', 'title': 'John Vinc...",{},John Vincent Atanasoff,[Y] is the domain of activity of [X].,11,4,7,False,True,4,science,False,False,1,0.923077
2,John Vincent Atanasoff's expertise is<extra_id...,[mathematics],in computer science computer science in comput...,physics,"[{'id': 'infobox-3114414', 'title': 'John Vinc...",{},John Vincent Atanasoff,[X]'s expertise is [Y].,9,3,5,False,True,3,science,False,False,2,0.923077
3,<extra_id_0> is the specialization of Edward B...,[anthropology],anthropology of James George Frazer of,anthropology,"[{'id': 'infobox-3003644', 'title': 'Edward Bu...",{},Edward Burnett Tylor,[Y] is the specialization of [X].,12,11,11,True,True,1,anthropology,True,True,1,1.0
4,The domain of work of Edward Burnett Tylor is<...,[anthropology],anthropology anthropology anthropology anthrop...,anthropology,"[{'id': 'infobox-3003644', 'title': 'Edward Bu...",{},Edward Burnett Tylor,The domain of work of [X] is [Y].,10,15,15,True,True,1,anthropology,True,True,1,1.0


In [68]:
pd_data.mean()

title_dispersion        15.430265
num_psgs_w_obj           4.311101
num_psgs_w_pred          3.954498
correct                  0.553431
found_obj                0.814732
gold_freq_rank          12.952423
freq_answer_correct      0.237587
picked_freq_answer       0.221135
generation_freq_rank    13.902475
pred_rate                0.580760
dtype: float64

In [69]:
bins = [0, 0.2, 0.5, 0.8, 1]
agg_method = {col: ['mean'] for col in pd_data.columns if pd_data[col].dtype.kind in 'biufc'}
agg_method["pred_rate"] = agg_method["pred_rate"]+['count']
pd_data.groupby(pd.cut(pd_data.pred_rate, bins)).agg(agg_method)

Unnamed: 0_level_0,title_dispersion,num_psgs_w_obj,num_psgs_w_pred,correct,found_obj,gold_freq_rank,freq_answer_correct,picked_freq_answer,generation_freq_rank,pred_rate,pred_rate
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,count
pred_rate,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
"(0.0, 0.2]",15.943985,3.209366,1.689624,0.061524,0.708907,18.022957,0.157943,0.057851,25.015611,0.114078,1089
"(0.2, 0.5]",15.59478,3.843407,2.284341,0.228709,0.821429,13.309753,0.183379,0.07761,20.458791,0.312658,1456
"(0.5, 0.8]",15.407651,4.816461,5.467543,0.734544,0.8551,12.488408,0.280526,0.342736,7.676971,0.696885,2588
"(0.8, 1.0]",14.971499,4.666047,4.563197,0.887856,0.815366,9.952912,0.271375,0.265799,10.472119,0.951292,1614


### Pairwise analysis

In [80]:
from scipy.special import rel_entr
from scipy.spatial.distance import cosine
from scipy.stats import chisquare
import math

In [104]:
def get_passage_title(passage):
    return passage["title"].split(":")[0]
    
def get_num_passages_overlap(passages_1, passages_2):
    #ids_1 = {passage["id"] for passage in passages_1}
    titles_1 = [get_passage_title(passage) for passage in passages_1]
    #ids_2 = {passage["id"] for passage in passages_2}
    titles_2 = [get_passage_title(passage) for passage in passages_2]
    overlapping_titles = set(titles_1) & set(titles_2)
    num_overlap_titles = sum([titles_1.count(title)+titles_2.count(title) for title in overlapping_titles])
    return num_overlap_titles/(len(titles_1)+len(titles_2))

def get_passages_position_overlap(passages_1, passages_2, pos):
    return get_passage_title(passages_1[pos]) == get_passage_title(passages_2[pos])

#def get_options_distribution_overlap(passages_1, passages_2, options):
#    eps = 1e-5
#    opt_count_1 = []
#    opt_count_2 = []
#    for opt in options:
#        opt_count_1 += [get_num_passages_w_obj(passages_1, opt)+eps] # stemmer cannot handle e.g. 'physicist'
#        opt_count_2 += [get_num_passages_w_obj(passages_2, opt)+eps]
#    opt_count_1 = np.array(opt_count_1)/sum(opt_count_1)
#    opt_count_2 = np.array(opt_count_2)/sum(opt_count_2)
#    
#    return math.exp(-sum(rel_entr(opt_count_1, opt_count_2)))

def get_options_distribution_kl(passages_1, passages_2, options):
    eps = 1e-5
    opt_count_1 = []
    opt_count_2 = []
    for opt in options:
        opt_count_1 += [get_passage_obj_freq(passages_1, opt)+eps] # stemmer cannot handle e.g. 'physicist'
        opt_count_2 += [get_passage_obj_freq(passages_2, opt)+eps]
    opt_count_1 = np.array(opt_count_1)/sum(opt_count_1)
    opt_count_2 = np.array(opt_count_2)/sum(opt_count_2)
    
    return math.exp(-sum(rel_entr(opt_count_1, opt_count_2)))

def get_options_distribution_chi2(passages_1, passages_2, options):
    opt_count_1 = []
    opt_count_2 = []
    for opt in options:
        opt_count_1 += [get_passage_obj_freq(passages_1, opt)+1] # stemmer cannot handle e.g. 'physicist'
        opt_count_2 += [get_passage_obj_freq(passages_2, opt)+1]
    
    #probability that similarity is not due to chance
    return chisquare(opt_count_2, f_exp=opt_count_1)[1]

# maybe not a good metric to compare vectors?
def get_options_distribution_similarity(passages_1, passages_2, options):
    opt_count_1 = []
    opt_count_2 = []
    for opt in options:
        opt_count_1 += [get_num_passages_w_obj(passages_1, opt)] # stemmer cannot handle e.g. 'physicist'
        opt_count_2 += [get_num_passages_w_obj(passages_2, opt)]
    opt_count_1 = np.array(opt_count_1)/sum(opt_count_1)
    opt_count_2 = np.array(opt_count_2)/sum(opt_count_2)
    
    return 1-cosine(opt_count_1, opt_count_2)

### From examples

In [10]:
passages_1 = pd_data.passages.iloc[0]
passages_2 = pd_data.passages.iloc[1]

print(passages_1[0])
print("---------------")
print(passages_2[0])

{'id': 'infobox-3114414', 'title': 'John Vincent Atanasoff', 'text': 'infobox name: John Vincent Atanasoff ; box_width: 13*10 ; image: John Atanasov.gif ; caption: Atanasoff, in the 1990s. ; birth_date: October 4, 1903 ; birth_place: Hamilton, New York, U.S. ; death_date: June 15, 1995 ; death_place: Frederick, Maryland, U.S. ; citizenship: American ; field: Physics ; doctoral_advisor: J. H. V. Vleck ; known_for: Atanasoff–Berry Computer ; prizes: Order of Saints Cyril and Methodius, First Class'}
---------------
{'id': 'infobox-3114414', 'title': 'John Vincent Atanasoff', 'text': 'infobox name: John Vincent Atanasoff ; box_width: 13*10 ; image: John Atanasov.gif ; caption: Atanasoff, in the 1990s. ; birth_date: October 4, 1903 ; birth_place: Hamilton, New York, U.S. ; death_date: June 15, 1995 ; death_place: Frederick, Maryland, U.S. ; citizenship: American ; field: Physics ; doctoral_advisor: J. H. V. Vleck ; known_for: Atanasoff–Berry Computer ; prizes: Order of Saints Cyril and Met

**Title overlap**

In [11]:
get_num_passages_overlap(passages_1, passages_2)

0.75

In [12]:
titles_1 = [get_passage_title(passage) for passage in passages_1]
titles_2 = [get_passage_title(passage) for passage in passages_2]
print(set(titles_1) & set(titles_2))
print(set(titles_1) ^ set(titles_2))

{'The Man Who Invented the Computer', 'June 15', 'Clifford Berry', 'Arthur Burks', 'John Vincent Atanasoff'}
{'Tito &amp; Tarantula', 'Mulberry High School (Florida)', 'Boyadzhik', 'Meanings of minor planet names', 'Paul Vitanyi', 'October 4', 'Atanasov', 'List of University of Florida alumni', 'New Market, Maryland', 'Computer engineering'}


**Lexical n-gram overlap**

In [130]:
stemmer_ = SnowballStemmer(language='english')

stemmer_.stem("physic")

'physic'

In [13]:
stemmer = PorterStemmer()

def get_stem(word):
    return stemmer.stem(word.lower())

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def get_lexical_overlap(passages_1, passages_2, ngram_range=(1,3), preprocessor=None):
    vectorizer = CountVectorizer(lowercase=True, analyzer='word', stop_words='english', ngram_range=ngram_range, min_df=1, preprocessor=preprocessor)

    X = vectorizer.fit_transform([passages_1, passages_2])
    overlap_count = np.take(X[0].A, X[1].nonzero()[1]).sum()+np.take(X[1].A, X[0].nonzero()[1]).sum()

    return overlap_count/X.sum()

Same subject

In [17]:
passages_1 = pd_data.passages.iloc[0]
passages_2 = pd_data.passages.iloc[1]

passages_1_text = (" ".join([passage["text"] for passage in passages_1]))
passages_2_text = (" ".join([passage["text"] for passage in passages_2]))

val = get_lexical_overlap(passages_1_text, passages_2_text, ngram_range=(1,3))
print(val)

0.6995133819951338


Not same subject

In [20]:
passages_1 = pd_data.passages.iloc[0]
passages_2 = pd_data.passages.iloc[-1]

passages_1_text = " ".join([passage["text"] for passage in passages_1])
passages_2_text = " ".join([passage["text"] for passage in passages_2])

val = get_lexical_overlap(passages_1_text, passages_2_text, ngram_range=(1,3), preprocessor=None)#get_stem)
print(val)

0.07673073917799657


**Retriever embedding similarity**

In [47]:
nn.functional.cosine_similarity(r_embeddings[0], r_embeddings[1], dim=0)

tensor(0.7774)

In [None]:
nn.functional.cosine_similarity(r_embeddings[0], r_embeddings[100], dim=0)

Assumes that r_embeddings and r_info have already been loaded

In [119]:
def get_r_emb_sim(entry_1, entry_2):
    subject = entry_1.sub_label
    r_emb_1_ix = r_info[(r_info.sub_label==subject) & (r_info.pattern==entry_1.pattern)].iloc[0].name
    r_emb_2_ix = r_info[(r_info.sub_label==subject) & (r_info.pattern==entry_2.pattern)].iloc[0].name
    
    return nn.functional.cosine_similarity(r_embeddings[r_emb_1_ix], r_embeddings[r_emb_2_ix], dim=0).item()

Test options distribution metric

In [76]:
passages_1 = pd_data.passages.iloc[0]
passages_2 = pd_data.passages.iloc[10]

opt_count_1 = []
opt_count_2 = []
for opt in options:
    opt_count_1 += [get_passage_obj_freq(passages_1, opt)] # stemmer cannot handle e.g. 'physicist'
    opt_count_2 += [get_passage_obj_freq(passages_2, opt)]
    
sum(opt_count_2)

104

Test options distribution chi2

In [110]:
passages_1 = pd_data.passages.iloc[0]
passages_2 = pd_data.passages.iloc[4]

get_options_distribution_chi2(passages_1, passages_2, options)

0.0

### Full analysis

In [120]:
def get_comp_metrics(entry_1, entry_2):
    obj = entry_1.answers[0]
    psgs_overlap = get_num_passages_overlap(entry_1.passages, entry_2.passages)
    psgs_text_1 = " ".join([passage["text"] for passage in entry_1.passages])
    psgs_text_2 = " ".join([passage["text"] for passage in entry_2.passages])

    dict_entry = {"subject": subject,
                      "gold_obj": obj,
                      "pattern_1": pattern, 
                      "pattern_2": other_pattern, 
                      "answer_1": entry_1.generation_by_choice, 
                      "answer_2": entry_2.generation_by_choice, 
                      "same_answer": entry_1.generation_by_choice==entry_2.generation_by_choice, 
                      "avg_generation_freq_rank": (entry_1.generation_freq_rank+entry_2.generation_freq_rank)/2,
                      #"max_title_dispersion": max(entry_1.title_dispersion, entry_2.title_dispersion),
                      #"both_found_obj": entry_1.found_obj and entry_2.found_obj,
                      "num_psgs_overlap": psgs_overlap,
                      #"lexical_overlap": get_lexical_overlap(psgs_text_1, psgs_text_2, (1,3)),
                      "faithfulness_to_freq": (entry_1.picked_freq_answer + entry_2.picked_freq_answer)/2,
                      "options_distrib_chi2": get_options_distribution_chi2(entry_1.passages, entry_2.passages, options),
                      "options_distrib_kl": get_options_distribution_kl(entry_1.passages, entry_2.passages, options),
                      "r_emb_similarity": get_r_emb_sim(entry_1, entry_2),
                      #"retriever_similarity": get_avg_similarity(entry_1.passages, entry_2.passages)
                      #"min_1_psg_overlap": psgs_overlap > 0,
                      #"min_2_psg_overlap": psgs_overlap > 1,
                      #"min_5_psg_overlap": psgs_overlap > 4,
                      #"min_10_psg_overlap": psgs_overlap > 9,
                      #"min_15_psg_overlap": psgs_overlap > 14,
                      #"min_num_psgs_w_obj": min(entry_1.num_psgs_w_obj, entry_2.num_psgs_w_obj),
                      #"min_num_psgs_w_pred": min(entry_1.num_psgs_w_pred, entry_2.num_psgs_w_pred),
                      #"num_psgs_w_obj_1": entry_1.num_psgs_w_obj, 
                      #"num_psgs_w_obj_2": entry_2.num_psgs_w_obj,
                      #"num_psgs_w_pred_1": entry_1.num_psgs_w_pred, 
                      #"num_psgs_w_pred_2": entry_2.num_psgs_w_pred,
                      #"psgs_title_dispersion_1": entry_1.title_dispersion,
                      #"psgs_title_dispersion_2": entry_2.title_dispersion}
                 }
    return dict_entry

In [121]:
comp_data = []

ix = 0
for subject in pd_data.sub_label.unique():
    ix += 1
    available_patterns = pd_data.pattern.unique()
    subj_data = pd_data[pd_data.sub_label==subject]
    for i, pattern in enumerate(available_patterns):
        for other_pattern in available_patterns[i+1:]:
            entry_1 = subj_data[subj_data.pattern==pattern]
            assert len(entry_1)==1
            entry_1 = entry_1.iloc[0]
            entry_2 = subj_data[subj_data.pattern==other_pattern]
            assert len(entry_2)==1
            entry_2 = entry_2.iloc[0]
            
            dict_entry = get_comp_metrics(entry_1, entry_2)
            #for pos in range(20):
            #    dict_entry[f"same_title_pos_{pos}"] = get_passages_position_overlap(entry_1.passages, entry_2.passages, pos)
            comp_data.append(dict_entry)
            
comp_data = pd.DataFrame(comp_data)
comp_data.head()

Unnamed: 0,subject,gold_obj,pattern_1,pattern_2,answer_1,answer_2,same_answer,avg_generation_freq_rank,num_psgs_overlap,faithfulness_to_freq,options_distrib_chi2,options_distrib_kl,r_emb_similarity
0,John Vincent Atanasoff,mathematics,The expertise of [X] is [Y].,[Y] is the domain of activity of [X].,astronomy,physics,False,10.0,0.75,0.0,1.0,0.337887,0.777387
1,John Vincent Atanasoff,mathematics,The expertise of [X] is [Y].,[X]'s expertise is [Y].,astronomy,physics,False,10.5,0.85,0.0,1.0,0.520492,0.960162
2,John Vincent Atanasoff,mathematics,The expertise of [X] is [Y].,[Y] is the specialization of [X].,astronomy,physics,False,11.0,0.8,0.0,1.0,0.128507,0.829516
3,John Vincent Atanasoff,mathematics,The expertise of [X] is [Y].,The domain of work of [X] is [Y].,astronomy,physics,False,12.0,0.675,0.0,0.999971,0.306692,0.817157
4,John Vincent Atanasoff,mathematics,The expertise of [X] is [Y].,[X]'s domain of activity is [Y].,astronomy,physics,False,11.5,0.55,0.0,1.0,0.092968,0.73504


In [122]:
agg_method = {col: ['mean', 'std'] for col in comp_data.columns if comp_data[col].dtype.kind in 'biufc'}
#for col in min_passage_overlap_cols:
#    agg_method[col] += ['sum']
agg_method["same_answer"] += ['count']
comp_data.groupby(["same_answer"]).agg(agg_method)

Unnamed: 0_level_0,same_answer,same_answer,same_answer,avg_generation_freq_rank,avg_generation_freq_rank,num_psgs_overlap,num_psgs_overlap,faithfulness_to_freq,faithfulness_to_freq,options_distrib_chi2,options_distrib_chi2,options_distrib_kl,options_distrib_kl,r_emb_similarity,r_emb_similarity
Unnamed: 0_level_1,mean,std,count,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
same_answer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
False,False,0.0,18386,18.457332,14.471626,0.467883,0.232443,0.142065,0.225505,0.852737,0.323142,0.269305,0.253712,0.80449,0.072176
True,True,0.0,22096,10.112396,14.879183,0.523766,0.202105,0.184988,0.241404,0.884477,0.289043,0.301835,0.257445,0.822731,0.067918


In [123]:
comp_data.agg(agg_method)

Unnamed: 0,same_answer,avg_generation_freq_rank,num_psgs_overlap,faithfulness_to_freq,options_distrib_chi2,options_distrib_kl,r_emb_similarity
count,40482.0,,,,,,
mean,0.545823,13.902475,0.498385,0.165493,0.870062,0.287061,0.814446
std,0.497902,15.271396,0.21819,0.235287,0.305408,0.256266,0.070471


Correlations

In [124]:
comp_data.corr()

Unnamed: 0,same_answer,avg_generation_freq_rank,num_psgs_overlap,faithfulness_to_freq,options_distrib_chi2,options_distrib_kl,r_emb_similarity
same_answer,1.0,-0.272075,0.127522,0.090833,0.051745,0.063203,0.128878
avg_generation_freq_rank,-0.272075,1.0,-0.061353,-0.36762,-0.002758,-0.084216,-0.051342
num_psgs_overlap,0.127522,-0.061353,1.0,0.059015,0.390799,0.606003,0.508615
faithfulness_to_freq,0.090833,-0.36762,0.059015,1.0,0.020184,0.102556,0.016176
options_distrib_chi2,0.051745,-0.002758,0.390799,0.020184,1.0,0.268267,0.208976
options_distrib_kl,0.063203,-0.084216,0.606003,0.102556,0.268267,1.0,0.331637
r_emb_similarity,0.128878,-0.051342,0.508615,0.016176,0.208976,0.331637,1.0


* Check std of metrics
* Check values for random passages
* Check correlation between metrics
* Lexical overlap
    * On stemmed words?
* Retriever representation distance

## Check metrics for random passages

In [31]:
import random
random_data = []

rand_ix_1 = random.sample(list(pd_data.index), 100)
rand_ix_2 = random.sample(list(pd_data.index), 100)
for ix_1, ix_2 in zip(rand_ix_1, rand_ix_2):
    dict_entry = get_comp_metrics(pd_data.iloc[ix_1], pd_data.iloc[ix_2])
    random_data.append(dict_entry)

random_data = pd.DataFrame(random_data)
random_data.head()

Unnamed: 0,subject,gold_obj,pattern_1,pattern_2,answer_1,answer_2,same_answer,num_psgs_overlap,lexical_overlap
0,Aleksei N. Leontiev,philosophy,[X] works in the area of [Y].,[X] works in the area of [Y].,economics,physiology,False,0.0,0.036458
1,Aleksei N. Leontiev,astronomy,[X] works in the area of [Y].,[X] works in the area of [Y].,astronomy,genetics,False,0.0,0.05704
2,Aleksei N. Leontiev,chemistry,[X] works in the area of [Y].,[X] works in the area of [Y].,chemistry,mathematics,False,0.0,0.074093
3,Aleksei N. Leontiev,anthropology,[X] works in the area of [Y].,[X] works in the area of [Y].,anthropology,science,False,0.0,0.078989
4,Aleksei N. Leontiev,nightclub,[X] works in the area of [Y].,[X] works in the area of [Y].,drawing,astronomy,False,0.0,0.027993


In [32]:
agg_method = {col: ['mean', 'std'] for col in random_data.columns if random_data[col].dtype.kind in 'biufc'}
#for col in min_passage_overlap_cols:
#    agg_method[col] += ['sum']
agg_method["same_answer"] += ['count']
random_data.groupby(["same_answer"]).agg(agg_method)

Unnamed: 0_level_0,same_answer,same_answer,same_answer,num_psgs_overlap,num_psgs_overlap,lexical_overlap,lexical_overlap
Unnamed: 0_level_1,mean,std,count,mean,std,mean,std
same_answer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
False,False,0.0,94,0.010638,0.05709,0.062672,0.03887
True,True,0.0,6,0.008333,0.020412,0.073273,0.026668
