# Evaluate Diagnostic Datasets

- Use the previously created diagnostic datasets.
- Evaluate every diagnostic dataset with both ranking models (BM25 and neural).
- Load all results with pandas and display each dataset's correlation values and whether the values are significant.

In [1]:
from src import get_data, diagnostic, scorer, util
import pandas as pd
# Set both pandas display limits to None so that the result tables shown
# further down are rendered without truncated cell contents or elided rows.
for _display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
# Score every diagnostic dataset with the BM25 ranking model.
bm25_scorer = scorer.BM25Scorer()

# Walk each dataset -> each of its query sets -> each probe listed for that
# query set, and build the matching DiagnosticDataset.
for dataobj in get_data.CqaDupStackCollector().iter_datasets():
    for qset in dataobj.list_qsets():
        for diag_dataset_name in dataobj.list_probes(qset):
            diag_dataset = diagnostic.DiagnosticDataset(dataobj, qset, diag_dataset_name)
            # Skip anything that already reports BM25 scores, so re-running
            # the cell only scores what is still missing.
            if diag_dataset.has_scored('bm25'):
                continue
            # NOTE(review): the meaning of the positional True is defined by
            # score_dataset in src.scorer - confirm there.
            bm25_scorer.score_dataset(diag_dataset, dataobj, True)

In [None]:
# Score every diagnostic dataset with the neural ranking model.
# NOTE(review): the string passed to NeuralScorer is presumably a pretrained
# model identifier - confirm how src.scorer resolves it.
neural_cqa_scorer = scorer.NeuralScorer('GPL/cqadupstack-msmarco-distilbert-gpl')

# Walk each dataset -> each of its query sets -> each probe listed for that
# query set, and build the matching DiagnosticDataset.
for dataobj in get_data.CqaDupStackCollector().iter_datasets():
    for qset in dataobj.list_qsets():
        for diag_dataset_name in dataobj.list_probes(qset):
            diag_dataset = diagnostic.DiagnosticDataset(dataobj, qset, diag_dataset_name)
            # Skip anything that already reports neural scores, so re-running
            # the cell only scores what is still missing.
            if diag_dataset.has_scored('neural'):
                continue
            # NOTE(review): the meaning of the positional True is defined by
            # score_dataset in src.scorer - confirm there.
            neural_cqa_scorer.score_dataset(diag_dataset, dataobj, True)

In [3]:
# For each dataset, build its two result tables and show them under the
# dataset's name (the last component of the dataset's path).
for dataobj in get_data.CqaDupStackCollector().iter_datasets():
    name = dataobj.path.name
    correlation_df, significant_df = util.create_results_dfs(dataobj, name)
    print(name)
    # Correlation table first, then the significance table.
    for _results_table in (correlation_df, significant_df):
        display(_results_table)

android


,model,bm25,neural
diagnostic,queryset,,
booltf,keyword,1.0,0.612
booltf,verbose,0.995,0.59
tf,keyword,1.0,0.186
tf,verbose,0.997,0.2
idf,keyword,0.965,0.211
idf,verbose,0.941,0.136
len,keyword,-0.868,-0.319
len,verbose,-0.859,-0.257
proximity,keyword,-0.221,-0.02
proximity,verbose,-0.266,-0.05


,model,bm25,neural
diagnostic,queryset,,
booltf,keyword,1,1
booltf,verbose,1,1
tf,keyword,1,1
tf,verbose,1,1
idf,keyword,1,1
idf,verbose,1,1
len,keyword,1,1
len,verbose,1,1
proximity,keyword,1,0
proximity,verbose,1,0


gaming


,model,bm25,neural
diagnostic,queryset,,
booltf,keyword,0.997,0.685
booltf,verbose,0.997,0.644
tf,keyword,1.0,0.39
tf,verbose,1.0,0.366
idf,keyword,0.943,0.182
idf,verbose,0.943,0.232
len,keyword,-0.925,-0.33
len,verbose,-0.92,-0.294
proximity,keyword,-0.193,-0.055
proximity,verbose,-0.125,0.02


,model,bm25,neural
diagnostic,queryset,,
booltf,keyword,1,1
booltf,verbose,1,1
tf,keyword,1,1
tf,verbose,1,1
idf,keyword,1,1
idf,verbose,1,1
len,keyword,1,1
len,verbose,1,1
proximity,keyword,1,0
proximity,verbose,0,0
