# Build Diagnostic Datasets

Each diagnostic dataset is created by its own implementation, here called a 'prober'.

In [None]:
from src import get_data, dataset, diagnostic, util, \
    measure_and_match as mmp, \
    text_manipulation as tmp, \
    dataset_transfer as dtp

from typing import List
import logging
# getLogger() without a name returns the root logger; setting it to INFO
# means INFO-level records are no longer filtered out by the logger's level.
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

In [None]:
def create_diagnostic_data(dataobj: get_data.Data, force_overwrite=False, field_name='text_stopped'):
    """Build the diagnostic dataset of every prober for every query set of ``dataobj``.

    Three groups of probers are instantiated, in this order: ``mmp.ConstVarProber``
    instances (MMPs), probers from ``tmp`` (TMPs), and one prober from ``dtp`` (DTPs).
    Each prober's ``build_diagnostic_dataset`` is then called once per query set
    name returned by ``dataobj.list_qsets()``.

    Args:
        dataobj: Dataset object; supplies the Lucene index, the spaCy documents
            and the list of query set names.
        force_overwrite: Forwarded unchanged to ``build_diagnostic_dataset``.
        field_name: Passed as ``field_name`` to the ConstVarProbers that accept
            it below (all except 'exact_match').
    """
    lucene_index = dataobj.get_lucene_index()

    # MMPs, one row per prober: (name, var, const, extra keyword arguments).
    # NOTE(review): 'proximity' and 'ordering' pass no const_epsilons, and
    # 'exact_match' passes no field_name, so ConstVarProber's own defaults
    # apply there -- confirm this is intentional.
    mmp_specs = [
        ('booltf', mmp.BoolTf, [mmp.Len],
         {'const_epsilons': [.1], 'field_name': field_name}),
        ('tf', mmp.Tf, [mmp.Len, mmp.BoolTf],
         {'const_epsilons': [.1, 0], 'field_name': field_name}),
        ('idf', mmp.Idf, [mmp.Len, mmp.SumTf],
         {'const_epsilons': [.1, 0], 'field_name': field_name}),
        ('len', mmp.Len, [mmp.Tf],
         {'const_epsilons': [.1], 'field_name': field_name}),
        ('proximity', mmp.Proximity, [mmp.BoolTf, mmp.Len],
         {'field_name': field_name}),
        ('ordering', mmp.QueryTermOrdering, [mmp.BoolTf, mmp.Len],
         {'field_name': field_name}),
        ('exact_match', mmp.ExactMatch, [mmp.Tf, mmp.Len],
         {'const_epsilons': [0, .1]}),
    ]
    probers: List[diagnostic.Prober] = [
        mmp.ConstVarProber(name, index=lucene_index, var=var, const=const, **extra)
        for name, var, const, extra in mmp_specs
    ]

    # TMPs
    probers += [
        tmp.LemmatizationProber(dataobj.get_document_spacy()),
        tmp.StopwordRemoveProber(stopword_list=util.stopwords()),
        tmp.SpellErrorProber(lucene_index, max_samples_per_term=64),
        tmp.AbbreviationProber(dataobj.get_document_spacy()),
    ]

    # DTPs
    probers.append(dtp.CQATextSuccintnessProber(dataobj))

    # Query sets form the outer loop, so every prober finishes one query set
    # before the next query set is started.
    for qset in dataobj.list_qsets():
        for probe in probers:
            # NOTE(review): the meaning of the trailing positional True is not
            # visible here -- check Prober.build_diagnostic_dataset.
            probe.build_diagnostic_dataset(dataobj, qset, force_overwrite, True)

In [None]:
# Run the diagnostic data creation for every dataset the CqaDupStackCollector
# yields, passing force_overwrite=False.
collector = get_data.CqaDupStackCollector()
for dataobj in collector.iter_datasets():
    create_diagnostic_data(dataobj, force_overwrite=False)