# Setup Resources
- Load Downloaded Data
- Build Faiss Index
- Build Lucene Index
- Build Querysets
- Build spaCy Indices for Docset and Querysets
- Create Rankings for Querysets

In [None]:
from src import get_data, dataset, faiss, lucene, spacy
import pandas as pd
import logging
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Lift pandas' column-width limit so long text cells are displayed untruncated.
pd.set_option('display.max_colwidth', None)

# Configure the root logger (no name given) to let DEBUG-level records through.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Model name passed to faiss.FaissIndexWrapper.build and to SentenceTransformer in build_querysets.
sbert_model_name = 'GPL/cqadupstack-msmarco-distilbert-gpl'
# Model reference passed to spacy.SpacyIndex.build.
# NOTE(review): looks like a local path relative to the working directory - confirm.
spacy_model_name = 'resources/spacy_model'

In [None]:
def build_faiss(dataobj: get_data.CqaDupStackData):
    """Build the Faiss index of one dataset unless it already exists.

    The ``indices`` directory below ``dataobj.path`` is always created. When
    ``dataobj.has_faiss()`` reports an existing index nothing else happens;
    otherwise the docset is loaded and handed to
    ``faiss.FaissIndexWrapper.build`` together with the index directory, the
    index name ``'faiss'`` and the module-level ``sbert_model_name``.

    Args:
        dataobj: Dataset whose docset should be indexed.
    """
    index_path = dataobj.path / 'indices'
    index_path.mkdir(exist_ok=True)

    # Return before get_docset() so that an already-indexed dataset does not
    # pay for loading its docset (same lazy pattern as build_spacy).
    if dataobj.has_faiss():
        return

    docset = dataobj.get_docset()
    faiss.FaissIndexWrapper.build(
        docset, index_path, index_name='faiss',
        docs_id_field=docset.id_field_name,
        docs_text_field=docset.default_text_field_name,
        indexing_batch_size=2 ** 10, faiss_bucket_size=2 ** 8,
        sbert_model_name=sbert_model_name, show_progress_bar=True,
    )

# Call build_faiss once for each dataset yielded by CqaDupStackCollector().iter_datasets().
for data in get_data.CqaDupStackCollector().iter_datasets():
    build_faiss(data)

In [None]:
def build_lucene(dataobj: get_data.CqaDupStackData):
    """Build the Lucene index of one dataset unless it already exists.

    The ``indices`` directory below ``dataobj.path`` is always created. When
    ``dataobj.has_lucene()`` reports an existing index nothing else happens;
    otherwise the docset is loaded and handed to ``lucene.LuceneIndex.build``
    with the index directory, the name ``'lucene'`` and ``lang='en'``.

    Args:
        dataobj: Dataset whose docset should be indexed.
    """
    index_path = dataobj.path / 'indices'
    index_path.mkdir(exist_ok=True)

    # Return before get_docset() so that an already-indexed dataset does not
    # pay for loading its docset (same lazy pattern as build_spacy).
    if dataobj.has_lucene():
        return

    docset = dataobj.get_docset()
    lucene.LuceneIndex.build(docset, index_path, 'lucene', lang='en', show_progress_bar=True)

# Call build_lucene once for each dataset yielded by CqaDupStackCollector().iter_datasets().
for data in get_data.CqaDupStackCollector().iter_datasets():
    build_lucene(data)

In [None]:
def build_querysets(dataobj: get_data.CqaDupStackData):
    """Create the 'verbose' and 'keyword' querysets of a dataset if missing.

    For each queryset name not yet listed by ``dataobj.list_qsets()``, the
    matching CSV below ``dataobj.path`` is read (``queries.csv`` for
    'verbose', ``keyword_queries.csv`` for 'keyword') and passed to
    ``dataset.QuerySet.create_from_dataframe`` with ``'query_id'`` and
    ``'text'`` as column names. The ``queries`` directory is always created.

    Args:
        dataobj: Dataset for which the querysets should be created.
    """
    queries_dir = dataobj.path / 'queries'
    queries_dir.mkdir(exist_ok=True)

    # (queryset name, CSV file below dataobj.path holding its queries)
    qset_sources = (
        ('verbose', 'queries.csv'),
        ('keyword', 'keyword_queries.csv'),
    )

    # The sentence-transformer model is instantiated only once a queryset
    # actually has to be created, so datasets that are already complete do
    # not trigger a model load.
    sbert_model = None
    for qset_name, csv_name in qset_sources:
        if qset_name in dataobj.list_qsets():
            continue
        if sbert_model is None:
            sbert_model = SentenceTransformer(sbert_model_name, cache_folder='__model_cache__')
        queries_df = pd.read_csv(dataobj.path / csv_name)
        dataset.QuerySet.create_from_dataframe(queries_dir, qset_name, sbert_model, queries_df, 'query_id', 'text')

# Call build_querysets once for each dataset yielded by CqaDupStackCollector().iter_datasets().
for data in get_data.CqaDupStackCollector().iter_datasets():
    build_querysets(data)

In [None]:
def build_spacy(dataobj: get_data.CqaDupStackData):
    """Build missing spaCy indices for a dataset's docset and querysets.

    An index is considered present when its ``spacy`` path exists: under
    ``indices/`` for the docset and under ``queries/<name>/`` for each
    queryset listed by ``dataobj.list_qsets()``. For every missing one,
    ``spacy.SpacyIndex.build`` is called with the parent directory, the items
    to index, their count and the module-level ``spacy_model_name``.

    Args:
        dataobj: Dataset whose docset and querysets should be indexed.
    """
    # Docset index.
    docs_target = dataobj.path / 'indices/spacy'
    if not docs_target.exists():
        docs = dataobj.get_docset()
        doc_iter = docs.iter_for_indexing()
        doc_count = docs.num_documents()
        spacy.SpacyIndex.build(docs_target.parent, doc_iter, doc_count, spacy_model_name)

    # One index per queryset.
    for name in dataobj.list_qsets():
        queries_target = dataobj.path / f'queries/{name}/spacy'
        if queries_target.exists():
            continue
        queries = dataobj.get_queryset(name)
        query_iter = queries.iter_queries()
        query_count = queries.num_queries()
        spacy.SpacyIndex.build(queries_target.parent, query_iter, query_count, spacy_model_name)

# Call build_spacy once for each dataset yielded by CqaDupStackCollector().iter_datasets().
for data in get_data.CqaDupStackCollector().iter_datasets():
    build_spacy(data)

In [None]:
def build_rankings(dataobj: get_data.CqaDupStackData, k=10):
    """Create the missing 'neural' and 'bm25' rankings for every queryset.

    For each queryset listed by ``dataobj.list_qsets()`` and each ranking
    model, the ranking named ``<model>_<queryset>.csv`` is produced unless
    that name is already in ``dataobj.list_rankingsets()``. 'neural' rankings
    come from the Faiss index's ``search``; 'bm25' rankings come from the
    Lucene index's ``search_bm25_batch``. Results are saved below
    ``dataobj.path / 'rankings'``, which is always created.

    NOTE(review): the existence check compares names that include the
    ``.csv`` suffix - confirm that ``list_rankingsets()`` returns names in
    that form, otherwise rankings are rebuilt on every run.

    Args:
        dataobj: Dataset for which rankings should be created.
        k: Passed as ``k`` to both search calls.
    """
    r_path = dataobj.path / 'rankings'
    r_path.mkdir(exist_ok=True)

    # Indices are loaded at most once, and only when a ranking that needs
    # them is missing. The checks compare against None explicitly so that an
    # index object that happens to evaluate as falsy is not reloaded.
    f_idx, l_idx = None, None
    for qset_name in dataobj.list_qsets():
        for ranking_model in ('neural', 'bm25'):
            rset_name = f'{ranking_model}_{qset_name}.csv'
            if rset_name in dataobj.list_rankingsets():
                continue

            if ranking_model == 'neural':
                if f_idx is None:
                    f_idx = dataobj.get_faiss_index()
                rankings = f_idx.search(dataobj.get_queryset(qset_name), k=k)
            else:
                if l_idx is None:
                    l_idx = dataobj.get_lucene_index()
                rankings = l_idx.search_bm25_batch(dataobj.get_queryset(qset_name), k=k)
            rankings.saved(r_path / rset_name)

# Call build_rankings (default k) once for each dataset yielded by CqaDupStackCollector().iter_datasets().
for data in get_data.CqaDupStackCollector().iter_datasets():
    build_rankings(data)