# Download Data

- Use `ir_datasets` to download a document corpus and a set of sample queries for the forums `android` and `gaming` from `CQADupStack`.
- Load the tokenizer of the model `GPL/cqadupstack-msmarco-distilbert-gpl` and filter out queries and documents that are too long to be processed by it in their entirety.
- Create keyword queries by reducing the original queries.
- Create abbreviation pairs from WordNet and augment the spaCy pipeline such that it is able to recognize them.

In [None]:
import ir_datasets
import pandas as pd
from pathlib import Path
import logging
from src import util
from transformers import AutoTokenizer
from tqdm.autonotebook import tqdm
import spacy
# Register tqdm's pandas integration; the `progress_apply` calls further down rely on it.
tqdm.pandas()

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# No name is passed to `getLogger`, so this is the root logger; its level is set to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# Datasets to download; only the CQADupStack forums that do not involve programming are needed.
datasets = ['beir/cqadupstack/android', 'beir/cqadupstack/gaming']
# Only the tokenizer of this model is loaded here; it is used to measure text lengths in tokens.
sbert_model_name = 'GPL/cqadupstack-msmarco-distilbert-gpl'
tokenizer = AutoTokenizer.from_pretrained(sbert_model_name, cache_dir='__model_cache__')
# Maximum number of tokens a query or document may have to survive the length filters below.
# NOTE(review): this is the smallest entry of the tokenizer's `max_model_input_sizes` table, not a
# limit read from the loaded model -- confirm it matches the model's real maximum sequence length
# and that the attribute is available in the installed `transformers` version.
max_text_len = min(tokenizer.max_model_input_sizes.values()) - 2  # account for [CLS] and [SEP]

In [None]:
# Root output directory; `download_dataset` creates one sub-directory per dataset below it.
base_path = Path('data/cqadupstack')
# Create the directory (and any missing parents) up front; an existing directory is not an error.
base_path.mkdir(exist_ok=True, parents=True)
def download_dataset(name: str):
    """Load one ``ir_datasets`` dataset and store its length-filtered contents as CSV files.

    Three files are written to ``base_path / <last component of name>``:

    * ``queries.csv`` -- the queries whose token count is at most ``max_text_len``.
    * ``keyword_queries.csv`` -- the result of ``util.reduce_to_keyword_queries``
      applied to those filtered queries.
    * ``documents.csv`` -- one row per document body and one per document title
      (shuffled with a fixed seed), again restricted to ``max_text_len`` tokens.

    Args:
        name: Dataset identifier understood by ``ir_datasets.load``; the part
            after the last ``/`` is used as the output directory name.
    """

    def fits_model(text):
        # `tokenize` is called on the raw text only; `max_text_len` already
        # leaves room for the [CLS] and [SEP] tokens.
        return len(tokenizer.tokenize(text)) <= max_text_len

    def suffixed_id(row):
        # Bodies ("text" rows) get a "b" suffix, titles a "t" suffix, so both
        # rows derived from the same document keep distinct ids.
        if row.variable == "text":
            marker = "b"
        else:
            marker = "t"
        return f'{row.doc_id}{marker}'

    target_dir = base_path / name.split('/')[-1]
    target_dir.mkdir(exist_ok=True)

    api = ir_datasets.load(name)

    # --- queries ---------------------------------------------------------
    queries = pd.DataFrame.from_records(
        api.queries_iter(),
        columns=ir_datasets.datasets.beir.BeirCqaQuery._fields,
    )
    query_is_short_enough = queries.text.progress_apply(fits_model)
    queries = queries[query_is_short_enough]
    queries.to_csv(target_dir / 'queries.csv', index=False)

    # Keyword variants are derived from the already filtered queries.
    keyword_queries = util.reduce_to_keyword_queries(queries)
    keyword_queries.to_csv(target_dir / 'keyword_queries.csv', index=False)

    # --- documents -------------------------------------------------------
    docs = pd.DataFrame.from_records(
        api.docs_iter(),
        columns=ir_datasets.datasets.beir.BeirCqaDoc._fields,
    )
    # Turn every document into two rows (body and title), then shuffle
    # reproducibly.
    docs = docs.melt(id_vars=['doc_id', 'tags'], value_vars=['text', 'title'])
    docs = docs.sample(frac=1, random_state=42)
    docs['doc_id'] = docs.apply(suffixed_id, axis=1)
    doc_is_short_enough = docs['value'].progress_apply(fits_model)
    docs = docs[doc_is_short_enough]
    docs.to_csv(target_dir / 'documents.csv', index=False)

# Download, filter and store every configured dataset.
for dset in datasets:
    download_dataset(dset)


In [None]:
# Build the abbreviation table and store it next to the spaCy model created below.
abbreviations_df = util.build_abbreviation_dataframe_from_wordnet()
# Make sure the output directory exists before anything is written into it;
# `to_csv` does not create missing directories.
resources_path = Path('resources')
resources_path.mkdir(exist_ok=True, parents=True)
abbreviations_df.to_csv(resources_path / 'abbreviations.csv')
spacy_model_name = 'en_core_web_sm'

# Replace the pipeline's `ner` component with an entity ruler that is fed the
# patterns generated from the abbreviation table.
nlp = spacy.load(spacy_model_name)
nlp.remove_pipe('ner')
patterns = util.abbreviation_df_to_spacy_patterns(abbreviations_df, nlp)
# `add_pipe` returns the component it created, so no separate `get_pipe` lookup is needed.
entity_ruler = nlp.add_pipe('entity_ruler')
entity_ruler.add_patterns(patterns)
# Persist the augmented pipeline to disk.
nlp.to_disk(resources_path / 'spacy_model')