We need:


1.   Document Collection (https://github.com/federicarollo/Italian-Crime-News)
2.   Test Queries
3.   Prejudged Assessments on the Queries






We have to:
*   Build a parser - Ula
*   Build a preprocessor - Viktor
*   Build the data structures (lexicon, docIndex, statistics, directIndex, inverseIndex) - Mateusz
*   Implement a Ranking Strategy - Ala
*   Find qrels
*   Evaluate








In [1]:
import math
import string

import ir_datasets
import nltk
import numpy as np
from nltk.stem.snowball import EnglishStemmer
from tqdm import tqdm

In [2]:
# Download the NLTK "stopwords" resource (quiet=True is passed to the downloader).
nltk.download("stopwords", quiet=True)
# English stopword list, stored as a set so membership tests in preprocess() are O(1).
stopwords = set(nltk.corpus.stopwords.words("english"))
# Single shared Snowball stemmer instance for English, reused by preprocess().
stemmer = EnglishStemmer()


def preprocess(s):
    """Normalize raw text into a list of stemmed tokens.

    Steps, in order: lowercase; drop ampersands; map typographic quotes and
    dashes to ASCII; turn apostrophes into spaces; turn every remaining
    punctuation character into a period; split on whitespace; strip leading
    and trailing periods from each token; append the period-separated parts of
    tokens that still contain a period; drop stopwords; stem each token.

    Args:
        s: The text to preprocess.

    Returns:
        A list of stemmed tokens. Tokens that came from the text keep their
        order; the parts of period-containing tokens follow after them.
    """
    # lowercasing
    s = s.lower()
    # ampersand is removed outright so that e.g. "at&t" stays a single token
    s = s.replace("&", "")
    # special chars
    s = s.translate(str.maketrans("‘’´“”–-", '   ""--'))
    # replace apostrophes with spaces
    s = s.replace("'", " ")
    # substitute punctuation with periods
    s = s.translate(str.maketrans(string.punctuation, "." * len(string.punctuation)))

    # tokenize (split() already trims and collapses runs of whitespace) and
    # remove periods from beginning and end of words
    tokens = [t.strip(".") for t in s.split()]

    # Add the period-separated parts of acronyms, numbers and hyphenated words
    # as extra tokens. They are collected in a separate list so that the token
    # list is not mutated while it is being iterated.
    parts = [
        part
        for token in tokens
        if "." in token
        for part in token.replace(".", " ").split()
    ]
    tokens.extend(parts)

    # Stopword removal and stemming are done per token: the stemmer works on a
    # single word, so stemming the joined string would treat the whole text as
    # one word. Empty tokens (from punctuation-only input) are dropped.
    return [stemmer.stem(t) for t in tokens if t and t not in stopwords]

In [13]:
# Load the "vaswani" collection through ir_datasets.
dataset = ir_datasets.load("vaswani")
# Number of documents to preprocess: the full collection by default.
DOCS = dataset.docs_count()
# Alternative fixed cap, kept for experiments:
# DOCS = 100000

In [14]:
# Token lists for the first DOCS documents, in collection order, so that a
# document's position in this list serves as its doc id.
preprocessed_docs = []
for doc in tqdm(dataset.docs_iter()[:DOCS], total=DOCS):
    preprocessed_docs.append(preprocess(doc.text))

100%|██████████| 11429/11429 [00:00<00:00, 28714.98it/s]


In [10]:
def lexicon(docs):
    """Map every distinct term in *docs* to an integer term id.

    Args:
        docs: Iterable of documents, each an iterable of term strings.

    Returns:
        A dict ``{term: term_id}`` with ids ``0 .. len(vocabulary) - 1``
        assigned in sorted term order.
    """
    terms = {term for doc in tqdm(docs) for term in doc}
    # Sort before numbering: set iteration order of strings changes between
    # interpreter runs (hash randomization), which would make term ids, and
    # any vector built from them, irreproducible.
    return {term: idx for idx, term in enumerate(sorted(terms))}


def incidence_vector(doc, vocab):
    """Return the term-count vector of *doc* over the vocabulary *vocab*.

    Args:
        doc: Iterable of terms; every term must be a key of *vocab*.
        vocab: Mapping from term to its integer position in the vector.

    Returns:
        An integer NumPy array of length ``len(vocab)`` whose entry at a
        term's id is the number of times that term occurs in *doc*.

    Raises:
        KeyError: If *doc* contains a term that is missing from *vocab*.
    """
    counts = np.zeros(len(vocab), dtype=int)
    for term_id in map(vocab.__getitem__, tqdm(doc)):
        counts[term_id] += 1
    return counts


def inverted_index_docidonly(docs):
    """Build an inverted index that stores only doc ids.

    Args:
        docs: Sequence of documents, each an iterable of terms. A document's
            position in the sequence is used as its doc id.

    Returns:
        A dict ``{term: [doc_id, ...]}`` where each posting list is in
        increasing doc-id order and lists every document at most once, so
        ``len(postings)`` is the term's document frequency.
    """
    vocab = {}

    for docid, doc in enumerate(tqdm(docs)):
        for term in doc:
            postings = vocab.setdefault(term, [])
            # Documents are visited in increasing docid order, so a repeated
            # occurrence inside the same document can only match the last
            # entry; skip it instead of storing a duplicate doc id.
            if not postings or postings[-1] != docid:
                postings.append(docid)

    return vocab


def inverted_index_docidfreqs(docs):
    """Build an inverted index that stores per-document term frequencies.

    Args:
        docs: Sequence of documents, each an iterable of terms. A document's
            position in the sequence is used as its doc id.

    Returns:
        A dict ``{term: {doc_id: count}}`` where ``count`` is the number of
        occurrences of the term in that document.
    """
    index = {}

    for doc_number, tokens in enumerate(tqdm(docs)):
        for token in tokens:
            freqs = index.setdefault(token, {})
            freqs[doc_number] = freqs.get(doc_number, 0) + 1

    return index


def inverted_index_docidpos(docs):
    """Build a positional inverted index.

    Args:
        docs: Sequence of documents, each a sequence of terms. A document's
            position in the sequence is used as its doc id.

    Returns:
        A dict ``{term: {doc_id: [pos, ...]}}`` listing, in increasing order,
        the zero-based positions at which the term occurs in each document.
    """
    index = {}

    for doc_number, tokens in enumerate(tqdm(docs)):
        for offset, token in enumerate(tokens):
            per_doc = index.setdefault(token, {})
            per_doc.setdefault(doc_number, []).append(offset)

    return index

In [11]:
# Build each data structure once over the preprocessed collection.
# term -> term id
l = lexicon(preprocessed_docs)
# term-count vector of the document at position 10, over the lexicon above
iv = incidence_vector(preprocessed_docs[10], l)
# term -> list of doc ids
iid = inverted_index_docidonly(preprocessed_docs)
# term -> {doc id: term frequency}
iif = inverted_index_docidfreqs(preprocessed_docs)
# term -> {doc id: [positions]}; this is the index handed to bm25_ranking
iip = inverted_index_docidpos(preprocessed_docs)

100%|██████████| 100000/100000 [00:00<00:00, 366401.16it/s]
100%|██████████| 26/26 [00:00<00:00, 152094.71it/s]
100%|██████████| 100000/100000 [00:01<00:00, 74439.18it/s]
100%|██████████| 100000/100000 [00:01<00:00, 82690.69it/s]
100%|██████████| 100000/100000 [00:03<00:00, 27197.40it/s]


In [12]:
# Dump the whole term -> term id mapping for inspection.
print(l)



In [13]:
#  BM25 ranking function
def bm25_ranking(query, docIndex, inverseIndex, k=1.5, b=0.75):
    """Rank documents against *query* with Okapi BM25.

    Args:
        query: Raw query text; it is tokenized with ``preprocess``.
        docIndex: Table whose ``"preprocessed"`` column holds one token list
            per document, in doc-id order (a pandas DataFrame in this
            notebook; only the column is read).
        inverseIndex: Mapping ``{term: {doc_id: postings}}``. ``len(postings)``
            is taken as the term frequency, so this expects the positional
            index; the number of entries per term is its document frequency.
        k: Term-frequency saturation parameter.
        b: Document-length normalization strength.

    Returns:
        A list of ``(doc_id, score)`` pairs sorted by descending score.
        Documents containing none of the query terms are omitted. An empty
        list is returned when the collection has no documents or no tokens.
    """
    # Read document lengths once instead of one table lookup per posting.
    doc_lengths = [len(doc) for doc in docIndex["preprocessed"]]
    num_docs = len(doc_lengths)
    total_len = sum(doc_lengths)
    if num_docs == 0 or total_len == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return []
    avg_doc_len = total_len / num_docs

    scores = {}
    for token in preprocess(query):
        postings = inverseIndex.get(token)
        if not postings:
            continue

        # idf depends only on the term, so compute it once per query token.
        # The logarithm is part of the BM25 idf; clamping at zero keeps terms
        # found in more than half of the documents from scoring negatively.
        doc_freq = len(postings)
        idf = max(0.0, math.log((num_docs - doc_freq + 0.5) / (doc_freq + 0.5)))

        for doc_id, posting_list in postings.items():
            term_freq = len(posting_list)
            length_norm = 1 - b + b * doc_lengths[doc_id] / avg_doc_len
            tf_component = term_freq * (k + 1) / (term_freq + k * length_norm)
            scores[doc_id] = scores.get(doc_id, 0) + idf * tf_component

    ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked_docs

In [14]:
# Smoke test: rank the collection against the last query the dataset yields.
for q in dataset.queries_iter():
    query = q
# NOTE(review): `df` is not defined in the visible cells. It is presumably a
# pandas DataFrame of the documents that also needs the column assigned by the
# commented-out line below -- confirm before running this cell.
# df["preprocessed"] = preprocessed_docs
print(q)
# ir_datasets yields query records rather than strings, while bm25_ranking
# lower-cases and tokenizes its first argument, so pass the record's text.
results = bm25_ranking(query.text, df, iip)[:5]

for doc_id, score in results:
    print(df.iloc[doc_id]["title"], score)

[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] download error: HTTPSConnectionPool(host='msmarco.z22.web.core.windows.net', port=443): Max retries exceeded with url: /msmarcoranking/msmarco-test2020-queries.tsv.gz (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x74f62f3a6450>: Failed to resolve 'msmarco.z22.web.core.windows.net' ([Errno -3] Temporary failure in name resolution)")). Retrying from start.
[INFO] download error: HTTPSConnectionPool(host='msmarco.z22.web.core.windows.net', port=443): Max retries exceeded with url: /msmarcoranking/msmarco-test2020-queries.tsv.gz (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x74f5f8166f90>: Failed to resolve 'msmarco.z22.web.core.windows.net' ([Errno -3] Temporary failure in name resolution)")). Retrying from start.


ConnectionError: HTTPSConnectionPool(host='msmarco.z22.web.core.windows.net', port=443): Max retries exceeded with url: /msmarcoranking/msmarco-test2020-queries.tsv.gz (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x74f5f813f3e0>: Failed to resolve 'msmarco.z22.web.core.windows.net' ([Errno -3] Temporary failure in name resolution)"))