# IR Lab

## Install, import modules and download dataset

In [2]:
!pip install whoosh
!pip install pytrec-eval-terrier
!pip install wget




In [2]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget
import nltk
nltk.download('omw-1.4')

ModuleNotFoundError: No module named 'nltk'

In [None]:
# Download the lab data only if the archive is not already present, so that
# re-running the notebook does not fetch another copy of it.
if os.path.exists("lab-data.zip"):
    filename = "lab-data.zip"
else:
    filename = wget.download("https://github.com/MIE451-1513-2023/course-datasets/raw/main/lab-data.zip", "lab-data.zip")

In [None]:
!unzip lab-data.zip

In [None]:
DATA_DIR = "lab-data"
DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")
TOPIC_FILE = os.path.join(DATA_DIR, "air.topics")
QRELS_FILE = os.path.join(DATA_DIR, "air.qrels")

## Part 1: Basic Indexing

### Creating the index

To begin using Whoosh, you need an index object. The first time you create an index, you must define the index’s schema. The schema lists the fields in the index. A field is a piece of information for each document in the index, such as its title or text content. A field can be indexed (meaning it can be searched) and/or stored (meaning the value that gets indexed is returned with the results; this is useful for fields such as the title).

More information:
https://whoosh.readthedocs.io/en/latest/schema.html?highlight=schema

In [None]:
def createIndex(schema):
    """Create a new index for `schema` inside a fresh temporary directory.

    Every call obtains its own directory from `tempfile.mkdtemp()` and returns
    whatever `index.create_in` produces for that directory and schema.
    """
    return index.create_in(tempfile.mkdtemp(), schema)

In [None]:
# first, define a Schema for the index:
#   file_path    - ID field, stored so it can be read back from search results
#   file_content - TEXT field analyzed with a plain RegexTokenizer
mySchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))

# now, create the index (in a new temporary directory, see createIndex)
# based on the new schema
myIndex = createIndex(mySchema)

### Indexing the documents

In [None]:
def addFilesToIndex(indexObj, fileList):
    """Add every file in `fileList` to the index `indexObj`.

    Each file is read as UTF-8 text and added as one document: `file_path` is
    the path exactly as given, `file_content` is the whole text of the file.
    A progress line is printed after every 1000 documents, and the writer is
    closed even when reading or indexing fails part-way through.
    """
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        for numIndexed, path in enumerate(fileList, start=1):
            with open(path, "r", encoding="utf-8") as handle:
                text = handle.read()
            writer.add_document(file_path=path, file_content=text)

            # report progress once per 1000 documents
            if numIndexed % 1000 == 0:
                print("already indexed:", numIndexed)
        print("done indexing.")
    finally:
        # always release the writer, also after an error
        writer.close()

In [None]:
# Build a list of files to index.
# glob() yields files in an arbitrary, platform-dependent order; sorting fixes
# the order so that document numbers (and the outputs of the cells below)
# are the same on every run.
filesToIndex = sorted(str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file())

In [None]:
# Check the list
filesToIndex[:5]

In [None]:
# count files to index
print("number of files:", len(filesToIndex))

In [None]:
addFilesToIndex(myIndex, filesToIndex)

### Querying

More information: https://whoosh.readthedocs.io/en/latest/api/qparser.html?highlight=queryparser

In [None]:
# define a query parser for the field "file_content" in the index
myQueryParser = QueryParser("file_content", schema=myIndex.schema)
mySearcher = myIndex.searcher()

In [None]:
# run a sample query for the phrase "duck"
sampleQuery = myQueryParser.parse("duck")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

In [None]:
sampleQuery

In [None]:
sampleQueryResults

In [None]:
# inspect the result:
# for each document print the rank and the score
for (docnum, result) in enumerate(sampleQueryResults):
    score = sampleQueryResults.score(docnum)
    fileName = os.path.basename(result["file_path"])
    print(fileName, docnum, score)

In [None]:
sampleQueryResults.docs()

### Evaluation using TREC_EVAL
To evaluate our results we will use a topic file: a list of topics (search phrases) used to evaluate our IR system.

In [None]:
# print the topic file
with open(TOPIC_FILE, "r") as f:
    print(f.read())

We will compare our results with a set of judged results (the qrels file) using TREC_EVAL.

In [None]:
# print the first 10 lines in the qrels file
with open(QRELS_FILE, "r") as f:
    qrels10 = f.readlines()[:10]
    print("".join(qrels10))

The following function takes a topic file, a qrels file, a query parser and a searcher, and uses pytrec_eval to compare our results with the provided qrels file (see assignment PDF for more details).

In [None]:
def pyTrecEval(topicFile, qrelsFile, queryParser, searcher):
    """Run every topic through the searcher and print pytrec_eval measures.

    Parameters:
        topicFile   -- path to a text file with one topic per line, written as
                       "<topic_id> <search phrase>"; blank lines are ignored
        qrelsFile   -- path to the relevance judgements, read with
                       pytrec_eval.parse_qrel
        queryParser -- object whose parse(phrase) builds a query
        searcher    -- object whose search(query, limit=None) returns the
                       ranked results for that query

    Prints one line per (measure, topic), followed by one aggregated line per
    measure with scope 'all'. Nothing is printed when no topic can be
    evaluated. Returns None.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation
    with open(topicFile, "r") as tf:
        topics = [line for line in tf.read().splitlines() if line.strip()]

    # create an output file to which we'll write our results;
    # mkstemp() hands back an open descriptor that we own, so wrap it instead
    # of leaking it, and delete the file once the run has been parsed
    outputFd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(outputFd, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = topic.split(" ", 1)
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)
                for (docnum, result) in enumerate(topicResults):
                    score = topicResults.score(docnum)
                    outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(tempOutputFile, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)
    finally:
        os.remove(tempOutputFile)

    with open(qrelsFile, 'r') as f_qrel:
        qrel = pytrec_eval.parse_qrel(f_qrel)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    # Measure names are taken from any topic that was actually evaluated
    # (empty when no topic was evaluated at all).
    measure_template = next(iter(results.values()), {})

    # fill results dictionary with queries that were returned 0 documents
    topic_ids = {t.split()[0] for t in topics}
    for emptyresult_topicid in sorted(topic_ids.difference(results)):
        if emptyresult_topicid not in qrel:
            # no judgements for this topic, so there is nothing to score it against
            continue
        num_rel = float(sum(qrel[emptyresult_topicid].values()))
        # returning nothing scores 0 when relevant documents exist, 1 otherwise
        fill_value = 0.0 if num_rel > 0 else 1.0
        topic_stats = {measure: fill_value for measure in measure_template}
        topic_stats["num_rel"] = num_rel
        topic_stats["num_ret"] = 0.0
        topic_stats["num_rel_ret"] = 0.0
        topic_stats["num_q"] = 1.0

        results[emptyresult_topicid] = topic_stats

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    if not results:
        return

    # per-topic values
    for query_id, query_measures in results.items():
        for measure, value in query_measures.items():
            if measure == "runid":
                continue
            print_line(measure, query_id, value)

    # values aggregated over all topics
    for measure in next(iter(results.values())):
        if measure == "runid":
            continue
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [per_query[measure] for per_query in results.values()]))

In [None]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher)

In [None]:
def printRelName(topicFile, qrelsFile, queryParser, searcher, id):
    """Print the returned and the judged-relevant documents of one topic.

    Parameters:
        topicFile   -- path to a text file with one topic per line, written as
                       "<topic_id> <search phrase>"; blank lines are ignored
        qrelsFile   -- path to the qrels file; each judgement line has four
                       whitespace-separated fields: topic id, an unused field,
                       document name and relevance
        queryParser -- object whose parse(phrase) builds a query
        searcher    -- object whose search(query, limit=None) returns the
                       ranked results for that query
        id          -- id of the topic to show, as a string (e.g. "01")

    For every topic line whose id equals `id`, prints the topic, every
    returned document with its rank and score, and every qrels line of that
    topic whose relevance field is "1". Returns None.
    """
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    for topic in topics:
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        if topic_id != id:
            continue

        print("---------------------------Topic_id and Topic_phrase----------------------------------")
        print(topic_id, topic_phrase)
        topicQuery = queryParser.parse(topic_phrase)
        topicResults = searcher.search(topicQuery, limit=None)

        print("---------------------------Return documents----------------------------------")
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        print("---------------------------Relevant documents----------------------------------")
        with open(qrelsFile, 'r') as f_qrel:
            for line in f_qrel:
                # split on any whitespace so tab- or multi-space-separated
                # files work too; skip blank or malformed lines
                fields = line.split()
                if len(fields) != 4:
                    continue
                qid, _, _, rel = fields
                if qid == id and rel == "1":
                    print(line.rstrip())

In [None]:
printRelName(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher, "01")

## Part 2: Evaluating different configurations

### Inspecting our index

In [None]:
# Is it empty?
print("Index is empty?", myIndex.is_empty())

# How many files indexed?
print("Number of indexed files:", myIndex.doc_count())

In [None]:
# define a reader object on the index
myReader = myIndex.reader()

In [None]:
# print first 5 indexed documents
[(docnum, doc_dict) for (docnum, doc_dict) in myReader.iter_docs()][0:5]

In [None]:
# list indexed terms for field "file_content"
[term for term in myReader.field_terms("file_content")][1000:1025]

In [None]:
#how many terms do we have?
print(myReader.field_length("file_content"))

In [None]:
# how many documents contain the terms "bit", "are" and "get"
#   in the field "file_content"?
print("# docs with 'bit'", myReader.doc_frequency("file_content", "bit"))
print("# docs with 'are'", myReader.doc_frequency("file_content", "are"))
print("# docs with 'get'", myReader.doc_frequency("file_content", "get"))

### Text Analyzers

In [None]:
# we start with basic tokenizer
tokenizer = RegexTokenizer()
[token.text for token in tokenizer("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# we might want to use stemming:
stmAnalyzer = RegexTokenizer() | StemFilter()
[token.text for token in stmAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# We probably want to lower-case it
# so we add LowercaseFilter
stmLwrAnalyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
[token.text for token in stmLwrAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# we probably want to ignore words like "we", "are", "with" when we index files
# so we add StopFilter to filter stop words
stmLwrStpAnalyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# we also probably want to break phrases like "whoosh.analysis" into "whoosh" and "analysis"
# so we add IntraWordFilter
stmLwrStpIntraAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpIntraAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

### Evaluating the new analyzers

In [None]:
# define a Schema with the new analyzer
mySchema2 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = stmLwrStpIntraAnalyzer))

# create the index based on the new schema
myIndex2 = createIndex(mySchema2)

In [None]:
addFilesToIndex(myIndex2, filesToIndex)

In [None]:
# define a query parser for the field "file_content" in the index
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)
mySearcher2 = myIndex2.searcher()

In [None]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser2, mySearcher2)

In [None]:
# let's count the same words again, this time in the index built with the new analyzer
myReader2 = myIndex2.reader()
print("# docs with 'bit'", myReader2.doc_frequency("file_content", "bit"))
print("# docs with 'are'", myReader2.doc_frequency("file_content", "are"))
print("# docs with 'get'", myReader2.doc_frequency("file_content", "get"))

**Can you explain the differences?**

### Using NLTK's stemmers and lemmatizers

In [None]:
import nltk
from nltk.stem import *

In [None]:
# download required resources
nltk.download("wordnet")

In [None]:
# we'll compare two stemmers and a lemmatizer
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
wnLemm = WordNetLemmatizer()

In [None]:
# define a list of words to compare the stemmers on
listWords = ["going", "saying", "minimize", "maximum",
             "meeting", "files", "tries", "is", "are", "beautiful",
             "summarize", "better", "dogs", "phenomena"]

In [None]:
for word in listWords:
    print("%15s %15s %15s %15s" % (lrStem.stem(word),
                                   sbStem.stem(word),
                                   wnLemm.lemmatize(word),
                                   wnLemm.lemmatize(word, 'v')))

### How to use NLTK stemmers / lemmatizers in Whoosh

In [None]:
# Don't change this! Use it as-is in your code
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Whoosh filter that rewrites each token's text with a user-supplied function.

    `filterFunc` is called as `filterFunc(token_text, *args, **kwargs)` for
    every token and its return value becomes the new token text. This allows
    any callable that maps a string to a string (e.g. an NLTK stemmer's `stem`
    or a lemmatizer's `lemmatize`) to be placed in an analyzer pipeline.
    """

    # NOTE(review): presumably marks the filter as a morphological transform
    # for Whoosh -- confirm against the Whoosh analysis documentation.
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        """Remember the function and the extra arguments passed to it per token."""
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        """Two filters compare equal when they are of exactly the same class.

        The wrapped function and its arguments are not compared.
        """
        return (other is not None
                and self.__class__ is other.__class__)

    def __call__(self, tokens):
        """Yield each incoming token after applying the function to its text."""
        for t in tokens:
            # The same transformation is applied whether the token comes from
            # the query parser (t.mode == 'query') or from the indexer
            # (t.mode == 'index'), so indexed terms and query terms match.
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [None]:
# Example1: Whoosh filter for NLTK's LancasterStemmer
myFilter1 = RegexTokenizer() | CustomFilter(LancasterStemmer().stem)
[token.text for token in myFilter1("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# Example2: Whoosh filter for NLTK's WordNetLemmatizer
myFilter2 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize)
[token.text for token in myFilter2("We are going to do Text Analysis with whoosh.analysis")]

In [None]:
# Example3: Whoosh filter for NLTK's WordNetLemmatizer for verbs
myFilter3 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
[token.text for token in myFilter3("We are going to do Text Analysis with whoosh.analysis")]

You can now use myFilter1/2/3 as part of your Schema

------------
You can find details of other NLTK Stemmers and Lemmatizers here:

http://www.nltk.org/api/nltk.stem.html