# Assignment 2: IR

## Preparations
* Put all your imports, and path constants in the next cells
* Make sure all your path constants are **relative to** ***DATA_DIR*** and **NOT hard-coded** in your code.

In [1]:
# imports
# Put all your imports here
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
from whoosh import scoring, qparser
import nltk
from nltk.stem import *

import re

In [2]:
DATA_DIR = "government"

#
# Put other path constants here
#

DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")
TOPIC_FILE = os.path.join(DATA_DIR, "gov.topics")
QRELS_FILE = os.path.join(DATA_DIR, "gov.qrels")
# Path of the trec_eval executable, relative to the notebook.
# os.path.join never raises, so a try/except around it can never reach a
# fallback; choose the Windows ".exe" name explicitly from the platform instead.
TREC_EVAL = os.path.join("trec_eval", "trec_eval.exe" if os.name == "nt" else "trec_eval")

## Question 1
Provide your text answers in the following two markdown cells

### Q1 (a): Provide answer to Q1 (a) here [MAP]

### Q1 (b): Provide answer to Q1 (b) here [We want to have the ordering taken into account so that these sites can be found within the first few clicks, and at the same time, MAP also takes into account the number of relevant documents to take the average which normalizes the metric for each query.]

#### Functions

In [3]:
def createIndex(schema):
    """Create a new index for ``schema`` inside a freshly made temporary directory.

    Args:
        schema: the schema handed to ``index.create_in``.

    Returns:
        Whatever ``index.create_in`` returns for the new directory.
    """
    return index.create_in(tempfile.mkdtemp(), schema)

In [4]:
def addFilesToIndex(indexObj, fileList):
    """Add every file in ``fileList`` to ``indexObj``, one document per file.

    Each file is read as UTF-8 text and written with the fields ``file_path``
    (the path exactly as given) and ``file_content`` (the whole text).  A
    progress line is printed after every 1000 documents and "done indexing."
    once all files have been added.  The writer is closed even if reading or
    adding a document fails.

    Args:
        indexObj: index to write to; passed to ``writing.BufferedWriter``.
        fileList: iterable of paths of the files to index.
    """
    # open writer
    writer = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # write each file to index; count from 1 so the counter is the number
        # of documents added so far
        for docCount, filePath in enumerate(fileList, start=1):
            with open(filePath, "r", encoding="utf-8") as f:
                fileContent = f.read()
            writer.add_document(file_path=filePath,
                                file_content=fileContent)

            # print status every 1000 documents
            # (`docNum+1 % 1000 == 0` binds as `docNum + (1 % 1000) == 0`,
            # which is never true, so the modulo must apply to the whole count)
            if docCount % 1000 == 0:
                print("already indexed:", docCount)
        print("done indexing.")

    finally:
        # close the writer
        writer.close()

In [5]:
def trecEval(topicFile, qrelsFile, queryParser, searcher):
    """Run every topic through ``searcher`` and evaluate the rankings with trec_eval.

    For each topic the hits are written to a temporary run file, one line per
    hit, as ``<topic_id> Q0 <file name> <rank> <score> test``.  trec_eval is
    then invoked with ``-q`` on the qrels file and that run file, and the
    temporary file is removed afterwards.

    Args:
        topicFile: path of a text file with one topic per line, formatted as
            ``<topic_id> <search phrase>``.  Blank lines are ignored.
        qrelsFile: path of the relevance-judgement file given to trec_eval.
        queryParser: object whose ``parse(phrase)`` builds the query.
        searcher: object whose ``search(query, limit=None)`` returns the hits;
            the returned object is iterated and asked for ``score(rank)``.

    Returns:
        The decoded text trec_eval wrote to standard output.

    Raises:
        ValueError: if a non-blank topic line has an id but no search phrase.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation.
    # Blank lines (e.g. a trailing empty line) carry no topic and are skipped.
    with open(topicFile, "r") as tf:
        topics = [line for line in tf.read().splitlines() if line.strip()]

    # mkstemp hands back an already-open descriptor together with the path;
    # wrap the descriptor instead of discarding it so that it gets closed.
    fd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                parts = topic.split(None, 1)
                if len(parts) != 2:
                    raise ValueError("malformed topic line (expected '<id> <phrase>'): %r" % topic)
                topic_id, topic_phrase = parts
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)
                for (rank, hit) in enumerate(topicResults):
                    score = topicResults.score(rank)
                    outputTRECFile.write("%s Q0 %s %d %f test\n" % (topic_id, os.path.basename(hit["file_path"]), rank, score))

        completed = subprocess.run([TREC_EVAL, '-q', qrelsFile, tempOutputFile], stdout=subprocess.PIPE)
    finally:
        # the run file is only needed while trec_eval reads it
        os.remove(tempOutputFile)
    return completed.stdout.decode()

## Question 2

In [6]:
filesToIndex = [str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file()]

### Q2 (a): Write your code below

In [7]:
mySchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer())) #regextoken splits file into words

In [8]:
INDEX_Q2 = createIndex(mySchema) # Replace None with your index for Q2

In [9]:
addFilesToIndex(INDEX_Q2, filesToIndex)

done indexing.


In [10]:
QP_Q2 = QueryParser("file_content", schema=INDEX_Q2.schema)# Replace None with your query parser for Q2
SEARCHER_Q2 = INDEX_Q2.searcher() # Replace None with your searcher for Q2

In [11]:
result = trecEval(TOPIC_FILE, QRELS_FILE, QP_Q2, SEARCHER_Q2) 
print(result)

num_ret        	1	1
num_rel        	1	5
num_rel_ret    	1	0
map            	1	0.0000
R-prec         	1	0.0000
bpref          	1	0.0000
recip_rank     	1	0.0000
ircl_prn.0.00  	1	0.0000
ircl_prn.0.10  	1	0.0000
ircl_prn.0.20  	1	0.0000
ircl_prn.0.30  	1	0.0000
ircl_prn.0.40  	1	0.0000
ircl_prn.0.50  	1	0.0000
ircl_prn.0.60  	1	0.0000
ircl_prn.0.70  	1	0.0000
ircl_prn.0.80  	1	0.0000
ircl_prn.0.90  	1	0.0000
ircl_prn.1.00  	1	0.0000
P5             	1	0.0000
P10            	1	0.0000
P15            	1	0.0000
P20            	1	0.0000
P30            	1	0.0000
P100           	1	0.0000
P200           	1	0.0000
P500           	1	0.0000
P1000          	1	0.0000
num_ret        	2	6
num_rel        	2	2
num_rel_ret    	2	0
map            	2	0.0000
R-prec         	2	0.0000
bpref          	2	0.0000
recip_rank     	2	0.0000
ircl_prn.0.00  	2	0.0000
ircl_prn.0.10  	2	0.0000
ircl_prn.0.20  	2	0.0000
ircl_prn.0.30  	2	0.0000
ircl_prn.0.40  	2	0.0000
ircl_prn.0.50  	2	0.0000
ircl_prn.0.60  	2	0.0000
ircl_

In [12]:
# Pull the MAP rows (one per evaluated topic plus the overall row) out of the
# trec_eval report and list them
map_list = [row for row in result.splitlines() if 'map' in row]
for map_row in map_list:
    print(map_row)

map            	1	0.0000
map            	2	0.0000
map            	4	0.0312
map            	6	0.0000
map            	7	0.0000
map            	9	0.0000
map            	10	0.1667
map            	14	0.2500
map            	16	0.0000
map            	18	1.0000
map            	22	0.2000
map            	24	1.0000
map            	26	0.1111
map            	28	0.0000
map            	all	0.1971


### Q2 (b): Provide answer to Q2 (b) here [Poorly, map (all) = 0.1971]

### Q2 (c): Provide answer to Q2(c) here [Yes, it did a fairly good job for Query 18 and 24 with MAP equal to 1. On the other hand, it did extremely badly for 1, 2, 6, 7, 9, 16  and 28 where MAP=0]

## Question 3

### Q3 (a): Provide answer to Q3 (a) here [Intuitively, stemming and lemmatization of both queries and documents would help. In the original search system, a basic regular expression analyzer was used to tokenize the words in the documents, by only looking at spaces between text. It is possible that some potentially relevant documents contain the words in the queries but in different forms, such as suffix ('preparedness' in 16), capitalization ('gold' in 1), tense ('mining' in 1), plurality ('searches' in 9) etc. Also, removing stop words ('and' in 16 and 19) could help put more emphasis on the more important terms. Take query 28 as an example for checking FN and FP. By searching 'Early Childhood Education', an FP file 'G00-75-2371200' pops up, which is really just a resource book containing all names of different institutions. It showed up in the result only because the keywords 'early', 'childhood' or 'education' show up in the document multiple times in the names of different institutions. On the other hand, 'G00-54-2576117' is an FN because it is a relevant file but was not retrieved. This is because the keywords do not appear explicitly in this document, only in their various forms.]

In [13]:
# print the topic file
with open(TOPIC_FILE, "r") as f:
    print(f.read())

1 mining gold silver coal
2 juvenile delinquency
4 wireless communications
6 physical therapists
7 cotton industry
9 genealogy searches
10 Physical Fitness
14 Agricultural biotechnology
16 Emergency and disaster preparedness assistance
18 Shipwrecks
19 Cybercrime, internet fraud, and cyber fraud
22 Veteran's Benefits
24 Air Bag Safety
26 Nuclear power plants
28 Early Childhood Education



In [14]:
# define a reader object on the index
myReader = INDEX_Q2.reader()

Suffix

In [15]:
print("# of docs with 'preparedness': ", myReader.doc_frequency("file_content", "preparedness"))
print("# of docs with 'prepare': ", myReader.doc_frequency("file_content", "prepare"))

# of docs with 'preparedness':  26
# of docs with 'prepare':  73


Capitalization

In [16]:
print("# of docs with 'gold': ", myReader.doc_frequency("file_content", "gold"))
print("# of docs with 'Gold': ", myReader.doc_frequency("file_content", "Gold"))

# of docs with 'gold':  28
# of docs with 'Gold':  33


Tense

In [17]:
print("# of docs with 'mine': ", myReader.doc_frequency("file_content", "mine"))
print("# of docs with 'mining': ", myReader.doc_frequency("file_content", "mining"))

# of docs with 'mine':  30
# of docs with 'mining':  45


Plurality

In [18]:
print("# of docs with 'search': ", myReader.doc_frequency("file_content", "search"))
print("# of docs with 'searches': ", myReader.doc_frequency("file_content", "searches"))

# of docs with 'search':  581
# of docs with 'searches':  50


In [19]:
with open(QRELS_FILE,'r') as f:
    qrels_file = f.read()
# look at qrels and keep only the relevant judgements (only show 1's).
# Compare the last whitespace-separated field itself rather than searching the
# line for the substring ' 1', which would also match any other field that
# happens to start with "1"; the slice keeps blank lines from raising.
qrels_list = [x for x in qrels_file.splitlines() if x.split()[-1:] == ['1']]

for qrel in qrels_list:
    print(qrel)

1 0 G00-00-1006224 1
1 0 G00-02-0901987 1
1 0 G00-03-1898526 1
1 0 G00-10-3730888 1
1 0 G00-10-3849661 1
2 0 G00-08-1145623 1
2 0 G00-37-1427392 1
4 0 G00-03-2855342 1
4 0 G00-36-1275993 1
4 0 G00-47-2117970 1
4 0 G00-65-0162935 1
6 0 G00-10-0106475 1
7 0 G00-07-4009621 1
7 0 G00-10-3302265 1
7 0 G00-76-1350144 1
9 0 G00-91-3181951 1
10 0 G00-04-0412407 1
14 0 G00-89-0000000 1
16 0 G00-03-0589290 1
16 0 G00-21-0494028 1
16 0 G00-21-2114990 1
16 0 G00-32-0551737 1
16 0 G00-86-3719816 1
16 0 G00-92-2974327 1
16 0 G00-99-0140748 1
18 0 G00-07-0978415 1
19 0 G00-02-3479535 1
19 0 G00-10-2344253 1
22 0 G00-08-2045138 1
24 0 G00-35-3406418 1
26 0 G00-01-1806077 1
26 0 G00-01-3645577 1
26 0 G00-92-1620651 1
28 0 G00-02-0541868 1
28 0 G00-54-2576117 1


In [20]:
# relevant docs for query 28
# (require whitespace right after the id so that a longer topic id starting
# with "28", e.g. 280, is not picked up as well)
qrels2 = [x for x in qrels_list if re.match(r'28\s', x)]

for qrel in qrels2:
    print(qrel)

28 0 G00-02-0541868 1
28 0 G00-54-2576117 1


In [21]:
#test query 
sampleQuery = QP_Q2.parse("Early Childhood Education")
sampleQueryResults = SEARCHER_Q2.search(sampleQuery, limit=None)

# inspect the result:
# for each hit print the file name, the rank and the score.
# The loop variables are named `rank` and `hit` on purpose: calling the hit
# `result` would overwrite the notebook-level `result` that holds the
# trec_eval report, breaking any later re-run of the cells that read it.
for (rank, hit) in enumerate(sampleQueryResults):
    score = sampleQueryResults.score(rank)
    fileName = os.path.basename(hit["file_path"])
    print(fileName, rank, score)

G00-75-2371200 0 24.505669484474577
G00-93-3702508 1 21.152985613082592
G00-99-2279811 2 20.492992401981702
G00-30-2788847 3 20.139545266992123
G00-31-0429249 4 19.988408410778327
G00-61-3894960 5 19.96065237788488
G00-93-4160214 6 19.96065237788488
G00-48-1527977 7 19.96065237788488
G00-78-2978026 8 19.848212509177156
G00-28-3705847 9 19.793503966440316
G00-50-3231467 10 19.3188594060358
G00-74-2972556 11 19.244841496114134
G00-77-3295130 12 19.031552270465305
G00-91-3997333 13 18.80822706005531
G00-93-1203370 14 18.558858464045002
G00-82-0211909 15 18.217312863267484
G00-16-2494170 16 17.101565082467438
G00-09-4172401 17 16.71374741412056
G00-27-2159399 18 13.352770387844203
G00-82-3144058 19 12.296383589803305
G00-65-4078383 20 11.960487737698525
G00-39-3477058 21 11.365611520580897


In [22]:
# G00-54-2576117 is a FN, check it out
# (path assembled with os.path.join instead of a hand-written separator, and
# read as UTF-8, the same encoding addFilesToIndex uses when indexing)
with open(os.path.join(DOCUMENTS_DIR, '54', 'G00-54-2576117'), 'r', encoding='utf-8') as f:
    print(f.read())

http://goldmine.cde.ca.gov/cyfsbranch/chssco/

                        California Dept. of Education
          Work Plan   | Advisory Committee  |  BRIDGES  | CPR | Search

           CHSSCO


                 California Head Start State Collaboration Office

                 Michael Silver, Project Director
                 Michael Zito, Project Coordinator


   Hot topics

             CHSA Annual Conference & Leadership Day

             Ninth Annual Family Literacy Conference


   An Overview of Head Start

   Head Start is a national program that provides comprehensive
   developmental services for low-income children from birth to entry in
   elementary school. Over a span of more than thirty years, Head Start
   has provided educational, social, medical, dental, nutrition, mental
   health services, and parent involvement activities to almost 18 million
   children nationwide.

   Head Start is currently funded at over $6 billion and serves more than
   857,000 low-income childr

In [23]:
# G00-75-2371200 is a FP, check it out
# (path assembled with os.path.join instead of a hand-written separator, and
# read as UTF-8, the same encoding addFilesToIndex uses when indexing)
with open(os.path.join(DOCUMENTS_DIR, '75', 'G00-75-2371200'), 'r', encoding='utf-8') as f:
    print(f.read())

http://www.cdpac.ca.gov/files/Resources.html

   [CDPAC_Resources_01.gif]


   [CDPAC_Resources_02.gif] CDPAC Home Page
   [CDPAC_Resources_06.gif] Who We Are
   [CDPAC_Resources_08.gif] Local Planning
   [CDPAC_Resources_10.gif] Monthly Meetings
   [CDPAC_Resources_14.gif] Legislation
   [CDPAC_Resources_16.gif] Publications
   [CDPAC_Resources_18.gif] Resources
   [CDPAC_Resources_20.gif] E-Mail Us
   [CDPAC_Resources_22.gif]


   Job Announcements & Duty Statements
   *** There are currently no Job Announcements ***
   Resource Links

   blue_dot.gif (336 bytes) American Academy of Pediatrics
   blue_dot.gif (336 bytes) American Medical Association
   blue_dot.gif (336 bytes) American Public Human Services Association
   blue_dot.gif (336 bytes) AT&T Foundation
   blue_dot.gif (336 bytes) Brain Awareness Information
   blue_dot.gif (336 bytes) Bureau of Labor Statistics
   blue_dot.gif (336 bytes) California Children and Families Commission
   blue_dot.gif (336 bytes) California Com

### Q3 (b): Write your code below

In [24]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Token filter that rewrites each token's text with a caller-supplied function.

    ``filterFunc`` is called as ``filterFunc(token_text, *args, **kwargs)`` for
    every token, in both 'query' and 'index' mode, and its return value
    replaces the token's text.
    """
    is_morph = True
    def __init__(self, filterFunc, *args, **kwargs):
        # function applied to each token's text, plus the extra positional and
        # keyword arguments forwarded to it on every call
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs
    def __eq__(self, other):
        # `other` was missing from the signature, so any `==` comparison raised
        # TypeError. Equality is by class only, as the original expression intended.
        return (other is not None
                and self.__class__ is other.__class__)
    def __call__(self, tokens):
        # Both branches apply the same transformation; they are kept separate
        # so index-time and query-time handling can diverge if ever needed.
        for t in tokens:
            if t.mode == 'query': # if called by query parser
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t
            else: # == 'index' if called by indexer
                t.text = self.customFilter(t.text, *self.args, **self.kwargs)
                yield t

In [25]:
# download required resources
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /Users/rong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Define the analyzer used for Q3/Q4 (a tokenizer followed by a chain of filters):
# regex tokenization -> lowercasing -> IntraWordFilter -> StopFilter
# -> Lancaster stemming -> WordNet lemmatization -> WordNet lemmatization with 'n'.
# NOTE(review): NLTK's WordNetLemmatizer.lemmatize appears to default to pos='n',
# which would make the last two filters equivalent -- confirm against the NLTK
# docs before removing either one, since the reported MAP values depend on this
# exact pipeline.
# NOTE(review): the lemmatizers run on Lancaster stems rather than on the
# original words; presumably they rarely change anything at that point -- verify.
lancasterTokenizer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | CustomFilter(LancasterStemmer().stem) | CustomFilter(WordNetLemmatizer().lemmatize) | CustomFilter(WordNetLemmatizer().lemmatize, 'n')

### Rebuild schema with new text analyzer

In [27]:
mySchema_Q3 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = lancasterTokenizer))

In [28]:
# now, create the index at the path INDEX_DIR based on the new schema
INDEX_Q3 = createIndex(mySchema_Q3)

In [29]:
addFilesToIndex(INDEX_Q3, filesToIndex)

done indexing.


In [30]:
QP_Q3 = QueryParser("file_content", schema=INDEX_Q3.schema) # Replace None with your query parser for Q3
SEARCHER_Q3 = INDEX_Q3.searcher()  # Replace None with your searcher for Q3

In [31]:
# define a reader object on the index
myReader_Q3 = INDEX_Q3.reader()

In [32]:
result_Q3 = trecEval(TOPIC_FILE, QRELS_FILE, QP_Q3, SEARCHER_Q3) 
print(result_Q3)

num_ret        	1	3
num_rel        	1	5
num_rel_ret    	1	0
map            	1	0.0000
R-prec         	1	0.0000
bpref          	1	0.0000
recip_rank     	1	0.0000
ircl_prn.0.00  	1	0.0000
ircl_prn.0.10  	1	0.0000
ircl_prn.0.20  	1	0.0000
ircl_prn.0.30  	1	0.0000
ircl_prn.0.40  	1	0.0000
ircl_prn.0.50  	1	0.0000
ircl_prn.0.60  	1	0.0000
ircl_prn.0.70  	1	0.0000
ircl_prn.0.80  	1	0.0000
ircl_prn.0.90  	1	0.0000
ircl_prn.1.00  	1	0.0000
P5             	1	0.0000
P10            	1	0.0000
P15            	1	0.0000
P20            	1	0.0000
P30            	1	0.0000
P100           	1	0.0000
P200           	1	0.0000
P500           	1	0.0000
P1000          	1	0.0000
num_ret        	2	13
num_rel        	2	2
num_rel_ret    	2	1
map            	2	0.5000
R-prec         	2	0.5000
bpref          	2	0.5000
recip_rank     	2	1.0000
ircl_prn.0.00  	2	1.0000
ircl_prn.0.10  	2	1.0000
ircl_prn.0.20  	2	1.0000
ircl_prn.0.30  	2	1.0000
ircl_prn.0.40  	2	1.0000
ircl_prn.0.50  	2	1.0000
ircl_prn.0.60  	2	0.0000
ircl

In [33]:
# MAP rows of the Q3 run, listed after the Q2 rows for comparison
map_list_Q3 = [row for row in result_Q3.splitlines() if 'map' in row]

for q2_row in map_list:
    print('Q2: ' + q2_row)

print('-' * 38)

for q3_row in map_list_Q3:
    print('Q3: ' + q3_row)

Q2: map            	1	0.0000
Q2: map            	2	0.0000
Q2: map            	4	0.0312
Q2: map            	6	0.0000
Q2: map            	7	0.0000
Q2: map            	9	0.0000
Q2: map            	10	0.1667
Q2: map            	14	0.2500
Q2: map            	16	0.0000
Q2: map            	18	1.0000
Q2: map            	22	0.2000
Q2: map            	24	1.0000
Q2: map            	26	0.1111
Q2: map            	28	0.0000
Q2: map            	all	0.1971
--------------------------------------
Q3: map            	1	0.0000
Q3: map            	2	0.5000
Q3: map            	4	0.5357
Q3: map            	6	0.0000
Q3: map            	7	0.0000
Q3: map            	9	0.0588
Q3: map            	10	0.2500
Q3: map            	14	1.0000
Q3: map            	16	0.0000
Q3: map            	18	1.0000
Q3: map            	19	0.5000
Q3: map            	22	0.0357
Q3: map            	24	1.0000
Q3: map            	26	0.0771
Q3: map            	28	0.2262
Q3: map            	all	0.3456


In [34]:
#test query 
sampleQuery = QP_Q3.parse("Early Childhood Education")
sampleQueryResults = SEARCHER_Q3.search(sampleQuery, limit=None)

# inspect the result:
# for each hit print the file name, the rank and the score.
# The loop variables are named `rank` and `hit` on purpose: calling the hit
# `result` would overwrite the notebook-level `result` that holds the
# trec_eval report, breaking any later re-run of the cells that read it.
for (rank, hit) in enumerate(sampleQueryResults):
    score = sampleQueryResults.score(rank)
    fileName = os.path.basename(hit["file_path"])
    print(fileName, rank, score)

G00-75-2371200 0 17.71994603112268
G00-11-3066108 1 17.5517193874587
G00-93-3702508 2 17.251674698724507
G00-77-3295130 3 16.157671351212255
G00-93-1203370 4 16.048806873782414
G00-54-2576117 5 15.483452463739809
G00-02-0541868 6 15.358370484368413
G00-82-0211909 7 14.948996234831622
G00-78-2978026 8 14.904012791403312
G00-78-0877232 9 14.50523067274203
G00-93-4160214 10 14.262640313290104
G00-48-1527977 11 14.262640313290104
G00-81-3327332 12 14.261005184179258
G00-99-2279811 13 14.22064542658908
G00-61-3894960 14 14.16849585343095
G00-27-2159399 15 13.905323240246142
G00-28-3705847 16 13.842042049021794
G00-31-0429249 17 13.834770675706274
G00-50-3231467 18 13.784004276703303
G00-78-1531079 19 13.746114379412095
G00-30-2788847 20 13.604117991761974
G00-74-2972556 21 13.554090713277297
G00-91-3997333 22 13.279465822737585
G00-74-1394517 23 13.229918583070255
G00-49-2602614 24 13.02540025948743
G00-16-2494170 25 12.51392827477842
G00-04-3016417 26 12.407227147728996
G00-04-4166204 27 1

**G00-54-2576117 was successfully retrieved and ranked No. 5 (counting from 0) in the result.**

### Q3 (c): Provide answer to Q3 (c) here [A word stemmer and a word lemmatizer were added to the analyzer. The overall MAP improved from 0.1971 to 0.3456. Most queries improved and some were unchanged; however, queries 22 and 26 got worse. The FN situation improved for query 28, as both of its relevant files now show up in the search results. The FP issue has not improved, however.]

### Q3 (d): Provide answer to Q3 (d) here [Yes]

### Q3 (e): Provide answer to Q3 (e) here [Yes]

### Q3 (f): Provide answer to Q3 (f) here [This method definitely improves the performance of the IR system, since we can see a significant increase in MAP from 0.1971 to 0.3456. Except for 22 and 26, the queries either improve or remain the same. We could say this is at least a good direction to follow.]

## Question 4 (Graduate Students)

In [35]:
GRAD_STUDENT = True # change to True if you are a grad student

### Q4 (a): Provide answer to Q4 (a) here [To improve performance, giving a stronger weighting to rarer, more distinctive words that appear in documents can help compensate for query terms that are missing from the results. Adjusting the query to use 'OR' instead of 'AND' may help, as well as adjusting the document scoring system. The query parser was adjusted to join multi-term queries with "OR" instead of "AND". An extra weighting was also added to documents that had multiple query terms with the factory() function. The BM25 scoring method has been used for comparison.]

### Q4 (b): Write your code below

### Adding Scoring System to Search

In [36]:
queryParserList = []
bm25SearcherBList = []
bm25SearcherK1List = []

In [37]:
mySchema_Q4 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = lancasterTokenizer))

In [38]:
# now, create the index at the path INDEX_DIR based on the new schema
INDEX_Q4 = createIndex(mySchema_Q4)

In [39]:
addFilesToIndex(INDEX_Q4, filesToIndex)

done indexing.


BM25F (best matching) is a ranking function to rank matching documents according to their relevance to a given query. It is a bag-of-word retrieval function that ranks a set of documents based on the query terms appearing in each document, regardless of the inter-relationship between query terms within the document. 

In [40]:
# join query terms with 'OR' instead of 'AND' by specifying 'group' parameter in parser
# NOTE(review): factory() is given a scale of 0 here -- check the whoosh docs
# for whether a zero scale applies any bonus to documents matching several
# query terms, or behaves like a plain OrGroup.
QP_Q4 = QueryParser("file_content", schema=INDEX_Q4.schema,group=qparser.OrGroup.factory(0)) # query parser for Q4
# score with BM25F using explicit parameters B=0.75 and K1=1.5
# NOTE(review): BM25F is presumably also whoosh's default weighting, in which
# case only the parameter values differ from the Q3 searcher -- confirm.
SEARCHER_Q4 = INDEX_Q4.searcher(weighting=scoring.BM25F(B=0.75, K1=1.5))  # searcher for Q4

In [41]:
result_Q4 = trecEval(TOPIC_FILE, QRELS_FILE, QP_Q4, SEARCHER_Q4)
print(result_Q4)

num_ret        	1	482
num_rel        	1	5
num_rel_ret    	1	5
map            	1	0.0651
R-prec         	1	0.0000
bpref          	1	0.0000
recip_rank     	1	0.0500
ircl_prn.0.00  	1	0.1034
ircl_prn.0.10  	1	0.1034
ircl_prn.0.20  	1	0.1034
ircl_prn.0.30  	1	0.1034
ircl_prn.0.40  	1	0.1034
ircl_prn.0.50  	1	0.1034
ircl_prn.0.60  	1	0.1034
ircl_prn.0.70  	1	0.0460
ircl_prn.0.80  	1	0.0460
ircl_prn.0.90  	1	0.0459
ircl_prn.1.00  	1	0.0459
P5             	1	0.0000
P10            	1	0.0000
P15            	1	0.0000
P20            	1	0.0500
P30            	1	0.1000
P100           	1	0.0400
P200           	1	0.0250
P500           	1	0.0100
P1000          	1	0.0050
num_ret        	2	59
num_rel        	2	2
num_rel_ret    	2	2
map            	2	0.5357
R-prec         	2	0.5000
bpref          	2	0.5000
recip_rank     	2	1.0000
ircl_prn.0.00  	2	1.0000
ircl_prn.0.10  	2	1.0000
ircl_prn.0.20  	2	1.0000
ircl_prn.0.30  	2	1.0000
ircl_prn.0.40  	2	1.0000
ircl_prn.0.50  	2	1.0000
ircl_prn.0.60  	2	0.0714
ir

In [42]:
# MAP rows of the Q4 run, listed after the Q3 rows for comparison
map_list_Q4 = [row for row in result_Q4.splitlines() if 'map' in row]

for q3_row in map_list_Q3:
    print('Q3: ' + q3_row)

print('-' * 38)

for q4_row in map_list_Q4:
    print('Q4: ' + q4_row)

Q3: map            	1	0.0000
Q3: map            	2	0.5000
Q3: map            	4	0.5357
Q3: map            	6	0.0000
Q3: map            	7	0.0000
Q3: map            	9	0.0588
Q3: map            	10	0.2500
Q3: map            	14	1.0000
Q3: map            	16	0.0000
Q3: map            	18	1.0000
Q3: map            	19	0.5000
Q3: map            	22	0.0357
Q3: map            	24	1.0000
Q3: map            	26	0.0771
Q3: map            	28	0.2262
Q3: map            	all	0.3456
--------------------------------------
Q4: map            	1	0.0651
Q4: map            	2	0.5357
Q4: map            	4	0.5583
Q4: map            	6	0.0278
Q4: map            	7	0.1778
Q4: map            	9	0.0625
Q4: map            	10	0.2500
Q4: map            	14	1.0000
Q4: map            	16	0.1673
Q4: map            	18	1.0000
Q4: map            	19	0.5000
Q4: map            	22	0.0357
Q4: map            	24	1.0000
Q4: map            	26	0.1083
Q4: map            	28	0.2262
Q4: map            	all	0.3810


### Q4 (c): Provide answer to Q4 (c) here [Instead of parsing queries using 'AND', the 'OR' parsing method was used in this section. BM25, a different scoring method, has also been used for comparison. B and K1 are two scaling-factor parameters of the BM25 scoring method. Empirically, B=0.75 and K1 is between 1.2 and 2. In this section, K1=1.5. The overall performance improved, since MAP increased from 0.3456 to 0.3810. The FP/FN issue has not improved for case 28, but has possibly improved for other cases.]

### Q4 (d): Provide answer to Q4 (d) here [Yes]

### Q4 (e): Provide answer to Q4 (e) here [Yes]

### Q4 (f): Provide answer to Q4 (f) here [Parsing queries with 'OR' and using the BM25F scoring method definitely improved the IR performance even further. The overall MAP increased from 0.3456 to 0.3810, and at the same time no zero MAP appears in the result any more and most of the queries got better. In conclusion, this method is good.]

## Validation

In [43]:
# Run the following cells to make sure your code returns the correct value types

In [44]:
from whoosh.index import FileIndex
from whoosh.qparser import QueryParser
from whoosh.searching import Searcher
import os.path

### Q2 Validation

In [45]:
assert(isinstance(INDEX_Q2, FileIndex)), "Index Type"
assert(isinstance(QP_Q2, QueryParser)), "Query Parser Type"
assert(isinstance(SEARCHER_Q2, Searcher)), "Searcher Type"
print("Q2 Types Validated")

Q2 Types Validated


### Q3 Validation

In [46]:
assert(isinstance(INDEX_Q3, FileIndex)), "Index Type"
assert(isinstance(QP_Q3, QueryParser)), "Query Parser Type"
assert(isinstance(SEARCHER_Q3, Searcher)), "Searcher Type"
print("Q3 Types Validated")

Q3 Types Validated


### Q4 Validation (Graduate Students)

In [47]:
assert((not GRAD_STUDENT) or isinstance(INDEX_Q4, FileIndex)), "Index Type"
assert((not GRAD_STUDENT) or isinstance(QP_Q4, QueryParser)), "Query Parser Type"
assert((not GRAD_STUDENT) or isinstance(SEARCHER_Q4, Searcher)), "Searcher Type"
print("Q4 Types Validated")

Q4 Types Validated
