In [1]:
# Project-local helper modules. They are star-imported, so the origin of each
# name is not visible here: load_dataset, print_eval and np are used further
# down without an explicit import and presumably come from these modules
# (TODO confirm which module exports each one).
# Keep the order unchanged: with star imports a later module silently shadows
# same-named symbols bound by an earlier one.
from preprocessing import *
from vocabulary_and_postings import *
from vectorization import *
from scoring import *
from vect_answer import *
from fast_answer import *
from evaluation import *

import warnings
# Install a global "ignore" filter: no warning is shown from this point on,
# presumably to keep the evaluation output compact.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the helper modules - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

### LOAD DATASET

In [2]:
# Load the "CISI" collection; load_dataset yields three objects, in this order:
#   doc_set - the documents (sized container),
#   qry_set - the queries, keyed by query id,
#   rel_set - relevance judgements, keyed by query id; each value is a sized
#             collection (presumably the relevant document ids - TODO confirm).
(
    doc_set,
    qry_set,
    rel_set,
) = load_dataset("CISI")

#### DATASET INFO

In [3]:
# Collection size: how many documents and how many queries were loaded.
print(f' # N Documents = {len(doc_set)}\n # N Queries = {len(qry_set)}')

# Number of relevance judgements attached to each judged query, built once and
# reused for the three summary statistics below.
rel_counts = [len(judged) for judged in rel_set.values()]

avg_rel = np.mean(rel_counts)
print(f' # Avg number of relevant docs per query = {avg_rel:.2f}' )

min_rel = int(np.min(rel_counts))
print(f' # Min number of relevant docs per query = {min_rel}' )

max_rel = int(np.max(rel_counts))
print(f' # Max number of relevant docs per query = {max_rel}' )

# Query ids that have no entry at all in the relevance judgements.
unjudged_queries = [qid for qid in qry_set.keys() if qid not in rel_set.keys()]
print(f' # Query without relevant docs = {unjudged_queries}' )

 # N Documents = 1460
 # N Queries = 112
 # Avg number of relevant docs per query = 40.97
 # Min number of relevant docs per query = 1
 # Max number of relevant docs per query = 155
 # Query without relevant docs = [35, 37, 39, 46, 47, 50, 52, 58, 59, 62, 63, 67, 69, 71, 72, 73, 74, 76, 77, 79, 82, 84, 85, 86, 87, 88, 90, 92, 93, 102, 104, 105, 106, 107, 109, 111]


### STOPWORDS NOT REMOVED, NO LEMMATIZATION

In [4]:
# Configuration for this run: stopwords are kept and tokens are not lemmatized.
DO_LEMMATIZE, DO_REMOVE_SW = False, False

# Evaluate the collection under this configuration; print_eval reports the
# vocabulary size and the MAP / MRP / MK10 figures of every retrieval variant.
print_eval(
    doc_set=doc_set,
    qry_set=qry_set,
    rel_set=rel_set,
    remove_sw=DO_REMOVE_SW,
    lemmatize=DO_LEMMATIZE,
)

Remove stopwords = False, lemmatize = False
Vocabulary contains 10759 tokens

Standard vect | MAP = 0.21 	 MRP = 0.16 	 MK10 = 0.16
Standard fast | MAP = 0.21 	 MRP = 0.16 	 MK10 = 0.16


Pseu exp vect | MAP = 0.19 	 MRP = 0.15 	 MK10 = 0.15
Pseu exp fast | MAP = 0.20 	 MRP = 0.17 	 MK10 = 0.17


Pseu mov vect | MAP = 0.21 	 MRP = 0.17 	 MK10 = 0.16
Pseu mov fast | MAP = 0.18 	 MRP = 0.16 	 MK10 = 0.14


Feed mov vect | MAP = 0.43 	 MRP = 0.25 	 MK10 = 0.25
Feed mov fast | MAP = 0.38 	 MRP = 0.21 	 MK10 = 0.21


Feed exp vect | MAP = 0.38 	 MRP = 0.21 	 MK10 = 0.20
Feed exp fast | MAP = 0.36 	 MRP = 0.20 	 MK10 = 0.20


### STOPWORDS REMOVED, NO LEMMATIZATION

In [5]:
# Configuration for this run: stopwords are removed, tokens are not lemmatized.
DO_LEMMATIZE, DO_REMOVE_SW = False, True

# Evaluate the collection under this configuration; print_eval reports the
# vocabulary size and the MAP / MRP / MK10 figures of every retrieval variant.
print_eval(
    doc_set=doc_set,
    qry_set=qry_set,
    rel_set=rel_set,
    remove_sw=DO_REMOVE_SW,
    lemmatize=DO_LEMMATIZE,
)

Remove stopwords = True, lemmatize = False
Vocabulary contains 10639 tokens

Standard vect | MAP = 0.23 	 MRP = 0.18 	 MK10 = 0.18
Standard fast | MAP = 0.25 	 MRP = 0.19 	 MK10 = 0.19


Pseu exp vect | MAP = 0.22 	 MRP = 0.18 	 MK10 = 0.17
Pseu exp fast | MAP = 0.14 	 MRP = 0.10 	 MK10 = 0.10


Pseu mov vect | MAP = 0.22 	 MRP = 0.19 	 MK10 = 0.18
Pseu mov fast | MAP = 0.23 	 MRP = 0.19 	 MK10 = 0.18


Feed mov vect | MAP = 0.45 	 MRP = 0.27 	 MK10 = 0.26
Feed mov fast | MAP = 0.43 	 MRP = 0.24 	 MK10 = 0.24


Feed exp vect | MAP = 0.40 	 MRP = 0.23 	 MK10 = 0.22
Feed exp fast | MAP = 0.41 	 MRP = 0.24 	 MK10 = 0.23


### STOPWORDS REMOVED AND LEMMATIZATION

In [6]:
# Configuration for this run: stopwords are removed and tokens are lemmatized.
DO_LEMMATIZE, DO_REMOVE_SW = True, True

# Evaluate the collection under this configuration; print_eval reports the
# vocabulary size and the MAP / MRP / MK10 figures of every retrieval variant.
print_eval(
    doc_set=doc_set,
    qry_set=qry_set,
    rel_set=rel_set,
    remove_sw=DO_REMOVE_SW,
    lemmatize=DO_LEMMATIZE,
)

Remove stopwords = True, lemmatize = True
Vocabulary contains 6773 tokens

Standard vect | MAP = 0.26 	 MRP = 0.21 	 MK10 = 0.20
Standard fast | MAP = 0.28 	 MRP = 0.21 	 MK10 = 0.20


Pseu exp vect | MAP = 0.23 	 MRP = 0.18 	 MK10 = 0.18
Pseu exp fast | MAP = 0.24 	 MRP = 0.18 	 MK10 = 0.17


Pseu mov vect | MAP = 0.26 	 MRP = 0.22 	 MK10 = 0.21
Pseu mov fast | MAP = 0.24 	 MRP = 0.20 	 MK10 = 0.18


Feed mov vect | MAP = 0.48 	 MRP = 0.29 	 MK10 = 0.29
Feed mov fast | MAP = 0.42 	 MRP = 0.25 	 MK10 = 0.25


Feed exp vect | MAP = 0.42 	 MRP = 0.24 	 MK10 = 0.24
Feed exp fast | MAP = 0.40 	 MRP = 0.24 	 MK10 = 0.23
