In [1]:
from preprocessing import *
from vocabulary_and_postings import *
from vectorization import *
from scoring import *
from proposal import *

### LOAD DATASET

In [2]:
def _read_field_lines(path):
    """Read a CISI file and fold continuation lines into their field line.

    A physical line starting with "." opens a new field (".I", ".W", ...);
    any other line is appended, separated by a single space, to the field
    that precedes it.  Returns a list with one string per field.
    """
    pieces = []
    with open(path) as f:
        for raw in f:
            # "\n" marks a field boundary, " " glues a continuation line.
            separator = "\n" if raw.startswith(".") else " "
            pieces.append(separator + raw.strip())
    # Collect-then-join avoids rebuilding the whole text on every line.
    return "".join(pieces).lstrip("\n").split("\n")


### Processing DOCUMENTS
# Ids are shifted to be 0-based.  A document is stored once its ".X" field
# is reached; every field seen before that contributes its text.
doc_set = {}
doc_id = ""
doc_text = ""
for field in _read_field_lines('CISI.ALL'):
    if field.startswith(".I"):
        doc_id = int(field.split()[1]) - 1
    elif field.startswith(".X"):
        doc_set[doc_id] = doc_text.lstrip(" ")
        doc_id = ""
        doc_text = ""
    else:
        doc_text += field.strip()[3:] + " " # The first 3 characters of a line can be ignored.


### Processing QUERIES
# Only the ".W" field of a query is kept; ids are shifted to be 0-based.
qry_set = {}
qry_id = ""
for field in _read_field_lines('CISI.QRY'):
    if field.startswith(".I"):
        qry_id = int(field.split()[1]) - 1
    elif field.startswith(".W"):
        qry_set[qry_id] = field.strip()[3:]
        qry_id = ""

### Processing QRELS
# Maps a 0-based query id to the list of its 0-based relevant document ids.
rel_set = {}
with open('CISI.REL') as f:
    for raw in f:
        # The first tab-separated column holds "<query id> ... <doc id>".
        pair = raw.split("\t")[0].split()
        if not pair:
            # Blank line: nothing to record.
            continue
        qry_id = int(pair[0]) - 1
        doc_id = int(pair[-1]) - 1
        rel_set.setdefault(qry_id, []).append(doc_id)

#### DATASET INFO

In [3]:
# Summarise everything loaded above, not only the documents.
print(f"Read {len(doc_set)} documents, {len(qry_set)} queries, {len(rel_set)} queries with relevance judgements")

Read 1460 documents, 


In [14]:
# EVALUATION
def compute_MAP(relevance_set, proposed_set):
    """
    Score ranked proposals against relevance judgements, averaged over queries.

    For a query with m relevant documents, the fraction of distinct relevant
    documents among the top-k proposals is computed for every cut-off
    k = 1..m.  The m fractions are averaged into a per-query score, and the
    per-query scores are averaged over all scored queries.

    NOTE(review): the fraction is accumulated at every rank up to m, not only
    at the ranks that hold a relevant document as in the textbook definition
    of average precision.  Kept unchanged so previously reported numbers stay
    comparable; confirm which definition is intended.

    Parameters
    ----------
    relevance_set : dict
        Maps a query id to the list of its relevant document ids.
    proposed_set : dict
        Maps a query id to the ranked list of proposed document ids.

    Returns
    -------
    float
        The mean score.  Queries without relevant documents are skipped;
        0.0 is returned when no query can be scored.

    Raises
    ------
    KeyError
        If a scored query id is missing from proposed_set.
    """
    total = 0
    scored_queries = 0

    for idx, relevant_docs in relevance_set.items():
        mj = len(relevant_docs)
        if mj == 0:
            # Nothing to retrieve for this query; skipping avoids a division by zero.
            continue
        scored_queries += 1

        relevant = set(relevant_docs)
        ranked = proposed_set[idx][:mj]
        found = set()  # distinct relevant docs seen so far in the ranking
        AP = 0
        for k in range(1, mj + 1):
            # A ranking shorter than mj simply stops contributing new hits.
            if k <= len(ranked) and ranked[k - 1] in relevant:
                found.add(ranked[k - 1])
            AP += len(found) / k
        total += AP / mj

    return total / scored_queries if scored_queries else 0.0

### STOPWORDS NOT REMOVED, NO LEMMATIZATION

In [15]:
DO_LEMMATIZE = False
DO_REMOVE_SW = False

# NOTE(review): this cell is repeated further down with other flag values;
# a shared function taking the two flags would remove the duplication.

### Preprocess DOCS
processed_docs_set = {
    docId: preprocess_string(doc, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    for docId, doc in doc_set.items()
}

### Preprocess QUERY
# Not consumed by the proposal calls below: they receive the raw query text
# from qry_set together with the two flags.
processed_qry_set = {
    qryId: preprocess_string(qry, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    for qryId, qry in qry_set.items()
}

# Create vocabulary list and postings
v, p = build_vocab_and_postings(processed_docs_set)
# Create vocabulary with tokenId and inverse document frequency (idf)
v_idf = build_vocab_with_idf(v, p, processed_docs_set)
print(f"Vocabulary contains {len(v)} tokens")

# VECTORIZE doc set
vector_docs = {docID: vectorize_doc(docID, processed_docs_set, p, v_idf) for docID in processed_docs_set}

# Every strategy below is evaluated on the queries that have relevance
# judgements.  The second argument is the number of relevant documents of the
# query -- presumably the number of documents to propose; confirm in proposal.
# One MAP value is printed per strategy, in the order of the sections.

# STANDARD PROPOSAL
proposed_docs = {
    idx: propose_docs(qry_set[idx], len(relevant_docs), vector_docs, v, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, proposed_docs))

# PSEUDO RELEVANCE FEEDBACK
pseudo_proposed_docs = {
    idx: propose_docs_with_pseudo_relevance(qry_set[idx], len(relevant_docs), vector_docs, p, v, v_idf, processed_docs_set, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, pseudo_proposed_docs))

# PSEUDO RELEVANCE MOVING QUERY
# Stored under its own name so the feedback results above are not overwritten.
moving_query_proposed_docs = {
    idx: propose_docs_with_pseudo_relevance_moving_query(qry_set[idx], len(relevant_docs), vector_docs, v, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, moving_query_proposed_docs))

# SMART FEEDBACK
# NOTE(review): the judged relevant documents are passed in here, so this
# score may not be comparable with the strategies above -- confirm how
# propose_smart_feedback uses them.
smart_feed_docs = {
    idx: propose_smart_feedback(qry_set[idx], len(relevant_docs), vector_docs, v, relevant_docs, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}

print(compute_MAP(rel_set, smart_feed_docs))

Vocabulary contains 10759 tokens
0.21419977478749802
0.19005105903050531
0.21346134320237586
0.42956595427696714


### STOPWORDS REMOVED, NO LEMMATIZATION

In [16]:
DO_LEMMATIZE = False
DO_REMOVE_SW = True

# NOTE(review): this cell is a copy of the other experiment cells with
# different flag values; a shared function taking the two flags would remove
# the duplication.

### Preprocess DOCS
processed_docs_set = {
    docId: preprocess_string(doc, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    for docId, doc in doc_set.items()
}

### Preprocess QUERY
# Not consumed by the proposal calls below: they receive the raw query text
# from qry_set together with the two flags.
processed_qry_set = {
    qryId: preprocess_string(qry, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    for qryId, qry in qry_set.items()
}

# Create vocabulary list and postings
v, p = build_vocab_and_postings(processed_docs_set)
# Create vocabulary with tokenId and inverse document frequency (idf)
v_idf = build_vocab_with_idf(v, p, processed_docs_set)
print(f"Vocabulary contains {len(v)} tokens")

# VECTORIZE doc set
vector_docs = {docID: vectorize_doc(docID, processed_docs_set, p, v_idf) for docID in processed_docs_set}

# Every strategy below is evaluated on the queries that have relevance
# judgements.  The second argument is the number of relevant documents of the
# query -- presumably the number of documents to propose; confirm in proposal.
# One MAP value is printed per strategy, in the order of the sections.

# STANDARD PROPOSAL
proposed_docs = {
    idx: propose_docs(qry_set[idx], len(relevant_docs), vector_docs, v, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, proposed_docs))

# PSEUDO RELEVANCE FEEDBACK
pseudo_proposed_docs = {
    idx: propose_docs_with_pseudo_relevance(qry_set[idx], len(relevant_docs), vector_docs, p, v, v_idf, processed_docs_set, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, pseudo_proposed_docs))

# PSEUDO RELEVANCE MOVING QUERY
# Stored under its own name so the feedback results above are not overwritten.
moving_query_proposed_docs = {
    idx: propose_docs_with_pseudo_relevance_moving_query(qry_set[idx], len(relevant_docs), vector_docs, v, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, moving_query_proposed_docs))

# SMART FEEDBACK
# NOTE(review): the judged relevant documents are passed in here, so this
# score may not be comparable with the strategies above -- confirm how
# propose_smart_feedback uses them.
smart_feed_docs = {
    idx: propose_smart_feedback(qry_set[idx], len(relevant_docs), vector_docs, v, relevant_docs, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}

print(compute_MAP(rel_set, smart_feed_docs))

Vocabulary contains 10639 tokens
0.22966438580721393
0.21893699451188073
0.2239540214499538
0.4500351603167675


### STOPWORDS REMOVED AND LEMMATIZATION

In [17]:
DO_LEMMATIZE = True
DO_REMOVE_SW = True

# NOTE(review): this cell is a copy of the other experiment cells with
# different flag values; a shared function taking the two flags would remove
# the duplication.

### Preprocess DOCS
processed_docs_set = {
    docId: preprocess_string(doc, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    for docId, doc in doc_set.items()
}

### Preprocess QUERY
# Not consumed by the proposal calls below: they receive the raw query text
# from qry_set together with the two flags.
processed_qry_set = {
    qryId: preprocess_string(qry, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    for qryId, qry in qry_set.items()
}

# Create vocabulary list and postings
v, p = build_vocab_and_postings(processed_docs_set)
# Create vocabulary with tokenId and inverse document frequency (idf)
v_idf = build_vocab_with_idf(v, p, processed_docs_set)
print(f"Vocabulary contains {len(v)} tokens")

# VECTORIZE doc set
vector_docs = {docID: vectorize_doc(docID, processed_docs_set, p, v_idf) for docID in processed_docs_set}

# Every strategy below is evaluated on the queries that have relevance
# judgements.  The second argument is the number of relevant documents of the
# query -- presumably the number of documents to propose; confirm in proposal.
# One MAP value is printed per strategy, in the order of the sections.

# STANDARD PROPOSAL
proposed_docs = {
    idx: propose_docs(qry_set[idx], len(relevant_docs), vector_docs, v, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, proposed_docs))

# PSEUDO RELEVANCE FEEDBACK
pseudo_proposed_docs = {
    idx: propose_docs_with_pseudo_relevance(qry_set[idx], len(relevant_docs), vector_docs, p, v, v_idf, processed_docs_set, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, pseudo_proposed_docs))

# PSEUDO RELEVANCE MOVING QUERY
# Stored under its own name so the feedback results above are not overwritten.
moving_query_proposed_docs = {
    idx: propose_docs_with_pseudo_relevance_moving_query(qry_set[idx], len(relevant_docs), vector_docs, v, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}
print(compute_MAP(rel_set, moving_query_proposed_docs))

# SMART FEEDBACK
# NOTE(review): the judged relevant documents are passed in here, so this
# score may not be comparable with the strategies above -- confirm how
# propose_smart_feedback uses them.
smart_feed_docs = {
    idx: propose_smart_feedback(qry_set[idx], len(relevant_docs), vector_docs, v, relevant_docs, DO_REMOVE_SW, DO_LEMMATIZE)
    for idx, relevant_docs in rel_set.items()
}

print(compute_MAP(rel_set, smart_feed_docs))

Vocabulary contains 6773 tokens
0.2620603235011236
0.22573472280367513
0.2601768574580408
0.48041901242368523
