In [238]:
from preprocessing import *
from vocabulary_and_postings import *
from vectorization import *
from scoring import *

In [239]:
DO_LEMMATIZE = True
DO_REMOVE_SW = True

### LOAD DATASET

In [169]:
### Processing DOCUMENTS
doc_set = {}
doc_id = ""
doc_text = ""
with open('CISI.ALL') as f:
    lines = ""
    for l in f.readlines():
        lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
    lines = lines.lstrip("\n").split("\n")
doc_count = 0
for l in lines:
    if l.startswith(".I"):
        doc_id = int(l.split(" ")[1].strip())-1
    elif l.startswith(".X"):
        doc_set[doc_id] = doc_text.lstrip(" ")
        doc_id = ""
        doc_text = ""
    else:
        doc_text += l.strip()[3:] + " " # The first 3 characters of a line can be ignored.    

        
### Processing QUERIES
with open('CISI.QRY') as f:
    lines = ""
    for l in f.readlines():
        lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
    lines = lines.lstrip("\n").split("\n")
    
qry_set = {}
qry_id = ""
for l in lines:
    if l.startswith(".I"):
        qry_id = int(l.split(" ")[1].strip()) -1
    elif l.startswith(".W"):
        qry_set[qry_id] = l.strip()[3:]
        qry_id = ""

### Processing QRELS
rel_set = {}
with open('CISI.REL') as f:
    for l in f.readlines():
        qry_id = int(l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]) -1
        doc_id = int(l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1])-1
        if qry_id in rel_set:
            rel_set[qry_id].append(doc_id)
        else:
            rel_set[qry_id] = []
            rel_set[qry_id].append(doc_id)
            
### Prepocess DOCS
processed_docs_set = {}
for docId, doc in doc_set.items():
    processed_docs_set[docId] = preprocess_string(doc, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    
### Preprocess QUERY
processed_qry_set = {}
for qryId, qry in qry_set.items():
    processed_qry_set[qryId] = preprocess_string(qry, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)

### VOCAB AND POSTINGS

In [170]:
# create vocab and postings
v, p = build_vocab_and_postings(processed_docs_set)
# create vocab with idf
v_idf = build_vocab_with_idf(v, p, processed_docs_set)

In [171]:
print(len(v))

6773


### VECTORIZE DOCS

In [172]:
# vectorize_docs
vector_docs = {docID : vectorize_doc(docID, processed_docs_set, p, v_idf) for docID in processed_docs_set.keys()}

### VECTORIZE QUERIES

In [173]:
vector_queries = {qId : vectorize_query(q, v) for qId, q in processed_qry_set.items()}

### EVALUATION

In [32]:
MAP0 = MAP(vector_queries, vector_docs, rel_set)
print(MAP0)

0.2620603235011236


In [33]:
MRP = Mean_R_precision(vector_queries, vector_docs, rel_set)
print(MRP)

0.20627801438639806


### PROPOSE DOCS

In [180]:
def propose_docs(query, k, vector_docs, v):
    query = preprocess_string(query, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    vq = vectorize_query(query, v)
    scores = cosine_sim_score(vector_docs.values(), vq)
    ind = extract_top_docs_index(scores, k)
    return ind

def propose_docs_with_vector_query(vq, k, vector_docs, v):
    scores = cosine_sim_score(vector_docs.values(), vq)
    ind = extract_top_docs_index(scores, k)
    return ind

In [175]:
idx = 0
query = qry_set[idx]

print(f'Query = {query}\n')

proposed_idx = propose_docs(query, k, vector_docs, v)
print(f"Proposed documents indices = {proposed_idx}\n")

print(f'Relevant docs indices = {rel_set[idx]}\n')

print(f"Precision at k10 = {len(set(rel_set[idx]).intersection(set(proposed_idx))) / 10}")

Query = What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?

Proposed documents indices = [ 428  446  314  721  448  602  564  174 1280 1418  588]

Relevant docs indices = [27, 34, 37, 41, 42, 51, 64, 75, 85, 149, 188, 191, 192, 194, 214, 268, 290, 319, 428, 464, 465, 481, 482, 509, 523, 540, 575, 581, 588, 602, 649, 679, 710, 721, 725, 782, 812, 819, 867, 868, 893, 1161, 1163, 1194, 1195, 1280]

Precision at k10 = 0.5


In [183]:
def propose_docs_with_pseudo_relevance(query, k, vector_docs, postings, v, v_idf, processed_docs_set):
    
    TOP_K = 20
    TOP_TERMS = 30
    
    ind = propose_docs(query, TOP_K, vector_docs, v)
    
    top_docs = {i : processed_docs_set[i] for i in ind}
    
    terms_tf_idf = [0] * len(v)
    
    for docId, doc_text in top_docs.items():
        for tk in doc_text:
            new_tf = postings[tk][docId]
            tokenId, new_idf = v_idf[tk]
            new_tf_idf = new_tf * new_idf
            
            if terms_tf_idf[tokenId] < new_tf_idf:
                terms_tf_idf[tokenId] = new_tf_idf
            
          # if tokenId in terms_tf_idf:
          #     if new_tf_idf > terms_tf_idf[tk]:
          #         terms_tf_idf[tk] = new_tf_idf
          # else:
          #     terms_tf_idf[tk] = new_tf_idf
          #     
            
    terms_tf_idf = np.asarray(terms_tf_idf)
    ind_best_tokens = terms_tf_idf.argsort()[::-1][:TOP_TERMS]
    
    tokens = [v[i] for i in ind_best_tokens]
    
    extra_terms = ' '.join(tokens)
    new_query = query + ' ' +extra_terms
    
    new_ind = propose_docs(new_query, k, vector_docs, v)
    
    return new_ind

In [228]:
def propose_docs_with_feedback(query, k, vector_docs, v, alpha = 1, beta = 0.75, gamma = 0.15):
    # proposal docs
    ind = propose_docs(query, k, vector_docs, v)
    top_docs = {i : processed_docs_set[i] for i in ind}
    
    # ask user for feedback:
    print("Insert feedback for every proposed document (1 = relevant, 0 = non relevant):\n")
    r_doc_idx = []
    nr_doc_idx = []
    for docId in top_docs.keys():
        is_relevant = int(input(f'Is document {docId} relevant?'))
        print(is_relevant)
        if is_relevant == 1:
            r_doc_idx.append(docId)
        elif is_relevant == 0:
            nr_doc_idx.append(docId)
        else:
            raise Exception
            
    # modify query
    query = preprocess_string(query, remove_stop_words=DO_REMOVE_SW, lemmatize=DO_LEMMATIZE)
    vq = vectorize_query(query, v)  
    
    rd = np.array([vector_docs[i] for i in r_doc_idx])
    nrd = np.array([vector_docs[i] for i in nr_doc_idx])
    
    centroid_r = np.zeros_like(vq)
    centroid_nr = np.zeros_like(vq)
    
    for i in r_doc_idx:
        centroid_r += vector_docs[i]
    for i in nr_doc_idx:
        centroid_nr += vector_docs[i]
        
    centroid_r = centroid_r / len(r_doc_idx)
    centroid_nr = centroid_nr / len(nr_doc_idx)
    
    new_query = alpha * vq + beta * centroid_r - gamma * centroid_nr
    new_query = new_query / np.linalg.norm(new_query)
    
    ind = propose_docs_with_vector_query(new_query, k, vector_docs, v)
    
    return ind

In [233]:
k = 10
idx = 1

query = qry_set[idx]

print(f'Relevant docs indices = {rel_set[idx]}\n')

proposed_idx = propose_docs(query, k, vector_docs, v)
print(f"Proposed documents indices = {proposed_idx}")
print(f"Precision at k{k} = {len(set(rel_set[idx]).intersection(set(proposed_idx))) / k}\n")

ind_pseduo_rel = propose_docs_with_pseudo_relevance(query, k, vector_docs, p, v, v_idf, processed_docs_set)
print(f"Proposed documents indices psuedo_rel = {ind_pseduo_rel}")
print(f"Precision at k{k} = {len(set(rel_set[idx]).intersection(set(ind_pseduo_rel))) / k}")

ind_feedback = propose_docs_with_feedback(query, k, vector_docs, v)
print(f"Proposed documents indices feedback = {ind_feedback}")
print(f"Precision at k{k} = {len(set(rel_set[idx]).intersection(set(ind_feedback))) / k}")

Relevant docs indices = [28, 67, 196, 212, 213, 308, 318, 323, 428, 498, 635, 668, 669, 673, 689, 691, 694, 699, 703, 708, 719, 730, 732, 737, 739, 1135]

Proposed documents indices = [1154 1137 1135  564  174 1157  531  308  789 1395]
Precision at k10 = 0.2

Proposed documents indices psuedo_rel = [ 610  561 1326  487 1395  564  308  445  658 1137]
Precision at k10 = 0.1
Insert feedback for every proposed document (1 = relevant, 0 = non relevant):



Is document 1154 relevant? 0


0


Is document 1137 relevant? 0


0


Is document 1135 relevant? 1


1


Is document 564 relevant? 0


0


Is document 174 relevant? 0


0


Is document 1157 relevant? 0


0


Is document 531 relevant? 0


0


Is document 308 relevant? 1


1


Is document 789 relevant? 0


0


Is document 1395 relevant? 0


0
Proposed documents indices feedback = [1135  308  564 1326  174  482 1154  178   71  450]
Precision at k10 = 0.2
