# Indexing & Document Retrieval

Imports

In [1]:
# import
from sklearn.metrics.pairwise import *
from sklearn.feature_extraction.text import *
import numpy as np

Preparing corpus

In [2]:
# Build one list holding every text: the 1400 documents first (./d/1.txt ..
# ./d/1400.txt), followed by the 225 queries (./q/1.txt .. ./q/225.txt).
# Later cells rely on this ordering: the last 225 entries are the queries.
corpus = []
for d in range(1400):
    # `with` closes each file as soon as it has been read, instead of leaving
    # the handle open until garbage collection.
    with open("./d/"+str(d+1)+".txt") as f:
        corpus.append(f.read())
for q in range(225):
    with open("./q/"+str(q+1)+".txt") as f:
        corpus.append(f.read())

Prepare the vectorizers (binary, term-frequency, tf-idf) and compute the cosine similarity and the Euclidean distance between each query and every document.

In [3]:
# Keyword options common to all three vectorizers.
shared_opts = dict(input=u'content', encoding=u'utf-8', decode_error=u'strict',
                   stop_words=u'english')

# One vectorizer per term-weighting scheme: binary presence, raw counts, tf-idf.
vectorizer_binary = CountVectorizer(binary=True, **shared_opts)
vectorizer_tf = CountVectorizer(binary=False, **shared_opts)
vectorizer_tfidf = TfidfVectorizer(use_idf=True, **shared_opts)

# Each vectorizer is fitted on the whole corpus (documents and queries together).
term_matrix_binary = vectorizer_binary.fit_transform(corpus)
term_matrix_tf = vectorizer_tf.fit_transform(corpus)
term_matrix_tfidf = vectorizer_tfidf.fit_transform(corpus)

# The last 225 rows of every matrix are compared against all the rows before them.
first_query_row = len(corpus) - 225
query_rows = slice(first_query_row, None)
doc_rows = slice(0, first_query_row)

# Cosine similarity, one row per query and one column per document.
cos_sim_binary = np.array(cosine_similarity(term_matrix_binary[query_rows],
                                            term_matrix_binary[doc_rows]))
cos_sim_tf = np.array(cosine_similarity(term_matrix_tf[query_rows],
                                        term_matrix_tf[doc_rows]))
cos_sim_tfidf = np.array(cosine_similarity(term_matrix_tfidf[query_rows],
                                           term_matrix_tfidf[doc_rows]))

# Euclidean distances with the same layout (the names say "sim", the values
# come from euclidean_distances).
euc_sim_binary = np.array(euclidean_distances(term_matrix_binary[query_rows],
                                              term_matrix_binary[doc_rows]))
euc_sim_tf = np.array(euclidean_distances(term_matrix_tf[query_rows],
                                          term_matrix_tf[doc_rows]))
euc_sim_tfidf = np.array(euclidean_distances(term_matrix_tfidf[query_rows],
                                             term_matrix_tfidf[doc_rows]))

Thresholds: the number of top-ranked documents to return per query in order to maximize the F-measure, one threshold per measure.
These thresholds were found experimentally for the given dataset.

In [4]:
# Number of top-ranked documents kept per query, one cut-off per
# (measure, weighting) combination.

# Cosine-based rankings: binary, term-frequency, tf-idf.
cos_sim_binary_threshold, cos_sim_tf_threshold, cos_sim_tfidf_threshold = 17, 15, 14

# Euclidean-based rankings: binary, term-frequency, tf-idf.
euc_sim_binary_threshold, euc_sim_tf_threshold, euc_sim_tfidf_threshold = 16, 19, 6

Function for computing precision, recall and F-measure.
Inputs: (documents retrieved for the given query, relevant documents for the given query)

In [5]:
def get_prec_rec_F_meas(best_docs, relative_docs):
    """Compute precision, recall and F-measure for one query.

    Parameters
    ----------
    best_docs : sequence
        Identifiers of the documents that were retrieved for the query.
    relative_docs : sequence
        Identifiers of the documents that are relevant to the query.

    Returns
    -------
    dict
        ``{"prec": ..., "rec": ..., "F_meas": ...}``. A measure whose
        denominator would be zero (nothing retrieved, nothing relevant, or
        no overlap at all) is reported as 0.
    """
    # Set membership is O(1); the list is only walked once to build it.
    relevant = set(relative_docs)
    tp = sum(1 for doc in best_docs if doc in relevant)

    # tp + fp == number retrieved, tp + fn == number relevant.
    n_retrieved = len(best_docs)
    n_relevant = len(relative_docs)

    # Explicit guards instead of catching ZeroDivisionError.
    prec = tp / float(n_retrieved) if n_retrieved else 0.0
    rec = tp / float(n_relevant) if n_relevant else 0.0
    if prec + rec > 0:
        F_measure = 2 * (prec * rec) / (prec + rec)
    else:
        F_measure = 0.0

    return {"prec": prec, "rec": rec, "F_meas": F_measure}

Compute precision, recall, F-measure for every given query.

In [6]:
# One row per query: 6 F-measures, then 6 precisions, then 6 recalls, each
# group ordered as cos-binary, cos-tf, cos-tfidf, euc-binary, euc-tf, euc-tfidf.
results = []

for i in range(225):
    # Relevance judgements for query i+1, one document id per line.
    # split() keeps the last id even when the file lacks a trailing newline
    # (split('\n')[:-1] would silently drop it); `with` closes the handle.
    with open("./r/"+str(i+1)+".txt") as f:
        relative_docs = [int(j) for j in f.read().split()]

    # argsort() returns 0-based column indices; column j was built from
    # ./d/<j+1>.txt, hence the +1 to obtain document numbers.
    # Cosine similarity: larger means closer, so rank in descending order.
    cos_sim_bin_best_docs = (cos_sim_binary[i].argsort()[::-1]+1)[:cos_sim_binary_threshold]
    cos_sim_tf_best_docs = (cos_sim_tf[i].argsort()[::-1]+1)[:cos_sim_tf_threshold]
    cos_sim_tfidf_best_docs = (cos_sim_tfidf[i].argsort()[::-1]+1)[:cos_sim_tfidf_threshold]

    # The euc_* matrices hold Euclidean *distances*: smaller means closer, so
    # the best documents come first in ascending order. (Ranking them in
    # descending order would return the farthest documents.)
    # NOTE(review): the euc_* thresholds were chosen experimentally; confirm
    # they are still the F-measure-optimal cut-offs for this ordering.
    euc_sim_bin_best_docs = (euc_sim_binary[i].argsort()+1)[:euc_sim_binary_threshold]
    euc_sim_tf_best_docs = (euc_sim_tf[i].argsort()+1)[:euc_sim_tf_threshold]
    euc_sim_tfidf_best_docs = (euc_sim_tfidf[i].argsort()+1)[:euc_sim_tfidf_threshold]

    # Evaluate the six rankings in the fixed column order described above.
    tmp_res = [
        get_prec_rec_F_meas(best_docs, relative_docs)
        for best_docs in (
            cos_sim_bin_best_docs,
            cos_sim_tf_best_docs,
            cos_sim_tfidf_best_docs,
            euc_sim_bin_best_docs,
            euc_sim_tf_best_docs,
            euc_sim_tfidf_best_docs,
        )
    ]

    # Flatten to 18 values: all F-measures, then all precisions, then all recalls.
    results.append(np.array([
        res[key]
        for key in ("F_meas", "prec", "rec")
        for res in tmp_res
    ]))

results = np.array(results)

Save results to files

In [7]:
# Write every query-by-document matrix, then the per-query evaluation table,
# as comma-separated text files in the working directory.
for csv_name, matrix in (
    ("cos_sim_binary.csv", cos_sim_binary),
    ("cos_sim_tf.csv", cos_sim_tf),
    ("cos_sim_tfidf.csv", cos_sim_tfidf),
    ("euc_sim_binary.csv", euc_sim_binary),
    ("euc_sim_tf.csv", euc_sim_tf),
    ("euc_sim_tfidf.csv", euc_sim_tfidf),
    ("results.csv", results),
):
    np.savetxt(csv_name, matrix, delimiter=",")