## Imports and I/O function

In [1]:
import os
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist


def ingest_docs(path):
	lines = [l.strip() for l in open(path, 'r').readlines()]
	docs = []
	acc = ""
	ingest = False

	for l in lines:
		if l.startswith(".W"):
			ingest = True
			acc = ""
			continue

		if ingest:
			if l.startswith(".I"):
				ingest = False
				docs.append(acc)
			else:
				acc += l
	docs.append(acc)

	return docs

## Average precision @ k function

In [2]:
def apk(ref, res, k):

	if (len(ref) == 0) or (len(res) == 0):
		return 0.0

	if len(res) > k:
		res = res[:k]

	score = 0.0
	num_hits = 0.0

	for i, p in enumerate(res):
		if p in ref: # Supposing there are no duplicates in res.
			num_hits += 1.0
			score += num_hits / (i+1.0) # Precision @ i

	return score / min(len(ref), k)

## Ingesting the data

In [3]:
docs = ingest_docs('cran/original/cran.all.1400')
queries = ingest_docs('cran/original/cran.qry')
crans = [l.strip().split() for l in open('cran/cranqrel', 'r').readlines()]

## Transforming to tf-idf

In [63]:
vv = TfidfVectorizer(stop_words='english', sublinear_tf=True, use_idf=True)
doc_tfidf = vv.fit_transform(docs)
v2 = TfidfVectorizer(stop_words='english', sublinear_tf=True, use_idf=True, vocabulary=vv.vocabulary_)
v2.fit(docs) # using lnc.ltc gave a worse map so I resorted to ltc.ltc
query_tfidf = v2.fit_transform(queries)

## Calculating similiarities and building a ranked index for each query

In [64]:
# print(np.isfinite(query_tfidf.toarray()).all())
# print(np.isfinite(doc_tfidf.toarray()).all())
with np.errstate(invalid='ignore', divide='ignore'):
    sims = (1 - cdist(doc_tfidf.toarray(), query_tfidf.toarray(), metric='cosine')).T # T because I want sims for every q
# print(sims.shape)
query_idx = [(np.argsort(sim)[::-1] + 1) for sim in sims]

## Structuring the judging data

In [65]:
judgs = defaultdict(list)
for j in crans:
	judgs[int(j[0])].append(int(j[1]))

## Calculating MAP

In [66]:
MAP = 0.0

for i in range(225):
	ap = apk(judgs[i+1], query_idx[i], len(query_idx[i]))
	MAP += ap

MAP /= 225.0

print(MAP)

0.19350145401795585
