In [1]:
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import numpy as np
import gensim
import time
import nltk

In [2]:
stemmer = PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

# Read Documents
def load_data(path):
    files = open(path, 'r', encoding='utf-8')
    raw_docs = files.readlines()
    docs = []
    flag = 0
    temp = ""
    for doc in raw_docs:
        if doc[:2] == '.I':
            flag = 0
            docs.append(temp)
            temp = ""
        elif doc[:2] == '.W':
            flag = 1
        elif flag == 1:
            content = doc.replace('.', '').replace("''", '').replace('\n', '').lower()
            content = gensim.utils.simple_preprocess(content)
            content = [" "+non_stopword for non_stopword in content if non_stopword not in stopwords]
            content = [stemmer.stem(word) for word in content]
            temp +=  " ".join(content)

    docs = np.asarray(docs[1:])
    return docs

In [3]:
# Build Vocab document
def build_dictionary(lst_contents):
    dictionary = set()
    for content in lst_contents:
        dictionary.update(content)
    return dictionary

# TF IDF Weighting
def calc_tf_weighting(vocab, lst_contents):
    TF = np.zeros((len(vocab), len(lst_contents)))
    for index, word in enumerate(vocab):
        for jndex, content in enumerate(lst_contents):
            TF[index,jndex] = content.count(word)
    return np.array(TF)

def calc_idf_weighting(TF, N):
    DF = np.sum(TF!=0, axis=1)
    IDF = np.log(N/DF)
    return np.array([IDF]).T

# Normalize theo slide bài giảng
def normalize_weighitng(TF,IDF):
    norm = np.sum((TF**2) * (IDF**2) +1, axis=0)
    W = TF*IDF / norm
    return W

In [4]:
def indexing(TF_IDF):
    index = []
    if TF_IDF.shape[1] > 1:
        for term in TF_IDF:
            index_term = []
            for TF in term[term > 0.]:
                idx = np.where(term == TF)
                index_term.append((idx[0][0], TF))
            index.append(index_term)
    elif TF_IDF.shape[1] == 1:
        for TF in TF_IDF[TF_IDF > 0.]:
            idx = np.where(TF_IDF == TF)
            index.append((idx[0][0], TF))
    return index

In [5]:
def Calculate_AP(retrieval, query):
    precision = 0
    precision_list = []
    recall_list = []

    for i in range(len(retrieval)):
        if retrieval[i] in query:
            precision += 1
            precision_list.append(float(precision)/(i+1))
            recall_list.append(float(precision)/len(query))

    temp = 0
    for i in range(len(precision_list)-1,-1,-1):
        if temp >= precision_list[i]:
            precision_list[i] = temp
        elif temp < precision_list[i]:
            temp = precision_list[i]
    
    precision_list_11 = []
    for i in range(11):
        for j in range(len(recall_list)):
            if recall_list[j] >= (i/10):
                precision_list_11.append(max(precision_list[j:]))
                break
    if len(precision_list_11) < 11:
        precision_list_11 += [0]*(11-len(precision_list_11))

    return sum(precision_list_11)/len(precision_list_11)

def mAP(rank, qrel):
    mAP = []
    for i in range(len(rank)):
        ap = Calculate_AP(rank[i], qrel[i])
        mAP.append(ap)
    return np.mean(np.array(mAP))

def cosine_similarity(x,y):
    return np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [6]:
fqrel = open('cranqrel', 'r', encoding='utf-8')
qrel = []
for f in fqrel.readlines():
    qrel.append(f.split()[:2])

groundtruth = [[] for i in range(225)] 
for index in qrel:
    idx = int(index[0]) - 1
    truth = int(index[1]) - 1
    groundtruth[idx].append(truth)

In [7]:
docs = load_data('cran.all.1400')
docs = [doc.split() for doc in docs]
queries = load_data('cran.qry')

In [8]:
start_time = time.time()
vocab = build_dictionary(docs)
TF = calc_tf_weighting(vocab, docs)
IDF = calc_idf_weighting(TF, len(docs))
vector_W = normalize_weighitng(TF, IDF)
print('Executed tf-idf time: {}'.format(time.time() - start_time))

print(vector_W.shape)
print(vector_W[0].shape)
print(vector_W[:, 0].shape)

Executed tf-idf time: 16.831020832061768
(4278, 1400)
(1400,)
(4278,)


In [9]:
index_doc = indexing(vector_W)
len(index_doc)

4278

In [10]:
index_doc[:4][:5]

[[(32, 0.002315523664277359),
  (33, 0.0005577226369853424),
  (60, 0.003298172977800566),
  (87, 0.0019176328267168077),
  (103, 0.00207108917231462),
  (111, 0.001247234390497613),
  (207, 0.003075566934775168),
  (266, 0.0035873045759648625),
  (267, 0.0026167671804609737),
  (268, 0.0032805904593704973),
  (269, 0.0027680788973944437),
  (295, 0.001552656575820492),
  (296, 0.0020011149701120805),
  (297, 0.0015519753522364006),
  (298, 0.0007167751304306248),
  (401, 0.0013319407049198397),
  (402, 0.0006843448023632851),
  (406, 0.0014932906950764953),
  (407, 0.0015054682099304943),
  (436, 0.0007481582144419605),
  (446, 0.0006395045200627539),
  (447, 0.0005833673635250012),
  (448, 0.0006932317301774058),
  (449, 0.0006989442026776246),
  (489, 0.0007428647142480561),
  (499, 0.0006236197935799408),
  (530, 0.0011422163302742118),
  (606, 0.0007784088155239551),
  (652, 0.003220778108242892),
  (966, 0.0006705155575999825),
  (1180, 0.00199693259304285),
  (1220, 0.0020922914

In [14]:
all_rank = []
print('Total available queries: {}'.format(len(queries)))
for query in queries:
    qTF = calc_tf_weighting(vocab, [query.split()])
    qTF_IDF = normalize_weighitng(qTF, IDF)

    index_query = indexing(qTF_IDF)
    res = []
    for term_idx in index_query:
        temp = []
        idx, value = term_idx
        chosen = index_doc[idx]
        for x in chosen:
            temp.append((x[0], x[1] * value))
        res.append(temp)
    result = {}
    for num_doc in res:
        for key, value in num_doc:
            result[key] = result.get(key, 0) + value
    result = list(result.items())
    result = sorted(result, key=lambda tup: tup[1], reverse=True)
    rank = np.array([idx[0] for idx in result])

    all_rank.append(rank)
all_rank = np.asarray(all_rank)
print(f'Kết quả MAP: {mAP(all_rank, groundtruth)}')

Total available queries: 225
Kết quả MAP: 0.39647188057329447
