In [1]:
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import numpy as np
import gensim
import time

In [2]:
stemmer = PorterStemmer()
def load_stop():
    f = open('./nfcorpus/raw/stopwords.large', 'r', encoding='utf-8')
    stopwords = [word.replace('\n','') for word in f.readlines()]
    return stopwords

stopwords = load_stop()


In [3]:
stemmer = PorterStemmer()

def load_data(path):
    files = open(path, 'r', encoding='utf-8')
    raw_docs = files.readlines()
    raw_docs = [doc.replace('\t', ' ').replace('\n', '').split() for doc in raw_docs]
    title = []
    docs = []
    
    for doc in raw_docs:
        title.append(doc[0])
        content = " ".join(doc[1:])
        content = gensim.utils.simple_preprocess(content)
        content = [non_stopword for non_stopword in content if non_stopword not in stopwords]
        content = [stemmer.stem(word) for word in content]
        docs.append(content)

    return np.array(docs), np.array(title)

In [4]:
# Build Vocab document
def build_dictionary(lst_contents):
    dictionary = set()
    for content in lst_contents:
        dictionary.update(content)
    return dictionary

# TF IDF Weighting
def calc_tf_weighting(vocab, lst_contents):
    TF = np.zeros((len(vocab), len(lst_contents)))
    for index, word in enumerate(vocab):
        for jndex, content in enumerate(lst_contents):
            TF[index,jndex] = content.count(word)
    return np.array(TF)

def calc_idf_weighting(TF, N):
    DF = np.sum(TF!=0, axis=1)
    IDF = np.log(N/DF)
    return np.array([IDF]).T

# Normalize theo slide bài giảng
def normalize_weighitng(TF,IDF):
    norm = np.sum((TF**2) * (IDF**2) +1, axis=0)
    W = TF*IDF / norm
    return W

In [5]:
def indexing(TF_IDF):
    index = []
    if TF_IDF.shape[1] > 1:
        for term in TF_IDF:
            index_term = []
            for TF in term[term > 0.]:
                idx = np.where(term == TF)
                index_term.append((idx[0][0], TF))
            index.append(index_term)
    elif TF_IDF.shape[1] == 1:
        for TF in TF_IDF[TF_IDF > 0.]:
            idx = np.where(TF_IDF == TF)
            index.append((idx[0][0], TF))
    return index

In [6]:
def Calculate_AP(retrieval, query):
    precision = 0
    precision_list = []
    recall_list = []

    for i in range(len(retrieval)):
        if retrieval[i] in query:
            precision += 1
            precision_list.append(float(precision)/(i+1))
            recall_list.append(float(precision)/len(query))

    temp = 0
    for i in range(len(precision_list)-1,-1,-1):
        if temp >= precision_list[i]:
            precision_list[i] = temp
        elif temp < precision_list[i]:
            temp = precision_list[i]
    
    precision_list_11 = []
    for i in range(11):
        for j in range(len(recall_list)):
            if recall_list[j] >= (i/10):
                precision_list_11.append(max(precision_list[j:]))
                break
    if len(precision_list_11) < 11:
        precision_list_11 += [0]*(11-len(precision_list_11))

    return sum(precision_list_11)/len(precision_list_11)

def mAP(rank, qrel_truth, qrel_truth_title, qrel_title ):
    mAP = []
    for i in range(len(rank)):
        element = qrel_truth_title.index(qrel_title[i])
        ap = Calculate_AP(rank[i], qrel_truth[element])
        mAP.append(ap)
    return np.mean(np.array(mAP))

def cosine_similarity(x,y):
    return np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

### Tập train

In [7]:
docs, docs_title = load_data('./nfcorpus/train/train.docs')
queries, queries_title = load_data('./nfcorpus/train/train.all.queries')



In [8]:
fqrel = open('./nfcorpus/train/train.3-2-1.qrel', 'r', encoding='utf-8')
qrel = []
for f in fqrel.readlines():
    content = f.replace('\t', ' ').replace('\n', '').split()
    qrel.append([content[0], content[2]])

groundtruth = [[] for i in range(len(queries))]
groundtruth_title = []
for i in qrel:
    if i[0] not in groundtruth_title:
        groundtruth_title.append(i[0])
    element = groundtruth_title.index(i[0])
    groundtruth[element].append(i[1])

In [9]:
start_time = time.time()
vocab = build_dictionary(docs)
TF = calc_tf_weighting(vocab, docs)
IDF = calc_idf_weighting(TF, len(docs))
vector_W = normalize_weighitng(TF, IDF)
print('Executed tf-idf time: {}'.format(time.time() - start_time))

print(vector_W.shape)
print(vector_W[0].shape)
print(vector_W[:, 0].shape)

Executed tf-idf time: 343.8844952583313
(15681, 3612)
(3612,)
(15681,)


In [10]:
index_doc = indexing(vector_W)
len(index_doc)

15681

In [11]:
index_doc[:3][:3]

[[(94, 0.0002919345648662814),
  (471, 0.0002343522637636209),
  (713, 0.0002778638047875274),
  (1145, 0.0003073534119215556),
  (2061, 0.0002411644446946011),
  (2150, 0.00020599538220453234),
  (2646, 0.0002548669538697745),
  (2657, 0.0002475819891792263),
  (2868, 0.00026750050073915684),
  (2881, 0.0003126212558441057),
  (3319, 0.0002769286221239339),
  (3528, 0.00027362057238604836)],
 [(265, 0.00032633510409427446)],
 [(362, 0.0004151293002980173), (2892, 0.0004033769467926772)]]

In [9]:
all_rank = []
all_rank_title = []
start_time = time.time()
for i, query in enumerate(queries):
    if i % 500 == 0:
        print('Here: {} - Time: {}'.format(i, time.time() - start_time))
    qTF = calc_tf_weighting(vocab, [query])
    qTF_IDF = normalize_weighitng(qTF, IDF)

    index_query = indexing(qTF_IDF)
    res = []
    for term_idx in index_query:
        temp = []
        idx, value = term_idx
        chosen = index_doc[idx]
        for x in chosen:
            temp.append((x[0], x[1] * value))
        res.append(temp)
    result = {}
    for num_doc in res:
        for key, value in num_doc:
            result[key] = result.get(key, 0) + value
    result = list(result.items())
    result = sorted(result, key=lambda tup: tup[1], reverse=True)
    rank = np.array([idx[0] for idx in result])
    all_rank.append(rank)
    all_rank_title.append(docs_title[rank])
    
print(f'Kết quả MAP: {mAP(all_rank_title, groundtruth, groundtruth_title, queries_title)}')

Here: 0 - Time: 0.0
Here: 500 - Time: 91.38668084144592
Here: 1000 - Time: 197.038480758667
Here: 1500 - Time: 716.5844657421112
Here: 2000 - Time: 1015.5506906509399
Here: 2500 - Time: 1113.0604956150055
Kết quả MAP: 0.21686824052779055


In [10]:
all_rank_title = docs_title[all_rank]
print(f'Kết quả MAP: {mAP(all_rank_title, groundtruth, groundtruth_title, queries_title)}')

Kết quả MAP: 0.21686824052779055


### Tập dev

In [15]:
docs, docs_title = load_data('./nfcorpus/dev/dev.docs')
queries, queries_title = load_data('./nfcorpus/dev/dev.all.queries')

fqrel = open('./nfcorpus/dev/dev.3-2-1.qrel', 'r', encoding='utf-8')
qrel = []
for f in fqrel.readlines():
    content = f.replace('\t', ' ').replace('\n', '').split()
    qrel.append([content[0], content[2]])

groundtruth = [[] for i in range(len(queries))]
groundtruth_title = []
for i in qrel:
    if i[0] not in groundtruth_title:
        groundtruth_title.append(i[0])
    element = groundtruth_title.index(i[0])
    groundtruth[element].append(i[1])

start_time = time.time()
vocab = build_dictionary(docs)
TF = calc_tf_weighting(vocab, docs)
IDF = calc_idf_weighting(TF, len(docs))
vector_W = normalize_weighitng(TF, IDF)
print('Executed tf-idf time: {}'.format(time.time() - start_time))
index_doc = indexing(vector_W)

print(vector_W.shape)
print(vector_W[0].shape)
print(vector_W[:, 0].shape)

all_rank = []
all_rank_title = []
start_time = time.time()
for i, query in enumerate(queries):
    if i % 500 == 0:
        print('Here: {} - Time: {}'.format(i, time.time() - start_time))
    qTF = calc_tf_weighting(vocab, [query])
    qTF_IDF = normalize_weighitng(qTF, IDF)
    index_query = indexing(qTF_IDF)
    res = []
    for term_idx in index_query:
        temp = []
        idx, value = term_idx
        chosen = index_doc[idx]
        for x in chosen:
            temp.append((x[0], x[1] * value))
        res.append(temp)
    result = {}
    for num_doc in res:
        for key, value in num_doc:
            result[key] = result.get(key, 0) + value
    result = list(result.items())
    result = sorted(result, key=lambda tup: tup[1], reverse=True)
    rank = np.array([idx[0] for idx in result])
    all_rank.append(rank)
    all_rank_title.append(docs_title[rank])

print(f'Kết quả MAP: {mAP(all_rank_title, groundtruth, groundtruth_title, queries_title)}')



Executed tf-idf time: 129.1065649986267
(14678, 3193)
(3193,)
(14678,)
Here: 0 - Time: 0.0
Kết quả MAP: 0.22219099104563436


In [16]:
print(f'Kết quả MAP: {mAP(all_rank_title, groundtruth, groundtruth_title, queries_title)}')

Kết quả MAP: 0.22219099104563436


### Tập test

In [17]:
docs, docs_title = load_data('./nfcorpus/test/test.docs')
queries, queries_title = load_data('./nfcorpus/test/test.all.queries')

fqrel = open('./nfcorpus/test/test.3-2-1.qrel', 'r', encoding='utf-8')
qrel = []
for f in fqrel.readlines():
    content = f.replace('\t', ' ').replace('\n', '').split()
    qrel.append([content[0], content[2]])

groundtruth = [[] for i in range(len(queries))]
groundtruth_title = []
for i in qrel:
    if i[0] not in groundtruth_title:
        groundtruth_title.append(i[0])
    element = groundtruth_title.index(i[0])
    groundtruth[element].append(i[1])

start_time = time.time()
vocab = build_dictionary(docs)
TF = calc_tf_weighting(vocab, docs)
IDF = calc_idf_weighting(TF, len(docs))
vector_W = normalize_weighitng(TF, IDF)
print('Executed tf-idf time: {}'.format(time.time() - start_time))

print(vector_W.shape)
print(vector_W[0].shape)
print(vector_W[:, 0].shape)

all_rank = []
start_time = time.time()
for i, query in enumerate(queries):
    if i % 500 == 0:
        print('Here: {} - Time: {}'.format(i, time.time() - start_time))
    qTF = calc_tf_weighting(vocab, [query])
    qTF_IDF = normalize_weighitng(qTF, IDF)
    dists = cosine_similarity(qTF_IDF.T, vector_W)[0]
    rank = np.argsort(dists)[::-1]
    all_rank.append(rank)
    
all_rank_title = docs_title[all_rank]
print(f'Kết quả MAP: {mAP(all_rank_title, groundtruth, groundtruth_title, queries_title)}')



Executed tf-idf time: 132.2583577632904
(14663, 3162)
(3162,)
(14663,)
Here: 0 - Time: 0.0
Kết quả MAP: 0.2258748886006781


In [18]:
print(f'Kết quả MAP: {mAP(all_rank_title, groundtruth, groundtruth_title, queries_title)}')

Kết quả MAP: 0.2258748886006781
