In [25]:
# IR HW1 Vector Space Model
import numpy as np
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import math
from scipy.spatial.distance import cosine
from sklearn.preprocessing import Normalizer

In [13]:
# Root directory of the dataset. The loading cells read doc_list.txt,
# query_list.txt, docs/<id>.txt and queries/<id>.txt beneath it.
path = './ntust-ir-2020'

In [14]:
# Load every document named in doc_list.txt.
#   doc_list : document ids, in the order they appear in doc_list.txt
#   doc      : whitespace-split token list of each document (same order)
#   doc_set  : distinct tokens of each document, for fast membership tests
# terms_set is only created (empty) here.
terms_set = set() # Set D.S. has no duplicate elements
doc_set = []
doc = []
doc_list = []
with open('{}/doc_list.txt'.format(path), 'r') as fp:
  line = fp.read().splitlines()
for i in tqdm(line):
  doc_list.append(i)
  # The context manager closes the document file even if reading it raises,
  # so a bad file can no longer leak an open handle.
  with open('{}/docs/{}.txt'.format(path,i), 'r') as f_temp:
    doc.append(f_temp.read().split())
doc_set = [set(tokens) for tokens in doc]

HBox(children=(FloatProgress(value=0.0, max=4191.0), HTML(value='')))




In [15]:
# Load every query named in query_list.txt.
#   query_list : query ids, in the order they appear in query_list.txt
#   query      : whitespace-split token list of each query (same order)
# Every query token is then added to terms_set.
query = []
query_list = []
with open('{}/query_list.txt'.format(path), 'r') as fp:
  line = fp.read().splitlines()
for i in tqdm(line):
  query_list.append(i)
  # The context manager closes the query file even if reading it raises,
  # so a bad file can no longer leak an open handle.
  with open('{}/queries/{}.txt'.format(path,i), 'r') as f_temp:
    query.append(f_temp.read().split())
for q in query:
  for t in q:
    # Only tokens not already present are added.
    if t not in terms_set:
      terms_set.add(t)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [16]:
# Dimensions shared by the matrices built in the following cells
doc_len, query_len = len(doc_list), len(query_list)
terms_len = len(terms_set)
# Turn the vocabulary into a list so each term has a fixed integer position
terms_set = list(terms_set)

In [41]:
# Document term-frequency matrix.
# One Counter per document holds the raw count of every token in it.
idf_count = [Counter(doc[d]) for d in range(doc_len)]
# Filled term-major (terms_len x doc_len), one document column at a time;
# a Counter yields 0 for any term the document does not contain.
tf_doc = np.zeros((terms_len, doc_len), dtype=float)
for d in tqdm(range(doc_len)):
  counts = idf_count[d]
  tf_doc[:, d] = [counts[term] for term in terms_set]
# Flip to doc_len x terms_len so that each row describes one document
tf_doc = tf_doc.T

HBox(children=(FloatProgress(value=0.0, max=4191.0), HTML(value='')))




In [42]:
# Inverse document frequency of every vocabulary term:
#   idf(t) = 1 + ln(1 + N / n_t)
# where N is the number of documents and n_t is one more than the number of
# documents containing t (the extra 1 keeps the denominator non-zero).
idf_doc = np.zeros(terms_len, dtype=float) # one weight per term
N = doc_len
for i in tqdm(range(terms_len)):
  term = terms_set[i]
  ni = 1 + sum(1 for d in range(doc_len) if term in doc_set[d])
  idf_doc[i] = 1 + math.log(1 + N/ni)

HBox(children=(FloatProgress(value=0.0, max=123.0), HTML(value='')))




In [43]:
# Build Document TF-IDF
# tf_doc is doc_len x terms_len and idf_doc holds one weight per term, so
# broadcasting multiplies column i of every document row by idf_doc[i]. These
# are the same products the element-by-element loop produced, computed in a
# single vectorised step instead of doc_len * terms_len Python iterations.
tf_idf_doc = tf_doc * idf_doc # doc_len x terms_len

HBox(children=(FloatProgress(value=0.0, max=4191.0), HTML(value='')))




In [44]:
# Build Query TF-IDF
# Each term that occurs in a query gets the weight 2 * idf(term); a term that
# occurs several times in the same query still gets that single weight.
# The IDF values are read from idf_doc (same formula, same document counts)
# instead of rescanning every document for every query token, and a dict
# replaces the repeated linear terms_set.index() lookups.
term_index = {term: i for i, term in enumerate(terms_set)}
tf_idf_query = np.zeros([query_len, terms_len], dtype=float) # query_len x terms_len
for q in tqdm(range(query_len)):
  for k in set(query[q]):
    col = term_index[k]
    tf_idf_query[q][col] = 2 * idf_doc[col] # (1 + tf) * IDF with tf fixed at 1
# Scale every query row to unit length (Normalizer's default norm is L2)
tf_idf_query = Normalizer().fit_transform(tf_idf_query)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [45]:
# Build Cosine Similarity
# sim[q][d] = (query_q . doc_d) / (|query_q| * |doc_d|), computed for all pairs
# at once instead of one scipy call per (query, document) pair.
# A vector with zero norm (e.g. a document containing none of the vocabulary
# terms) would make the ratio 0/0 = NaN; such pairs are given similarity 0.
dots = np.dot(tf_idf_query, tf_idf_doc.T) # query_len x doc_len
query_norms = np.linalg.norm(tf_idf_query, axis=1)
doc_norms = np.linalg.norm(tf_idf_doc, axis=1)
denom = np.outer(query_norms, doc_norms)
# Entries where denom == 0 are skipped by `where` and keep the 0.0 from `out`
sim = np.divide(dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [46]:
# Create output file
# Header line, then one line per query: "<query id>," followed by every
# document id (each followed by a single space), ordered from the highest to
# the lowest similarity.
# The context manager closes the file even if a write or lookup raises.
with open("vsm_result.txt", "w") as fp:
    fp.write("Query,RetrievedDocuments\n")
    for i in tqdm(range(query_len)):
        # argsort of the negated scores lists document indices by descending score
        ranking = np.argsort(-sim[i])
        fp.write('{},'.format(query_list[i]))
        fp.write(''.join(doc_list[s] + ' ' for s in ranking))
        fp.write('\n')

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


