# Homework 2 - Best Match Models

## Import package

In [1]:
import math
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## Reading file
讀取 Document 和 Query，並將每篇文章的內容切割成單字列表。

In [2]:
# Input locations: the two ID list files, then the folders read for documents and queries.
doc_list_filename, query_list_filename = 'data/doc_list.txt', 'data/query_list.txt'
doc_path, query_path = 'data/docs/', 'data/queries/'

In [3]:
# Read the first column of each list file into a plain Python list (documents first, then queries).
doc_list, query_list = (
    pd.read_table(list_filename, header=None)[0].tolist()
    for list_filename in (doc_list_filename, query_list_filename)
)

In [4]:
def read_and_split(file_path, file_list, description):
    """Read one text per listed file and split it into whitespace-separated words.

    Parameters
    ----------
    file_path : str
        Prefix prepended to each file name by plain string concatenation.
    file_list : iterable
        File identifiers; each is converted with ``str`` and suffixed with ``.txt``.
    description : str
        Label shown in the progress bar as ``Reading <description>``.

    Returns
    -------
    tuple of (list, list)
        ``text_list`` holds the text read for each file (``''`` when the file
        could not be read) and ``text_split_list`` holds the result of calling
        ``split()`` on that text, in the same order as ``file_list``.
    """
    text_list = []
    text_split_list = []
    pbar = tqdm(file_list)  # progress bar
    pbar.set_description('Reading %s' % description)
    for file in pbar:
        filename = file_path + str(file) + '.txt'
        try:
            # pandas parses the whole tab-separated file; only the cell at
            # row 0 / column 0 is kept.
            text = pd.read_table(filename, header=None)[0][0]
        except Exception:
            # Best effort: an unreadable, missing or empty file yields an empty
            # text. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit stop this long loop.
            text = ''
        text_list.append(text)  # text kept for this file
        text_split_list.append(text.split())  # the same text as a list of words
    return text_list, text_split_list

In [5]:
# Load every document and every query: the text that was read plus its word list.
doc_text, doc_text_split = read_and_split(
    file_path=doc_path, file_list=doc_list, description='doc')
query_text, query_text_split = read_and_split(
    file_path=query_path, file_list=query_list, description='query')

HBox(children=(IntProgress(value=0, max=4191), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




## Index term
將 Document 與 Query 各文章單字列表合併，去除重複得到 Index term。

In [6]:
def list_remove_duplicates(data_list):
    """Flatten a list of word lists into a single list of distinct words.

    Dictionary keys are unique and keep insertion order, so each word appears
    once, at the position where it was first encountered.
    """
    unique_words = dict.fromkeys(word for words in data_list for word in words)
    return [*unique_words]

In [7]:
# Vocabulary: distinct words of all documents followed by all queries, in first-seen order.
index_term = list_remove_duplicates([*doc_text_split, *query_text_split])

## Term Frequency & Inverse Document Frequency
由於 homework2 的資料集與 homework1 相同，因此直接載入 Term Frequency 與 Inverse Document Frequency 的資料。

In [8]:
# Load the precomputed term-frequency and document-frequency arrays from the working directory.
doc_tf_matrix, query_tf_matrix, doc_df_matrix = (
    np.load(npy_filename)
    for npy_filename in ('doc_tf_matrix.npy', 'query_tf_matrix.npy', 'doc_df_matrix.npy')
)

In [9]:
# BM25 inverse document frequency: log((N - n_i + 0.5) / (n_i + 0.5)),
# with N the number of documents and n_i the document frequency of term i.
doc_idf_matrix = np.log(
    (len(doc_text_split) - doc_df_matrix + 0.5)
    / (doc_df_matrix + 0.5)
)

## Best Match Models (BM25)
+ $sim_{BM25}(d_{j},q) \equiv \sum_{w_i \in \{d_j \cap q\}} \frac{(K_1+1) \times tf_{ij}}{K_1[(1-b) + b \times \frac{len(d_j)}{avg_{doclen}}]+tf_{ij}} \times \frac{(K_3+1) \times tf_{i,q}}{K_3 + tf_{i,q}} \times \log(\frac{N-n_i + 0.5}{n_i + 0.5})$
    + $K_1 = 1.8$
    + $K_3 = 1500$
    + $b = 0.85$

In [10]:
# BM25 free parameters used in the formula above: K1, K3 and b.
k1, k3, b = 1.8, 1500, 0.85

In [11]:
# Mean number of words per document (the avg_doclen term of the BM25 length normalisation).
avg_doclen = sum(map(len, doc_text_split)) / len(doc_text_split)

In [12]:
pbar = tqdm(range(len(query_text_split)))  # progress bar
pbar.set_description('BM25')

# Lookups hoisted out of the scoring loops so that each term test is a hash
# lookup instead of a linear scan of index_term / the document's word list.
term_to_index = {}
for term_position, term in enumerate(index_term):
    # setdefault keeps the first position, matching list.index().
    term_to_index.setdefault(term, term_position)
doc_term_sets = [set(doc_words) for doc_words in doc_text_split]
# Document-length part of the BM25 denominator: K1 * ((1 - b) + b * len(d_j) / avg_doclen).
doc_length_norm = [k1 * ((1 - b) + b * len(doc_words) / avg_doclen) for doc_words in doc_text_split]

sim_matrix = []
for query_index in pbar:
    # The BM25 sum runs over the *distinct* terms shared by query and document;
    # repeated query words are already reflected in tf_{i,q}, so each term is
    # visited once (first-seen order is kept).
    query_terms = list(dict.fromkeys(query_text_split[query_index]))

    # The query-side factor and the IDF do not depend on the document, so they
    # are computed once per query term.
    term_factors = []
    for term in query_terms:
        word_index = term_to_index[term]
        query_tf = query_tf_matrix[query_index][word_index]
        v2 = ((k3 + 1) * query_tf) / (k3 + query_tf)
        v3 = doc_idf_matrix[word_index]
        term_factors.append((term, word_index, v2, v3))

    query_doc_sim = []
    for doc_index, doc_terms in enumerate(doc_term_sets):
        sim = 0
        for term, word_index, v2, v3 in term_factors:
            if term in doc_terms:
                # BM25
                doc_tf = doc_tf_matrix[doc_index][word_index]
                v1 = ((k1 + 1) * doc_tf) / (doc_length_norm[doc_index] + doc_tf)
                sim += (v1 * v2 * v3)
        query_doc_sim.append(sim)
    sim_matrix.append(query_doc_sim)
# Rows are queries and columns are documents, in the order of the input lists.
sim_matrix = np.array(sim_matrix)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




## Rank
1. 根據剛剛的 BM25 similarity Matrix，可以把每個 Query 與所有 Document 的相似程度做排名，並把排名結果以 Document 檔名依序列出，存成一個 Retrieved Documents List。
2. 把 Query List 和 Retrieved Documents List 建成一個 DataFrame，輸出成 CSV。

In [13]:
retrieved_documents_list = []

pbar = tqdm(range(sim_matrix.shape[0]))
pbar.set_description('Ranking')
for i in pbar:
    # np.argsort orders document positions by ascending score, so the reversed
    # result lists them from the highest score to the lowest. This is the same
    # order the rank-of-rank + sort approach produces, with a single sort.
    ranked_doc_positions = np.argsort(sim_matrix[i])[::-1]
    # Join the document names with spaces; str() keeps this working when the
    # IDs read from the list file are not strings.
    retrieved_documents_list.append(
        ' '.join(str(doc_list[doc_position]) for doc_position in ranked_doc_positions))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [14]:
# Build the submission table: one row per query with its ranked document names.
submission_df = pd.DataFrame({
    'Query': query_list,
    'RetrievedDocuments': retrieved_documents_list,
})
submission_df.head()

Unnamed: 0,Query,RetrievedDocuments
0,301,FBIS3-23986 FBIS4-7811 FBIS3-21961 FBIS3-19646...
1,302,LA043090-0036 FBIS4-67701 FBIS4-30637 LA031489...
2,303,FT921-7107 LA122990-0029 FT944-128 FT931-6554 ...
3,304,FR940617-0-00104 FR940706-2-00012 FR941006-2-0...
4,305,LA031689-0177 FT944-18875 FBIS4-45230 FBIS4-44...


In [15]:
# Write the submission table to CSV, without the DataFrame index column.
submission_df.to_csv('submission_hw2.csv', index=False)