# Homework 5 - Query Modeling

### Import package

In [None]:
import math
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter
from datetime import datetime
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

### Preprocessing

In [None]:
def preprocessing(doc_list_filename, query_list_filename, doc_path, query_path):
    """Read the document/query ID lists and tokenize every referenced file.

    Args:
        doc_list_filename: Text file with one document ID per line.
        query_list_filename: Text file with one query ID per line.
        doc_path: Prefix prepended to ``<id>.txt`` for documents.
        query_path: Prefix prepended to ``<id>.txt`` for queries.

    Returns:
        ``(doc_list, query_list, doc_text_split, query_text_split)`` where the
        ``*_list`` values are the IDs and the ``*_text_split`` values are
        lists of lower-cased, whitespace-separated tokens (one list per ID,
        in the same order as the ID list).
    """
    def read_and_split(file_path, file_list, description):
        import warnings

        text_split_list = []
        unreadable = []
        for file in tqdm(file_list, desc='Reading %s' % description):
            filename = file_path + str(file) + '.txt'
            try:
                with open(filename) as f:
                    # Split the file content into a list of lower-cased words.
                    text_split = f.read().lower().split()
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable file becomes an empty token list
                # so positions stay aligned with file_list. Only I/O and
                # decoding failures are swallowed; KeyboardInterrupt and
                # programming errors still propagate.
                text_split = []
                unreadable.append(filename)
            text_split_list.append(text_split)
        if unreadable:
            warnings.warn('%d %s file(s) could not be read and were treated as empty, e.g. %s'
                          % (len(unreadable), description, unreadable[0]))
        return text_split_list

    with open(doc_list_filename) as f:
        doc_list = f.read().splitlines()
    with open(query_list_filename) as f:
        query_list = f.read().splitlines()
    doc_text_split = read_and_split(doc_path, doc_list, 'doc')
    query_text_split = read_and_split(query_path, query_list, 'query')

    return doc_list, query_list, doc_text_split, query_text_split

In [None]:
def save_json(data, filename):
    """Serialize ``data`` as JSON into ``filename``, overwriting the file."""
    with open(filename, 'w') as sink:
        json.dump(obj=data, fp=sink)

In [None]:
def read_json(filename):
    """Parse the JSON document stored in ``filename`` and return it."""
    with open(filename, 'r') as source:
        return json.loads(source.read())

### TF Matrix

In [None]:
def count_dd(doc_text_split, query_text_split):
    """Collect term statistics over the tokenized documents and queries.

    Returns a 4-tuple:
        * Counter of total occurrences of each term over documents and queries,
        * list of the distinct terms that occur in any query,
        * list with one per-document term Counter,
        * Counter of the number of documents each term occurs in.
    """
    total_counts = Counter()
    docs_containing = Counter()
    per_doc_counts = []

    for tokens in tqdm(doc_text_split, desc='Count word in doc'):
        per_doc_counts.append(Counter(tokens))
        # Each document contributes at most 1 to a term's document frequency.
        docs_containing.update(set(tokens))
        total_counts.update(tokens)

    for tokens in tqdm(query_text_split, desc='Update counter'):
        total_counts.update(tokens)

    query_vocabulary = list({term for tokens in query_text_split for term in tokens})
    return total_counts, query_vocabulary, per_doc_counts, docs_containing

In [None]:
def term_frequency(index_term, docs, description):
    """Build a sparse raw term-count matrix.

    Args:
        index_term: Mapping from term to column index. Terms missing from the
            mapping are ignored; the matrix has ``len(index_term)`` columns.
        docs: Sequence of token lists, one per row of the result.
        description: Label shown in the progress bar.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape ``(len(docs), len(index_term))``
        and dtype float64 whose entry ``[r, c]`` is the number of times the
        term mapped to column ``c`` occurs in ``docs[r]``.
    """
    data = []
    row = []
    col = []
    for r, d in enumerate(tqdm(docs, desc='%s TF Matrix' % description)):
        # Count per document first so each (row, column) pair is stored once
        # instead of once per token occurrence.
        counts = Counter(term for term in d if term in index_term)
        for term, count in counts.items():
            row.append(r)
            col.append(index_term[term])
            data.append(count)
    # Explicit dtypes: np.float was removed in NumPy 1.24, and an empty list
    # would otherwise become a float array, which is invalid as an index array.
    data = np.array(data, dtype=np.float64)
    row = np.array(row, dtype=np.intp)
    col = np.array(col, dtype=np.intp)
    tf_matrix = csr_matrix((data, (row, col)), shape=(len(docs), len(index_term)), dtype=np.float64)
    return tf_matrix

### IDF Matrix

In [None]:
def document_requency(doc_tf_matrix):
    """Return the document frequency of every column of a sparse TF matrix.

    Args:
        doc_tf_matrix: Sparse matrix with one row per document and one column
            per term.

    Returns:
        1-D integer array of length ``doc_tf_matrix.shape[1]``; element ``c``
        is the number of stored entries in column ``c``.
    """
    stored_columns = doc_tf_matrix.tocoo().col
    # minlength keeps the result aligned with the matrix columns even when
    # some columns have no entries. Sizing the result by the number of
    # distinct columns present would shorten and misalign it.
    df_matrix = np.bincount(stored_columns, minlength=doc_tf_matrix.shape[1])
    return df_matrix

### TF-IDF Matrix

In [None]:
def TF_IDF(tf_matrix, idf_matrix):
    """Weight a sparse TF matrix by IDF and L2-normalize the result.

    ``tf_matrix`` is multiplied element-wise by ``idf_matrix``, converted to
    CSR and passed through ``normalize`` with ``norm='l2'``.
    """
    weighted = tf_matrix.multiply(idf_matrix)
    weighted_csr = weighted.tocsr()
    # Normalize
    return normalize(weighted_csr, norm='l2')

### Best Match Models (BM25)

In [None]:
# Best Match Models (BM25)
def BM25(k1, k3, b, doc_text_split, query_text_split, doc_tf_matrix, query_tf_matrix, idf_matrix, index_term_dict):
    """Score every (query, document) pair with a BM25-style formula.

    For each occurrence of a query term that also occurs in the document the
    score grows by ``v1 * v2 * v3`` where

    * ``v1 = (k1 + 1) * tf_d / (k1 * ((1 - b) + b * |d| / avg|d|) + tf_d)``
    * ``v2 = (k3 + 1) * tf_q / (k3 + tf_q)``
    * ``v3 = idf_matrix[column]``

    ``tf_d`` / ``tf_q`` are read from ``doc_tf_matrix`` / ``query_tf_matrix``
    as given (whatever weighting the caller applied to them).

    Args:
        k1, k3, b: BM25 parameters used in the formulas above.
        doc_text_split: Token lists of the documents (used for lengths and
            term membership).
        query_text_split: Token lists of the queries.
        doc_tf_matrix: Matrix indexed as ``[doc_index, column]``.
        query_tf_matrix: Matrix indexed as ``[query_index, column]``.
        idf_matrix: 1-D array indexed by column.
        index_term_dict: Mapping from term to column. Query terms missing
            from it are skipped.

    Returns:
        Float array of shape ``(len(query_text_split), len(doc_text_split))``.
        Entries that would be exactly 0 are replaced by 1.0, so the matrix is
        neutral when multiplied element-wise with another similarity matrix.
    """
    n_queries = len(query_text_split)
    n_docs = len(doc_text_split)
    if n_docs == 0:
        # No documents: nothing to score (and no average length to divide by).
        return np.zeros((n_queries, 0))

    avg_doclen = sum(len(doc) for doc in doc_text_split) / n_docs

    # Loop-invariant per-document data, computed once instead of per pair.
    # Sets make the membership test O(1) instead of a scan of the token list.
    doc_term_sets = [set(doc) for doc in doc_text_split]
    if avg_doclen:
        length_norms = [k1 * ((1 - b) + b * len(doc) / avg_doclen) for doc in doc_text_split]
    else:
        # Every document is empty, so no term ever matches and these values
        # are never read. Avoid dividing by zero while building them.
        length_norms = [k1 * (1 - b)] * n_docs

    bm_sim_matrix = []
    for query_index, query_terms in enumerate(tqdm(query_text_split, desc='BM25')):
        # Query-side factors depend only on the query, not on the document.
        # One entry per token occurrence, so repeated query terms contribute
        # once per occurrence.
        query_factors = []
        for term in query_terms:
            word_index = index_term_dict.get(term)
            if word_index is None:
                continue
            query_tf = query_tf_matrix[query_index, word_index]
            v2 = ((k3 + 1) * query_tf) / (k3 + query_tf)
            v3 = idf_matrix[word_index]
            query_factors.append((term, word_index, v2, v3))

        query_doc_sim = []
        for doc_index in range(n_docs):
            doc_terms = doc_term_sets[doc_index]
            sim = 0
            for term, word_index, v2, v3 in query_factors:
                if term in doc_terms:
                    doc_tf = doc_tf_matrix[doc_index, word_index]
                    v1 = ((k1 + 1) * doc_tf) / (length_norms[doc_index] + doc_tf)
                    sim += (v1 * v2 * v3)
            query_doc_sim.append(sim)
        bm_sim_matrix.append(query_doc_sim)

    bm_sim_matrix = np.array(bm_sim_matrix, dtype=float).reshape(n_queries, n_docs)
    bm_sim_matrix[bm_sim_matrix == 0] = 1.
    return bm_sim_matrix

### Rocchio Algorithm

In [None]:
# Rocchio Algorithm

def rocchio(doc_tfidf_matrix, query_tfidf_matrix, additional_matrix):
    """Refine the query vectors with pseudo-relevance feedback (Rocchio).

    Reads the module-level settings ``iteration``, ``relevant_docs``,
    ``non_relevant_docs``, ``alpha``, ``beta`` and ``gamma``.

    Each round ranks the documents by ``cosine * additional_matrix`` and
    replaces every query vector with
    ``alpha * query + beta * mean(first set) - gamma * mean(second set)``.

    Args:
        doc_tfidf_matrix: Sparse document matrix (one row per document).
        query_tfidf_matrix: Sparse query matrix (one row per query). It is
            copied, not modified.
        additional_matrix: Array multiplied element-wise with the cosine
            similarities before ranking (same shape as the cosine matrix).

    Returns:
        The cosine similarity between the refined queries and the documents,
        as computed at the start of round number ``iteration``. It is the
        plain cosine matrix: ``additional_matrix`` only influences which
        documents are used for feedback. With ``iteration <= 1`` this is the
        similarity of the unmodified queries.
    """
    rocchio_query_tfidf_matrix = query_tfidf_matrix.copy()

    # Cosine similarity matrix
    cos_matrix = cosine_similarity(rocchio_query_tfidf_matrix, doc_tfidf_matrix)

    # The value returned is the similarity computed at the start of the last
    # round, so the query update of that round is never observable. Running
    # iteration - 1 updates gives the same result without the wasted work and
    # also makes iteration == 0 well defined.
    for _ in tqdm(range(max(iteration - 1, 0)), desc='Rocchio'):
        # Combine some models result to achieve a good performance
        sim_matrix = np.multiply(cos_matrix, additional_matrix)

        # Document indices per query, best match first.
        rank_matrix = np.flip(sim_matrix.argsort(axis=1), axis=1)
        all_non_relevant_docs = non_relevant_docs + (rank_matrix.shape[1] - 5000)

        for i in range(rank_matrix.shape[0]):
            updated_query = alpha * rocchio_query_tfidf_matrix[i]

            # relevant document vector: centroid of the top-ranked documents
            relevant_idx = rank_matrix[i, :relevant_docs]
            if relevant_idx.size:
                updated_query = updated_query + beta * doc_tfidf_matrix[relevant_idx].mean(axis=0)

            # "non relevant" document vector
            # NOTE(review): `[:-k]` keeps everything EXCEPT the last k ranks,
            # i.e. with more than 5000 documents this is the centroid of the
            # top (5000 - non_relevant_docs) documents, not of the
            # lowest-ranked ones. Kept unchanged because the hyper-parameters
            # were tuned against it. Confirm whether `[-k:]` was intended.
            non_relevant_idx = rank_matrix[i, :-all_non_relevant_docs]
            if non_relevant_idx.size:
                # An empty selection (small corpora) would give a NaN centroid.
                updated_query = updated_query - gamma * doc_tfidf_matrix[non_relevant_idx].mean(axis=0)

            rocchio_query_tfidf_matrix[i] = updated_query

        cos_matrix = cosine_similarity(rocchio_query_tfidf_matrix, doc_tfidf_matrix)
    return cos_matrix

### Ranking

In [None]:
def get_retrieved_dataf(cos_matrix, doc_list, query_list, top_k=5000):
    """Turn a similarity matrix into a ranked-retrieval DataFrame.

    Args:
        cos_matrix: 2-D array; row ``i`` holds the scores of query ``i``
            against every document.
        doc_list: Document IDs (strings), positionally aligned with the
            columns of ``cos_matrix``. Assumed to be as long as a row and
            free of duplicates.
        query_list: Query IDs, one per row of ``cos_matrix``.
        top_k: Maximum number of documents listed per query.

    Returns:
        ``pandas.DataFrame`` with columns ``Query`` and
        ``RetrievedDocuments``; the latter is a space-separated string of
        document IDs ordered from highest to lowest score.
    """
    retrieved_documents_list = []

    for i in tqdm(range(cos_matrix.shape[0]), desc='Ranking'):
        # argsort is ascending, so reversing it lists the best document
        # first. This yields the same order as ranking via
        # argsort(argsort(...)) followed by a descending sort on the rank,
        # without the intermediate dict and Python-level sort.
        best_first = np.argsort(cos_matrix[i])[::-1][:top_k]
        # Join the document IDs with spaces into a single string.
        retrieved_documents_list.append(' '.join(doc_list[j] for j in best_first))

    # Store as a DataFrame
    retrieved_doc_dataf = pd.DataFrame(data={
        'Query': query_list,
        'RetrievedDocuments': retrieved_documents_list})

    return retrieved_doc_dataf

### Program

In [None]:
# Load the previously saved json files instead of recomputing them
read_preprocessed_file = False

In [None]:
# File path
data_folder = 'ntust-ir-2020_hw5_new'
doc_list_filename = data_folder + '/doc_list.txt'  # path of the doc_list file
query_list_filename = data_folder + '/query_list.txt'  # path of the query_list file
doc_path = data_folder + '/docs/'  # folder containing the document files
query_path = data_folder + '/queries/'  # folder containing the query files

# Read doc and query file (or reuse the cached json copies)
if read_preprocessed_file:
    doc_list = read_json('doc_list.json')
    query_list = read_json('query_list.json')
    doc_text_split = read_json('doc_text_split.json')
    query_text_split = read_json('query_text_split.json')
else:
    doc_list, query_list, doc_text_split, query_text_split = preprocessing(doc_list_filename, query_list_filename, doc_path, query_path)
    save_json(doc_list, 'doc_list.json')
    save_json(query_list, 'query_list.json')
    save_json(doc_text_split, 'doc_text_split.json')
    save_json(query_text_split, 'query_text_split.json')

# Term statistics (or reuse the cached json copies).
# Counters are stored as plain dicts because json has no Counter type.
if read_preprocessed_file:
    index_term = Counter(read_json('index_term.json'))
    query_index_term = read_json('query_index_term.json')
    doc_tf_list = [Counter(doc) for doc in read_json('doc_tf_list.json')]
    term_df_count = Counter(read_json('term_df_count.json'))
else:
    index_term, query_index_term, doc_tf_list, term_df_count = count_dd(doc_text_split, query_text_split)
    save_json(dict(index_term), 'index_term.json')
    save_json(query_index_term, 'query_index_term.json')
    save_json([dict(doc) for doc in doc_tf_list], 'doc_tf_list.json')
    save_json(dict(term_df_count), 'term_df_count.json')

In [None]:
# Filter Min-DF and Max-DF
minDf = 10
maxDf = 0.25

# A float in [0, 1] is treated as a fraction, anything else as an absolute count.
# NOTE(review): the fraction is taken of the largest count in index_term
# (total occurrences over docs and queries), while the filter below compares
# against document frequencies from term_df_count - confirm this base is intended.
max_term_count = index_term.most_common(1)[0][1]

if isinstance(minDf, float) and 0.0 <= minDf <= 1.0:
    minDf_size = int(max_term_count * minDf)
else:
    minDf_size = minDf

if isinstance(maxDf, float) and 0.0 <= maxDf <= 1.0:
    maxDf_size = int(max_term_count * maxDf)
else:
    maxDf_size = min(max_term_count, maxDf)

# Keep every term that occurs in a query, plus the terms whose document
# frequency lies inside [minDf_size, maxDf_size]. A set makes the membership
# test O(1) per vocabulary term instead of a scan of the query term list.
query_term_set = set(query_index_term)
filter_index_term = Counter({
    term: df
    for term, df in term_df_count.items()
    if term in query_term_set or minDf_size <= df <= maxDf_size
})
# term -> column index of the matrices below
index_term_dict = {term: column for column, term in enumerate(filter_index_term)}

# Create tf matrix (raw counts, then log-scaled: 1 + ln(count))
doc_tf_matrix = term_frequency(index_term_dict, doc_text_split, 'Doc')
query_tf_matrix = term_frequency(index_term_dict, query_text_split, 'Query')
doc_tf_matrix.data = 1 + np.log(doc_tf_matrix.data)
query_tf_matrix.data = 1 + np.log(query_tf_matrix.data)

# Create df matrix
df_matrix = document_requency(doc_tf_matrix)

# Create idf matrix (smoothed: ln((1 + N) / (1 + df)) + 1)
idf_matrix = np.log((1 + len(doc_text_split))/(1 + df_matrix)) + 1

# Document tfidf matrix
doc_tfidf_matrix = TF_IDF(doc_tf_matrix, idf_matrix)

# Query tfidf matrix
query_tfidf_matrix = TF_IDF(query_tf_matrix, idf_matrix)

In [None]:
# BM25 parameters (passed straight to BM25())
k1 = 1.8
k3 = 1500
b = 0.85

# The BM25 matrix is cached on disk alongside the json files, so the same
# flag decides whether to load it or to recompute and save it.
if read_preprocessed_file:
    bm_sim_matrix = np.load('bm_sim_matrix.npy')
else:
    bm_sim_matrix = BM25(k1, k3, b, doc_text_split, query_text_split, doc_tf_matrix, query_tf_matrix, idf_matrix, index_term_dict)
    np.save('bm_sim_matrix.npy', bm_sim_matrix)

In [None]:
# Rocchio
# These names are read as module-level globals inside rocchio().
alpha = 1  # weight of the current query vector
beta = 0.5  # weight of the centroid added to the query
gamma = 0.15  # weight of the centroid subtracted from the query
relevant_docs = 5  # number of top-ranked documents averaged for the added centroid
non_relevant_docs = 1  # size parameter of the document set averaged for the subtracted centroid
iteration = 7  # number of rounds of the feedback loop

# The BM25 matrix re-weights the cosine scores when rocchio() ranks documents.
cos_matrix = rocchio(doc_tfidf_matrix, query_tfidf_matrix, bm_sim_matrix)

In [None]:
# Build the ranked-retrieval table for every query
submission_df = get_retrieved_dataf(cos_matrix, doc_list, query_list)

# Timestamp (month, day, hour, minute) that makes the filename unique per run
date_time = datetime.now().strftime("%m%d%H%M")

# The hyper-parameters are embedded in both the filename and the message
filename_fields = (
    date_time, alpha, beta, gamma,
    relevant_docs, non_relevant_docs, iteration,
    maxDf, minDf,
)
message_fields = (
    alpha, beta, gamma,
    relevant_docs, non_relevant_docs, iteration,
    maxDf, minDf,
    k1, k3, b,
)
submission_filename = 'hw5_%s_a%s_b%s_g%s_rd%s_nrd%s_it%s_hdf%s_ldf%s.csv' % filename_fields
submission_message = 'alpha=%s, beta=%s, gamma=%s, relevant_docs=%s, non_relevant_docs=%s, iteration=%s, maxDf=%s, minDf=%s, BM25(k1=%s, k3=%s, b=%s)' % message_fields

# Write the submission CSV (without the DataFrame index column)
submission_df.to_csv(submission_filename, index=False)

### Kaggle Submit API

In [None]:
# Kaggle Submit API

import os
import subprocess

# Pass the arguments as a list instead of a shell string. The filename and
# the message (which contains spaces, commas and parentheses) then need no
# quoting, and the call no longer depends on the Windows-only `cmd /c`.
kaggle_command = [
    'kaggle', 'competitions', 'submit',
    '-c', '2020-information-retrieval-and-applications-hw5',
    '-f', submission_filename,
    '-m', submission_message,
]
try:
    # check=False: a failed submission is reported by the kaggle CLI itself
    # and must not abort the notebook.
    subprocess.run(kaggle_command, check=False)
except FileNotFoundError:
    print('kaggle command not found; submission skipped')