In [None]:
import json
import math
import os
import subprocess
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from tqdm import tqdm

## Preprocessing

In [None]:
# When True, load the preprocessed JSON files saved by an earlier run
# instead of re-reading and re-tokenising the raw documents and queries.
read_preprocessed_file = True

In [None]:
# Locations of the data set files, given relative to the working directory.
data_folder = 'ntust-ir-2020_hw5_new'
doc_list_filename = data_folder + '/doc_list.txt'  # path of the document list file
query_list_filename = data_folder + '/query_list.txt'  # path of the query list file
doc_path = data_folder + '/docs/'  # folder holding the document files (trailing slash needed: names are appended by concatenation)
query_path = data_folder + '/queries/'  # folder holding the query files (trailing slash needed: names are appended by concatenation)

In [None]:
def preprocessing(doc_list_filename, query_list_filename, doc_path, query_path):
    """Load the document/query id lists and tokenise every referenced file.

    Parameters
    ----------
    doc_list_filename, query_list_filename : str
        Text files holding one document / query id per line.
    doc_path, query_path : str
        Path prefixes. Each file is opened as ``prefix + id + '.txt'``, so a
        directory prefix must end with a path separator.

    Returns
    -------
    tuple
        ``(doc_list, query_list, doc_text_split, query_text_split)`` where the
        two ``*_list`` values are lists of ids and the two ``*_text_split``
        values are lists (aligned with the id lists) of lower-cased,
        whitespace-separated tokens.
    """
    import warnings

    def read_and_split(file_path, file_list, description):
        """Return one lower-cased token list per id in ``file_list``."""
        text_split_list = []
        unreadable = 0
        for file in tqdm(file_list, desc='Reading %s' % description):
            filename = file_path + str(file) + '.txt'
            try:
                with open(filename) as f:
                    # Split the file content into a list of lower-cased words.
                    text_split = [x.lower() for x in f.read().split()]
            except (OSError, UnicodeDecodeError):
                # Best effort: a missing or undecodable file becomes an empty
                # token list so the output stays aligned with ``file_list``.
                # Anything else (e.g. KeyboardInterrupt) must propagate.
                text_split = []
                unreadable += 1
            text_split_list.append(text_split)
        if unreadable:
            # Surface the problem instead of silently producing empty texts.
            warnings.warn('%d of %d %s file(s) could not be read and were treated as empty'
                          % (unreadable, len(file_list), description))
        return text_split_list

    with open(doc_list_filename) as f:
        doc_list = f.read().splitlines()
    with open(query_list_filename) as f:
        query_list = f.read().splitlines()
    doc_text_split = read_and_split(doc_path, doc_list, 'doc')
    query_text_split = read_and_split(query_path, query_list, 'query')

    return doc_list, query_list, doc_text_split, query_text_split

In [None]:
def save_json(data, filename):
    """Write ``data`` to ``filename`` as JSON, replacing any existing file."""
    with open(filename, mode='w') as handle:
        json.dump(data, fp=handle)

In [None]:
def read_json(filename):
    """Parse the JSON file at ``filename`` and return the decoded object."""
    with open(filename, mode='r') as handle:
        return json.load(handle)

In [None]:
def count_dd(doc_text_split, query_text_split):
    """Collect vocabulary statistics from tokenised documents and queries.

    Returns a 4-tuple:
      * Counter of token occurrences over all documents and queries,
      * list of the distinct query tokens (order unspecified),
      * list with one per-document token Counter,
      * Counter of how many documents contain each token.
    """
    occurrence_counts = Counter()
    docs_containing = Counter()
    per_doc_counts = []

    for tokens in tqdm(doc_text_split, desc='Count word in doc'):
        occurrence_counts.update(tokens)
        # A set makes each token count once per document.
        docs_containing.update(set(tokens))
        per_doc_counts.append(Counter(tokens))

    for tokens in tqdm(query_text_split, desc='Update counter'):
        occurrence_counts.update(tokens)

    distinct_query_tokens = list({token for tokens in query_text_split for token in tokens})

    return occurrence_counts, distinct_query_tokens, per_doc_counts, docs_containing

In [None]:
if read_preprocessed_file == True:
    # Reload the id lists and token lists cached by a previous run.
    doc_list, query_list, doc_text_split, query_text_split = [
        read_json(cache_name)
        for cache_name in ('doc_list.json', 'query_list.json',
                           'doc_text_split.json', 'query_text_split.json')
    ]
else:
    # Tokenise the raw files, then cache every piece for later runs.
    doc_list, query_list, doc_text_split, query_text_split = preprocessing(
        doc_list_filename, query_list_filename, doc_path, query_path)
    for cache_name, payload in (('doc_list.json', doc_list),
                                ('query_list.json', query_list),
                                ('doc_text_split.json', doc_text_split),
                                ('query_text_split.json', query_text_split)):
        save_json(payload, cache_name)

In [None]:
if read_preprocessed_file == True:
    # JSON stores the counters as plain objects; rebuild Counter instances.
    index_term = Counter(read_json('index_term.json'))
    query_index_term = read_json('query_index_term.json')
    doc_tf_list = [Counter(doc) for doc in read_json('doc_tf_list.json')]
    term_df_count = Counter(read_json('term_df_count.json'))
else:
    index_term, query_index_term, doc_tf_list, term_df_count = count_dd(doc_text_split, query_text_split)
    # Cache the statistics through the shared save_json helper. Counters are
    # converted to plain dicts before being written.
    save_json(dict(index_term), 'index_term.json')
    save_json(query_index_term, 'query_index_term.json')
    save_json([dict(doc) for doc in doc_tf_list], 'doc_tf_list.json')
    save_json(dict(term_df_count), 'term_df_count.json')

### TF Matrix

In [None]:
def term_frequency(index_term, docs):
    """Build a sparse raw term-frequency matrix.

    Parameters
    ----------
    index_term : mapping
        Maps each kept term to its column index; terms that are not keys are
        ignored.
    docs : list of list of str
        One token list per row of the result.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float matrix of shape ``(len(docs), len(index_term))`` whose entry
        ``[r, c]`` is the number of occurrences in ``docs[r]`` of the term
        mapped to column ``c``.
    """
    row = []
    col = []
    for r, d in enumerate(tqdm(docs)):
        for term in d:
            if term in index_term:
                row.append(r)
                col.append(index_term[term])
    # One entry of 1.0 per token occurrence; repeated (row, col) pairs are
    # summed when the CSR matrix is built, which yields the counts.
    data = np.ones(len(row), dtype=np.float64)
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    # np.float64 replaces the np.float alias, which NumPy 1.24 removed.
    tf_matrix = csr_matrix((data, (row, col)), shape=(len(docs), len(index_term)), dtype=np.float64)
    return tf_matrix

In [None]:
# Filter Min-DF and Max-DF

minDf = 7
maxDf = 0.85

# A float threshold in [0, 1] is read as a fraction of the largest count in
# index_term; any other value is used as an absolute document-frequency bound.
# NOTE(review): index_term holds token occurrence counts, not document counts,
# so a fractional threshold is not a fraction of the number of documents.
# Confirm that this is the intended base.
max_term_count = index_term.most_common(1)[0][1]

if isinstance(minDf, float) and 0.0 <= minDf <= 1.0:
    minDf_size = int(max_term_count * minDf)
else:
    minDf_size = minDf

if isinstance(maxDf, float) and 0.0 <= maxDf <= 1.0:
    maxDf_size = int(max_term_count * maxDf)
else:
    maxDf_size = min(max_term_count, maxDf)

# Use a set for the query vocabulary: a list lookup per term is quadratic.
query_terms = set(query_index_term)

# Keep every term that occurs in a query, plus every term whose document
# frequency lies inside [minDf_size, maxDf_size].
filter_index_term = Counter({
    term: df
    for term, df in term_df_count.items()
    if term in query_terms or minDf_size <= df <= maxDf_size
})

# Assign each kept term a column index, in insertion order.
index_term_dict = {term: column for column, term in enumerate(filter_index_term)}

In [None]:
# Build the raw count matrices, then apply sublinear scaling 1 + ln(tf).
# Only stored (non-zero) entries are touched, so absent terms stay at 0.

doc_tf_matrix = term_frequency(index_term_dict, doc_text_split)
doc_tf_matrix.data = np.log(doc_tf_matrix.data) + 1

query_tf_matrix = term_frequency(index_term_dict, query_text_split)
query_tf_matrix.data = np.log(query_tf_matrix.data) + 1

### IDF Matrix
Inverse Document Frequency

In [None]:
def document_requency(doc_tf_matrix):
    """Return the document frequency of every column of a sparse TF matrix.

    Parameters
    ----------
    doc_tf_matrix : scipy.sparse matrix
        Rows are documents, columns are terms.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``doc_tf_matrix.shape[1]``; element ``c``
        is the number of stored entries in column ``c``.
    """
    column_of_each_entry = doc_tf_matrix.tocoo().col
    # minlength keeps one slot per column, so a column with no entries gets 0
    # and the result always lines up with the matrix columns.
    df_matrix = np.bincount(column_of_each_entry, minlength=doc_tf_matrix.shape[1])
    return df_matrix

In [None]:
# Document frequency per term column, computed from the document TF matrix.

df_matrix = document_requency(doc_tf_matrix)

In [None]:
# Smoothed inverse document frequency: idf = ln((1 + N) / (1 + df)) + 1,
# where N is the number of documents.

N = len(doc_text_split)
idf_matrix = 1 + np.log((N + 1) / (df_matrix + 1))

### TF-IDF Matrix

In [None]:
# Document TF-IDF: scale the TF entries by the per-term IDF weights and keep
# the result sparse in CSR form.
doc_tfidf_matrix = doc_tf_matrix.multiply(idf_matrix).tocsr()

# Query TF-IDF: same weighting, converted to a dense array.
query_tfidf_matrix = query_tf_matrix.multiply(idf_matrix).toarray()

In [None]:
# Scale every document row and every query row to unit L2 length.

doc_tfidf_matrix = normalize(doc_tfidf_matrix, norm='l2', axis=1)
query_tfidf_matrix = normalize(query_tfidf_matrix, norm='l2', axis=1)

### Rocchio Algorithm

In [None]:
# Rocchio feedback parameters (all are also recorded in the submission file name)

alpha = 1  # weight of the current query vector in each update
beta = 0.5  # weight of the mean vector of the top-ranked documents
gamma = 0.15  # weight of the non-relevant mean vector that is subtracted
relevant_docs = 5  # number of top-ranked documents averaged as relevant feedback
non_relevant_docs = 1  # controls the slice of the ranking used for the non-relevant mean vector
iteration = 5  # number of feedback rounds

In [None]:
# Rocchio Algorithm (pseudo-relevance feedback)
#
# Each round ranks all documents for every query and then moves the query
#   q <- alpha * q + beta * mean(top relevant_docs) - gamma * mean(bottom non_relevant_docs)

for _ in tqdm(range(iteration), desc='Rocchio'):
    # Cosine similarity matrix: one row per query, one column per document.
    cos_matrix = cosine_similarity(query_tfidf_matrix, doc_tfidf_matrix)
    # Document indices per query, best match first. Every query row is kept;
    # only the two ends of each ranking are used below.
    rank_matrix = np.flip(cos_matrix.argsort(axis=1), axis=1)

    for i in range(rank_matrix.shape[0]):
        updated_query = alpha * query_tfidf_matrix[i]
        if relevant_docs > 0:
            # Mean vector of the best-ranked documents.
            rele_doc_vec = np.asarray(
                doc_tfidf_matrix[rank_matrix[i, :relevant_docs]].mean(axis=0)).ravel()
            updated_query = updated_query + beta * rele_doc_vec
        if non_relevant_docs > 0:
            # Mean vector of the worst-ranked documents: the LAST
            # non_relevant_docs entries of the ranking. The guard avoids an
            # empty selection (NaN mean) when the count is 0.
            non_rele_vec = np.asarray(
                doc_tfidf_matrix[rank_matrix[i, -non_relevant_docs:]].mean(axis=0)).ravel()
            updated_query = updated_query - gamma * non_rele_vec
        query_tfidf_matrix[i] = updated_query

# Re-score with the final query vectors so the ranking step uses the result of
# the last update; this also defines cos_matrix when iteration is 0.
cos_matrix = cosine_similarity(query_tfidf_matrix, doc_tfidf_matrix)

### Rank
1. 根據剛剛的 Cosine similarity Matrix，可以把每個 Query 與所有 Document 的相似程度做排名，並把排名結果以 Document 檔名依序列出，存成一個 Retrieved Documents List。
2. 把 Query List 和 Retrieved Documents List 建成一個 DataFrame，輸出成 CSV。

In [None]:
def get_retrieved_dataf(cos_matrix, doc_list, query_list, top_k=5000):
    """Rank the documents for every query and build the submission table.

    Parameters
    ----------
    cos_matrix : 2-D array
        Similarity scores, one row per query and one column per document.
    doc_list : list of str
        Document names, aligned with the columns of ``cos_matrix``.
    query_list : list
        Query names, aligned with the rows of ``cos_matrix``.
    top_k : int or None, default 5000
        Maximum number of documents listed per query; ``None`` lists all.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Query'`` and ``'RetrievedDocuments'``; the latter holds the
        document names separated by spaces, most similar first.

    Raises
    ------
    ValueError
        If ``doc_list`` does not have one name per column of ``cos_matrix``.
    """
    if len(doc_list) != cos_matrix.shape[1]:
        # A mismatch would pair scores with the wrong document names.
        raise ValueError('doc_list has %d entries but cos_matrix has %d columns'
                         % (len(doc_list), cos_matrix.shape[1]))

    retrieved_documents_list = []

    for i in tqdm(range(cos_matrix.shape[0]), desc='Ranking'):
        # argsort is ascending, so reversing it gives the best match first.
        # Indexing doc_list by position also keeps duplicate names intact.
        best_first = np.argsort(cos_matrix[i])[::-1][:top_k]
        # Join the ranked document names into one space-separated string.
        retrieved_documents_list.append(' '.join(doc_list[j] for j in best_first))

    # Store as a DataFrame
    retrieved_doc_dataf = pd.DataFrame(data={
        'Query': query_list,
        'RetrievedDocuments': retrieved_documents_list})

    return retrieved_doc_dataf

In [None]:
# Turn the query/document similarity matrix into the ranked submission table.
submission_df = get_retrieved_dataf(cos_matrix, doc_list, query_list)

In [None]:
# Timestamp (month, day, hour, minute) used to tell submissions apart
date_time = datetime.now().strftime("%m%d%H%M")

# The same parameter values, in the same order, label both the file name and
# the submission message.
run_params = (alpha, beta, gamma, relevant_docs, non_relevant_docs, iteration, maxDf, minDf)
filename_template = 'hw5_%s_a%s_b%s_g%s_rd%s_nrd%s_it%s_hdf%s_ldf%s.csv'
message_template = 'alpha=%s, beta=%s, gamma=%s, relevant_docs=%s, non_relevant_docs=%s, iteration=%s, maxDf=%s, minDf=%s'

submission_filename = filename_template % ((date_time,) + run_params)
submission_message = message_template % run_params

# Write the submission CSV without the DataFrame index column
submission_df.to_csv(submission_filename, index=False)

### Kaggle Submit API

In [None]:
# Kaggle Submit API
#
# Calls the Kaggle command-line client directly. The arguments are passed as a
# list, so the file name and message need no shell quoting and the call does
# not depend on the Windows `cmd` shell. check=True raises CalledProcessError
# when the client exits with a non-zero status instead of failing silently.

subprocess.run(
    ['kaggle', 'competitions', 'submit',
     '-c', '2020-information-retrieval-and-applications-hw5',
     '-f', submission_filename,
     '-m', submission_message],
    check=True,
)