In [None]:
import json
import math
import os
from collections import Counter
from datetime import datetime

import numpy as np  # Linear Algebra
import pandas as pd  # Data processing, CSV file I/O (e.g. pd.read_csv)
from numba import jit  # Just-in-Time compilation
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from tqdm import tqdm  # Process bar

In [None]:
def preprocessing(doc_list_filename, query_list_filename, doc_path, query_path):
    """Load the document/query name lists and tokenize every referenced file.

    Args:
        doc_list_filename: Path of a text file with one document name per line.
        query_list_filename: Path of a text file with one query name per line.
        doc_path: Prefix prepended to each document name ('.txt' is appended),
            so it must already end with the path separator.
        query_path: Same as ``doc_path`` but for the queries.

    Returns:
        Tuple ``(doc_list, query_list, doc_text_split, query_text_split)``.
        The first two are the names read from the list files; the last two
        hold one list of lower-cased, whitespace-separated tokens per name,
        in the same order.
    """
    def read_and_split(file_path, file_list, description):
        """Return one lower-cased token list per entry of ``file_list``."""
        text_split_list = []
        for file in tqdm(file_list, desc='Reading %s' % description):
            filename = file_path + str(file) + '.txt'
            try:
                with open(filename) as f:
                    # Split the file content into a list of lower-cased words
                    text_split = [x.lower() for x in f.read().split()]
            except (OSError, UnicodeDecodeError):
                # Best effort: a missing or unreadable file becomes an empty
                # token list. Only I/O and decoding errors are caught so that
                # KeyboardInterrupt / SystemExit still stop the loop.
                text_split = []
            text_split_list.append(text_split)
        return text_split_list

    with open(doc_list_filename) as f:
        doc_list = f.read().splitlines()
    with open(query_list_filename) as f:
        query_list = f.read().splitlines()
    doc_text_split = read_and_split(doc_path, doc_list, 'doc')
    query_text_split = read_and_split(query_path, query_list, 'query')

    return doc_list, query_list, doc_text_split, query_text_split

In [None]:
def save_json(data, filename):
    """Serialize ``data`` as JSON into ``filename``, overwriting the file."""
    with open(filename, 'w') as json_file:
        json.dump(data, json_file)

In [None]:
def read_json(filename):
    """Return the object decoded from the JSON file ``filename``."""
    with open(filename, 'r') as json_file:
        return json.load(json_file)

In [None]:
def count_word(doc_text_split, query_text_split):
    """Collect term statistics over the tokenized documents and queries.

    Args:
        doc_text_split: One token list per document.
        query_text_split: One token list per query.

    Returns:
        Tuple ``(index_term, query_index_term, doc_tf_list, term_df_count)``:
        a Counter of term occurrences over all documents and queries, the list
        of distinct query terms, one Counter of term occurrences per document,
        and a Counter of how many documents contain each term.
    """
    occurrences = Counter()
    docs_containing = Counter()
    per_doc_counts = []

    for tokens in tqdm(doc_text_split, desc='Count word in doc'):
        token_counts = Counter(tokens)
        occurrences.update(token_counts)
        docs_containing.update(set(tokens))
        per_doc_counts.append(token_counts)

    for tokens in tqdm(query_text_split, desc='Update counter'):
        occurrences.update(tokens)

    distinct_query_terms = {token for tokens in query_text_split for token in tokens}
    return occurrences, list(distinct_query_terms), per_doc_counts, docs_containing

In [None]:
def term_frequency(index_term, docs):
    """Build a sparse document-by-term occurrence-count matrix.

    Args:
        index_term: Mapping from term to its column index.
        docs: Sequence of token lists, one per document.

    Returns:
        ``csr_matrix`` of shape ``(len(docs), len(index_term))`` and dtype
        float64 whose entry ``(r, c)`` is how often the term mapped to column
        ``c`` occurs in ``docs[r]``. Tokens missing from ``index_term`` are
        ignored.
    """
    data = []
    row = []
    col = []
    for r, doc in enumerate(docs):
        # Aggregate per document so each (row, col) pair is emitted only once.
        for term, count in Counter(doc).items():
            if term in index_term:
                row.append(r)
                col.append(index_term[term])
                data.append(count)
    # Explicit dtypes keep the index arrays integral even when they are empty
    # (np.array([]) would otherwise default to float64).
    data = np.array(data, dtype=np.float64)
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    # np.float64 replaces the np.float alias, which was removed from NumPy.
    tf_matrix = csr_matrix((data, (row, col)), shape=(len(docs), len(index_term)), dtype=np.float64)
    return tf_matrix

In [None]:
def document_frequency(doc_tf_matrix):
    """Count, for every column, the number of stored entries in that column.

    Args:
        doc_tf_matrix: Sparse matrix supporting ``tocoo()``, with one row per
            document and one column per term.

    Returns:
        1-D integer array with one value per column of ``doc_tf_matrix``.
        Columns without any stored entry get 0, so the result always has
        ``doc_tf_matrix.shape[1]`` elements.
    """
    n_terms = doc_tf_matrix.shape[1]
    stored_columns = doc_tf_matrix.tocoo().col
    # minlength keeps trailing (and intermediate) never-used columns in the
    # result instead of truncating it to the number of distinct used columns.
    df_matrix = np.bincount(stored_columns, minlength=n_terms)
    return df_matrix

## Ranking

In [None]:
# Ranking
def get_retrieved_dataf(cos_matrix, doc_list, query_list, rank):
    """Rank the documents of every query and return them as a DataFrame.

    Args:
        cos_matrix: 2-D array of scores; row ``i`` holds the score of every
            document for query ``i``, in ``doc_list`` order.
        doc_list: Document names (strings), one per column of ``cos_matrix``.
        query_list: Query names, one per row of ``cos_matrix``.
        rank: Maximum number of documents kept per query.

    Returns:
        DataFrame with columns 'Query' and 'RetrievedDocuments'; the latter is
        a space-separated string of document names, highest score first.
    """
    retrieved_documents_list = []

    for i in tqdm(range(cos_matrix.shape[0]), desc='Ranking'):
        # A single argsort, reversed, yields the document indices from the
        # highest to the lowest score (same order as ranking every score and
        # sorting by rank, without the extra sort passes or a dict keyed by
        # document name).
        best_first = np.argsort(cos_matrix[i])[::-1][:rank]
        # Join the selected document names into one space-separated string
        retrieved_documents_list.append(' '.join(doc_list[j] for j in best_first))

    # Store as a DataFrame
    retrieved_doc_dataf = pd.DataFrame(data={
        'Query': query_list,
        'RetrievedDocuments': retrieved_documents_list})

    return retrieved_doc_dataf

## Expectation Maximization algorithm

In [None]:
@jit(nopython=True)
def e_step(pwt, ptd, word_size, doc_size, topic_size):
    """E-step: compute the topic posterior P(T_k | w_i, d_j).

    Args:
        pwt: Array indexed ``[word, topic]`` holding P(w_i | T_k).
        ptd: Array indexed ``[topic, doc]`` holding P(T_k | d_j).
        word_size: Number of index terms.
        doc_size: Number of documents.
        topic_size: Number of topics.

    Returns:
        Array of shape ``(topic_size, word_size, doc_size)`` where, for every
        (word, doc) pair, the values over the topic axis are
        ``pwt[i, k] * ptd[k, j]`` normalized to sum to 1 (left at 0 when all
        products are 0).
    """
    # P(T_k|w_i, d_j)
    ptwd = np.empty((topic_size, word_size, doc_size))
    # each word in index_term
    for i in range(word_size):
        # each doc
        for j in range(doc_size):
            # each topic: unnormalized posterior and its sum over topics
            k_sum = 0.0
            for k in range(topic_size):
                joint = pwt[i, k] * ptd[k, j]
                ptwd[k, i, j] = joint
                k_sum += joint
            # Normalize over the topic axis for this (word, doc) pair; the
            # denominator must be the sum over k, not over documents.
            if k_sum > 0:
                for k in range(topic_size):
                    ptwd[k, i, j] /= k_sum

    return ptwd

In [None]:
@jit(nopython=True)
def m_step(ptwd, cwd, word_size, doc_size, topic_size):
    """M-step: re-estimate P(w_i | T_k) and P(T_k | d_j).

    Args:
        ptwd: Array indexed ``[topic, word, doc]`` holding P(T_k | w_i, d_j).
        cwd: Array indexed ``[word, doc]`` holding the count c(w_i, d_j).
        word_size: Number of index terms.
        doc_size: Number of documents.
        topic_size: Number of topics.

    Returns:
        Tuple ``(pwt, ptd)`` with ``pwt`` of shape ``(word_size, topic_size)``
        and ``ptd`` of shape ``(topic_size, doc_size)``. A topic with zero
        total weight gets a uniform word distribution; a document with zero
        total count gets a uniform topic distribution.
    """
    # total count of indexed terms per document
    doc_text_sum = cwd.sum(axis=0)

    # create empty P(w_i|T_k) & P(T_k|d_j)
    pwt = np.empty((word_size, topic_size))
    ptd = np.empty((topic_size, doc_size))

    # each topic
    for k in range(topic_size):
        # c(w_i, d_j) * P(T_k|w_i, d_j)
        topic_wd = np.multiply(cwd, ptwd[k])

        # update P(w_i|T_k)
        wt_sum = topic_wd.sum()
        if wt_sum > 0:
            for i in range(word_size):
                pwt[i][k] = topic_wd[i].sum() / wt_sum
        else:
            pwt[:,k] = 1 / word_size

        # update P(T_k|d_j)
        for j in range(doc_size):
            if doc_text_sum[j] > 0:
                ptd[k][j] = topic_wd[:,j].sum() / doc_text_sum[j]
            else:
                # ptd comes from np.empty, so every cell must be written;
                # documents without indexed terms fall back to uniform.
                ptd[k][j] = 1 / topic_size

    return pwt, ptd

## Program

In [None]:
# True: load the tokenized docs/queries from the cached JSON files.
# False: re-read the raw text files and (re)write the JSON cache.
read_preprocess_file = True

In [None]:
# Input locations, all derived from the dataset folder
data_folder = 'ntust-ir-2020_hw4_v2/' # /kaggle/input/2020-information-retrieval-and-applications-hw4-v2
doc_list_filename = f'{data_folder}doc_list.txt'  # file listing the document names
query_list_filename = f'{data_folder}query_list.txt'  # file listing the query names
doc_path = f'{data_folder}docs/'  # folder holding the document text files
query_path = f'{data_folder}queries/'  # folder holding the query text files

In [None]:
# Read document and query
# The JSON cache is only used when it was requested AND every cache file is
# present; otherwise the raw files are preprocessed and the cache is written,
# so the notebook also runs from a fresh checkout.
preprocess_cache_files = ['doc_list.json', 'query_list.json',
                          'doc_text_split.json', 'query_text_split.json']
preprocess_cache_ready = all(os.path.exists(name) for name in preprocess_cache_files)

if read_preprocess_file and preprocess_cache_ready:
    doc_list = read_json('doc_list.json')
    query_list = read_json('query_list.json')
    doc_text_split = read_json('doc_text_split.json')
    query_text_split = read_json('query_text_split.json')
else:
    doc_list, query_list, doc_text_split, query_text_split = preprocessing(doc_list_filename, query_list_filename, doc_path, query_path)
    save_json(doc_list, 'doc_list.json')
    save_json(query_list, 'query_list.json')
    save_json(doc_text_split, 'doc_text_split.json')
    save_json(query_text_split, 'query_text_split.json')

In [None]:
# Term statistics: overall occurrence counts, distinct query terms,
# per-document term counts and per-term document frequency
(index_term,
 query_index_term,
 doc_tf_list,
 term_df_count) = count_word(doc_text_split, query_text_split)

In [None]:
# Filter Min-DF and Max-DF
minDf = 5
maxDf = 0.0007

# Occurrence count of the most frequent term; float thresholds in [0, 1] are
# interpreted as a fraction of this value.
# NOTE(review): the thresholds are derived from occurrence counts but applied
# to document frequencies below - confirm this is intended.
max_term_count = index_term.most_common(1)[0][1]

if isinstance(minDf, float) and 0.0 <= minDf <= 1.0:
    minDf_size = int(max_term_count * minDf)
else:
    minDf_size = minDf

if isinstance(maxDf, float) and 0.0 <= maxDf <= 1.0:
    maxDf_size = int(max_term_count * maxDf)
else:
    maxDf_size = min(max_term_count, maxDf)

# Set membership keeps the filter linear in the vocabulary size.
query_term_set = set(query_index_term)

# Keep every query term, plus terms whose document frequency is within range;
# the values stored are the document frequencies.
filter_index_term = Counter({
    term: df
    for term, df in term_df_count.items()
    if term in query_term_set or minDf_size <= df <= maxDf_size
})
# term -> column index, in the iteration order of filter_index_term
index_term_dict = {term: column for column, term in enumerate(filter_index_term)}

print('index_term size:' ,len(index_term), '->', len(filter_index_term))

In [None]:
# Background model: each kept term's stored count divided by the total of all
# stored counts, laid out as a column vector indexed by index_term_dict
index_term_total = sum(filter_index_term.values())
bg_matrix = np.empty((len(filter_index_term), 1))
for word, word_count in filter_index_term.items():
    bg_matrix[index_term_dict[word], 0] = word_count / index_term_total

In [None]:
word_size = len(filter_index_term)  # index term size
doc_size = len(doc_list)  # number of documents
topic_size = 8  # number of Topics
em_seed = 0  # seed of the random P(w_i|T_k) initialization

# P(T_k|w_i, d_j): placeholder only, e_step returns a freshly allocated array
ptwd = np.empty((topic_size, word_size, doc_size))

# P(w_i|T_k): random start, each topic column normalized to sum to 1.
# Seeding makes the EM result reproducible across kernel restarts.
np.random.seed(em_seed)
pwt = np.random.random(size = (word_size, topic_size))
for k in range(topic_size):
    pwt[:,k] /= pwt[:,k].sum()

# P(T_k|d_j): uniform start
ptd = np.full((topic_size, doc_size), 1 / topic_size)

# c(w_i, d_j): dense (word, doc) count matrix.
# .toarray() replaces the sparse `.A` shortcut, which newer SciPy releases
# deprecated and removed.
cwd = term_frequency(index_term_dict, doc_text_split).toarray().transpose()

In [None]:
# EM algorithm: alternate E-step and M-step for a fixed number of rounds
maxIteration = 10
em_rounds = tqdm(range(maxIteration), desc='EM algorithm')
for em_round in em_rounds:
    ptwd = e_step(pwt, ptd, word_size, doc_size, topic_size)
    pwt, ptd = m_step(ptwd, cwd, word_size, doc_size, topic_size)

# (word, topic) x (topic, doc) -> (word, doc), transposed to (doc, word)
em_matrix = (pwt @ ptd).T

In [None]:
# PLSA
# Per query word the score mixes three estimates:
#   alpha           * (count of the word in the doc / doc length)
#   beta            * em_matrix[doc][word column]   (topic-model estimate)
#   (1-alpha-beta)  * (stored count of the word / total of stored counts)
# and the query score is the product over its words.
alpha = 0.6
beta = 0.1
sim_matrix = []
# each query
for query_index in tqdm(range(len(query_text_split)), desc='PLSA'):
    query_words = query_text_split[query_index]
    # Per-word values that do not depend on the document.
    # em_matrix is indexed (doc, word column), so the column must come from
    # index_term_dict - not from the position of the query in the query list.
    word_columns = [index_term_dict.get(word) for word in query_words]
    bg_probs = [filter_index_term[word] / index_term_total for word in query_words]

    query_doc_sim = []
    # each doc
    for doc_index in range(len(doc_text_split)):
        doc_tf = doc_tf_list[doc_index]
        doc_len = len(doc_text_split[doc_index])
        sim_val = 1
        # each word in query
        for word, word_col, bg_prob in zip(query_words, word_columns, bg_probs):
            # An empty document contributes no in-document evidence
            # (and must not divide by zero).
            tf_prob = doc_tf[word] / doc_len if doc_len > 0 else 0.0
            # Words without a column were never seen in the indexed documents.
            topic_prob = em_matrix[doc_index][word_col] if word_col is not None else 0.0
            sim = alpha * tf_prob
            sim += beta * topic_prob
            sim += (1-alpha-beta) * bg_prob
            sim_val *= sim
        query_doc_sim.append(sim_val)
    sim_matrix.append(query_doc_sim)
sim_matrix = np.array(sim_matrix)

In [None]:
# Output file of the ranked results
submission_filename = 'submission.csv'

# Rank the documents of every query, keeping at most 1000 per query
submission_df = get_retrieved_dataf(sim_matrix, doc_list, query_list, rank=1000)

# Write the ranking as CSV without the DataFrame index
submission_df.to_csv(submission_filename, index=False)