In [3]:
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm

# load doc list
with open('docs_id_list.txt') as f:
    doc_list = f.read().splitlines()

# load doc from list
docs = {}
words = set()
for doc in tqdm(doc_list):
    with open('documents/' + doc + '.txt') as f:
        data = f.read()
        docs[doc] = data.split()
#         words = words.union(set(docs[doc]))

# load query list
with open('queries_id_list.txt') as f:
    query_list = f.read().splitlines()

# load query from list
queries = {}
for query in tqdm(query_list):
    with open('queries/' + query + '.txt') as f:
        data = f.read()
        queries[query] = data.split()
        words = words.union(set(queries[query]))

# save words
with open('min_word_list.txt', 'w') as f:
    f.write(' '.join(words))
    
words = list(words)

# load words dict from file
with open('min_word_list.txt') as f:
    words = f.read().split()

## Calculate docment-tf, query-tf, df, idf

# term frequency in document
tf_docs_list = []

for content in tqdm(docs.values()):
    tf_doc = []
    for word in words:
        tf_doc.append(content.count(word))
    tf_docs_list.append(tf_doc)
tf_docs_npy = np.array(tf_docs_list)
np.save('min_tf_docs_npy', tf_docs_npy)

# document frequency
df_list = []

for word in tqdm(words):
    count = 0
    for content in docs.values():
        if word in content:
            count += 1
    df_list.append(count)
df_npy = np.array(df_list)
np.save('min_df_npy', df_npy)

# term frequency in query
tf_queries_list = []

for content in tqdm(queries.values()):
    tf_query = []
    for word in words:
        tf_query.append(content.count(word))
    tf_queries_list.append(tf_query)
tf_queries_npy = np.array(tf_queries_list)
np.save('min_tf_queries_npy', tf_queries_npy)

# inverse document frequency
idf = []
docs_len = len(docs)

for df in tqdm(df_npy):
    idf.append(np.log((docs_len - df + 0.5) / (df + 0.5)))
idf_npy = np.array(idf)
np.save('min_idf_npy', idf_npy)

## Load calculated matrix (save calculate time)
tf_docs_npy = np.load('min_tf_docs_npy.npy')
tf_queries_npy = np.load('min_tf_queries_npy.npy')
df_npy = np.load('min_df_npy.npy')
idf_npy = np.load('min_idf_npy.npy')

## BM25 calculate

K1 = 0.28
K3 = 1000
b = 0.85

avg_doclen = 0
for doc in docs.values():
    avg_doclen += len(doc)
avg_doclen /= len(docs)

queries_result = []

for query_id, query in tqdm(queries.items()):
    query_result = []
    query_index = query_list.index(query_id)
    for doc_name, doc_content in docs.items():
        bm25_weight = 0
        doc_index = doc_list.index(doc_name)
        doc_len = len(doc_content)
        for word in query:
            word_index = words.index(word)
            tf_ij = tf_docs_npy[doc_index][word_index]
            tf_iq = tf_queries_npy[query_index][word_index]
            idf_i = idf_npy[word_index]
            single_term_weight = idf_i * (K1 + 1) * tf_ij / (tf_ij + K1 * ((1 - b) + b * doc_len / avg_doclen)) # * (K3 + 1) * tf_iq / (K3 + tf_iq)
            bm25_weight += single_term_weight
        query_result.append(bm25_weight)
    queries_result.append(query_result)

## sort and export result
sim_df = pd.DataFrame(queries_result)
sim_df = sim_df.transpose()
sim_df.index = doc_list
sim_df.columns = query_list

# save results
now = datetime.datetime.now()
save_filename = 'results/result' + '_' + now.strftime("%y%m%d_%H%M") + '.txt'
print(save_filename)

with open(save_filename, 'w') as f:
    f.write('Query,RetrievedDocuments\n')
    for query in query_list:
        f.write(query + ",")
        query_sim_df = sim_df[query].sort_values(ascending=False)
        f.write(' '.join(query_sim_df.index.to_list()) + '\n')



100%|█████████████████████████████████████| 5000/5000 [00:00<00:00, 9294.23it/s]
100%|██████████████████████████████████████| 100/100 [00:00<00:00, 13201.26it/s]
100%|██████████████████████████████████████| 5000/5000 [00:18<00:00, 277.04it/s]
100%|█████████████████████████████████████████| 329/329 [00:15<00:00, 21.18it/s]
100%|██████████████████████████████████████| 100/100 [00:00<00:00, 19039.92it/s]
100%|█████████████████████████████████████| 329/329 [00:00<00:00, 208858.18it/s]
100%|█████████████████████████████████████████| 100/100 [00:27<00:00,  3.58it/s]

results/result_221011_0138.txt





FileNotFoundError: [Errno 2] No such file or directory: 'results/result_221011_0138.txt'

In [4]:
sim_df

Unnamed: 0,1007770,1034514,1048052,1070173,108370,1147277,1152094,1163217,1164600,1174120,...,899962,915393,923647,947138,95344,985052,988051,992991,99841,999874
D1000312,-5.890479,3.600115,0.560964,-2.444556,-2.584956,-5.702228,-7.381344,-3.568347,-10.981337,0.0,...,0.000000,-2.444556,0.881336,-0.107721,0.000000,-1.636678,-1.636678,-1.636678,-2.584956,-1.636678
D1001022,-7.416475,-2.504915,0.000000,-0.929929,1.421330,-5.750652,-4.524813,-3.585196,-9.343240,0.0,...,3.478597,-2.501886,0.003028,1.201016,0.000000,-2.504915,-2.504915,-2.504915,-2.131967,-2.504915
D1005788,-5.166622,-2.038863,0.485705,-2.524568,-2.616241,-5.721429,-6.375848,-3.560088,-11.264243,0.0,...,0.153203,-2.522127,0.990838,1.291544,0.000000,-1.767472,-1.767472,-1.767472,-2.616241,0.619732
D1006884,-7.588494,-2.516022,3.460840,-2.516022,2.223152,-5.748945,-7.356888,-3.572913,-11.679913,0.0,...,-0.096236,-2.516022,0.000000,-0.121517,0.000000,-2.516022,-2.516022,-2.516022,-2.637676,-2.516022
D1006973,-3.386996,0.574508,0.574508,0.000000,0.000000,-3.081591,-4.725878,-3.386996,-10.720530,0.0,...,0.002887,0.002887,0.002887,0.847997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D996854,-3.499425,0.000000,0.000000,0.000000,-2.526829,0.000000,-3.934125,-3.499425,-11.005308,0.0,...,-0.077925,0.000000,0.000000,1.722884,0.000000,0.000000,0.000000,0.000000,-2.526829,0.000000
D997818,-7.524684,-2.505295,0.000000,-2.505295,-2.616403,-5.661158,-7.424399,-3.468401,-11.075765,0.0,...,0.002749,-2.502546,0.002749,1.762243,2.406185,-2.505295,-2.505295,-2.505295,-0.463336,-2.505295
D997919,-4.556056,-2.002039,0.523731,0.092787,-1.578617,-3.485771,-3.844846,-3.566653,-11.356017,0.0,...,-0.083565,0.862504,0.992786,3.337597,0.000000,0.256940,-1.679077,-1.679077,1.554042,0.311300
D998373,-7.281079,-1.916493,0.556384,-2.472877,-2.634404,-5.681581,-7.366838,-3.517133,-11.210289,0.0,...,-0.051846,-2.469995,0.002882,0.721028,0.000000,-2.472877,-2.472877,-2.472877,-2.634404,-2.472877
