In [1]:
import math
from six import iteritems
from six.moves import xrange
import codecs
from gensim import corpora
from gensim.summarization import bm25
from gensim.summarization.bm25 import get_bm25_weights
import os
import re

In [2]:
# BM25 parameters.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


class BM25(object):

    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(map(lambda x: float(len(x)), corpus)) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.corpus_size / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores


def get_bm25_weights_own(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys())

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)

    return weights

In [3]:
from stop_words import get_stop_words

stopwords = get_stop_words('en')

In [4]:
stop_flag = ['x', 'c', 'u','d', 'p', 't', 'uj', 'm', 'f', 'r']
def tokenization(filename):
    result = []
    dirname = './text/'
    f = open(dirname+filename, 'r')
    words = [word for line in f for word in line.split()]
    for word in words:
        if word not in stopwords:
            result.append(word)
    f.close()
    return result

In [5]:
corpus = [];
dirname = './text'
filenames = []
for root,dirs,files in os.walk(dirname):
    files = [f for f in files if not f[0] == '.']
    dirs[:] = [d for d in dirs if not d[0] == '.']
    for f in files:
        corpus.append(tokenization(f))
        filenames.append(f)
    
dictionary = corpora.Dictionary(corpus)
print (len(dictionary))

12566


In [6]:
doc_vectors = [dictionary.doc2bow(text) for text in corpus]
vec1 = doc_vectors[0]
vec1_sorted = sorted(vec1, key= lambda x:x[1], reverse=True)
for term, freq in vec1_sorted[:5]:
    print(dictionary[term])

fundraising
diabetes
help
registered
england


In [7]:
bm25Model = BM25(corpus)
average_idf = sum(map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(bm25Model.idf.keys())

In [8]:
result = get_bm25_weights(corpus, n_jobs=-1)

In [9]:
result1 = get_bm25_weights_own(corpus)

In [10]:
query_str = 'diabetes mellitus and body fat mass'
query = []
for word in query_str.strip().split():
    query.append(word)
print(query)
scores = bm25Model.get_scores(query, average_idf)
# scores.sort(reverse=True)

['diabetes', 'mellitus', 'and', 'body', 'fat', 'mass']


In [11]:
import heapq
topk = 5
temp = scores.copy()
max_number = heapq.nlargest(topk, temp) 
max_index = []
for t in max_number:
    index = temp.index(t)
    max_index.append(index)
    temp[index] = 0

print(max_number)
print(max_index)
topk_file = []
for i in range(topk):
    topk_file.append(filenames[max_index[i]])

[11.588801946393495, 9.177087032697278, 8.652530483775921, 8.470162479120084, 7.871901035380452]
[301, 205, 342, 701, 297]


In [100]:
idx = scores.index(max(scores))
idx

301

In [101]:
fname = filenames[idx]
print(fname)

576.txt


In [102]:
urls = []
dirname = './text/'
for filename in topk_file:
    with open(dirname+filename, 'r') as f:
        doc = f.read()
        doc = doc.split(' ')
        words = []
        for i in range(len(doc)):
            if i == 0:
                urls.append(doc[i])
    f.close()

In [103]:
urls

['https://www.diabetes.org.uk/diabetes-the-basics/related-conditions',
 'https://www.diabetes.org.uk/about_us/news/weight-loss-type-2-diabetes-remission-direct-latest',
 'https://www.diabetes.org.uk/guide-to-diabetes/enjoy-food/eating-with-diabetes/out-and-about/holiday-eating',
 'https://www.diabetes.org.uk/guide-to-diabetes/enjoy-food/eating-with-diabetes/what-is-a-healthy-balanced-diet/processed-and-red-meat',
 'https://www.diabetes.org.uk/professionals/resources/shared-practice/psychological-care/emotional-health-professionals-guide']

In [109]:
topk_file

['576.txt', '572.txt', '239.txt', '669.txt', '238.txt']