In [1]:
import math
from six import iteritems
from six.moves import xrange
import unicodedata
import nltk
import six
from nltk.corpus import mac_morpho
# Fetch the NLTK data packages used by this notebook. Each call is a no-op when
# the package is already present locally.
# 'mac_morpho' backs the `nltk.corpus.mac_morpho` reader imported above.
nltk.download('mac_morpho')
# 'punkt' is presumably the tokenizer model needed by `nltk.word_tokenize`
# (used on the query further down) -- TODO confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package mac_morpho to
[nltk_data]     C:\Users\hiren\AppData\Roaming\nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hiren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class BM25(object):
    """BM25 relevance scoring over a corpus of tokenized documents.

    The corpus is a sized sequence of documents, each document being a
    sequence of hashable terms (e.g. a list of strings). Per-document term
    frequencies, document frequencies and IDF values are computed once, at
    construction time.

    Attributes:
        corpus_size: number of documents in the corpus.
        dl: length (number of terms) of each document, as floats.
        avgdl: mean document length (0.0 for an empty corpus).
        f: one ``{term: count}`` dict per document.
        df: ``{term: number of documents containing the term}``.
        idf: ``{term: inverse document frequency}``.
        average_idf: mean of all IDF values (0.0 when there are no terms).
    """

    # Term-frequency saturation parameter (k1) of the scoring formula.
    PARAM_K1 = 1.2
    # Strength of the document-length normalization (b) of the scoring formula.
    PARAM_B = 0.75
    # Terms whose IDF is negative are scored with EPSILON * average_idf instead.
    EPSILON = 0.25

    def __init__(self, corpus):
        """Index `corpus`, a sized sequence of tokenized documents."""
        self.corpus_size = len(corpus)
        self.dl = [float(len(d)) for d in corpus]
        # An empty corpus has no average length; use 0.0 instead of dividing by zero.
        self.avgdl = sum(self.dl) / self.corpus_size if self.corpus_size else 0.0
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.average_idf = 0
        self._initialize()

    def _initialize(self):
        """Fill `f`, `df`, `idf` and `average_idf` from the corpus."""
        for document in self.corpus:
            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.f.append(frequencies)

            # Each distinct word of the document counts once towards its
            # document frequency.
            for word in frequencies:
                self.df[word] = self.df.get(word, 0) + 1

        for word, freq in self.df.items():
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

        # With no terms at all (empty corpus, or only empty documents) there
        # is nothing to average; keep 0.0 rather than dividing by zero.
        if self.idf:
            self.average_idf = sum(self.idf.values()) / len(self.idf)
        else:
            self.average_idf = 0.0

    def _get_score(self, document, index):
        """Return the BM25 score of corpus document `index` for the query terms in `document`.

        Query terms absent from the indexed document contribute nothing; a
        term repeated in the query contributes once per occurrence.
        """
        score = 0
        frequencies = self.f[index]
        if not frequencies:
            # An empty document matches no term. Returning here also keeps the
            # division by `avgdl` below away from the all-documents-empty case.
            return score

        # Length normalization depends only on the document, not on the query
        # word, so it is computed once per document.
        length_norm = self.PARAM_K1 * (1 - self.PARAM_B + self.PARAM_B * self.dl[index] / self.avgdl)
        for word in document:
            if word not in frequencies:
                continue
            # A term present in more than half of the documents gets a negative
            # IDF; it is replaced by a fraction of the average IDF.
            idf = self.idf[word] if self.idf[word] >= 0 else self.EPSILON * self.average_idf
            term_freq = frequencies[word]
            score += (idf * term_freq * (self.PARAM_K1 + 1)
                      / (term_freq + length_norm))
        return score

    def _get_scores(self, document):
        """Return the list of scores of every corpus document for the query `document`."""
        return [self._get_score(document, index) for index in range(self.corpus_size)]

    def ranked(self, query, length):
        """Returns the `length` most relevant documents according to `query`.

        The result is a tuple of corpus indexes ordered by decreasing score.
        Documents with equal scores keep their corpus order. An empty corpus
        yields an empty tuple.
        """
        scores = self._get_scores(query)
        # sorted() is stable even with reverse=True, so ties stay in corpus order.
        order = sorted(range(len(scores)), key=lambda index: scores[index], reverse=True)
        return tuple(order[:length])

    @staticmethod
    def _unpack(tuples):
        """Transpose an iterable of tuples (kept for backward compatibility)."""
        return zip(*tuples)

In [3]:
def normalize_terms(terms):
    """Return a new list with every term stripped of diacritics and lowercased."""
    # Other treatments could be added here:
    # - stopword removal
    # - numeral removal
    # - stemming
    normalized = []
    for term in terms:
        normalized.append(remove_diacritics(term).lower())
    return normalized


def remove_diacritics(text, encoding='utf8'):
    """Remove diacritics from bytestring or unicode, returning an unicode string.

    `encoding` is only used to decode `text` when it is a bytestring. The text
    is decomposed with NFKD normalization and every character that is not
    plain ASCII afterwards (combining marks, but also letters that have no
    ASCII decomposition) is dropped.
    """
    nfkd_form = unicodedata.normalize('NFKD', to_unicode(text, encoding))
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    # The bytes are pure ASCII at this point, so decode them as ASCII. Decoding
    # with the caller's `encoding` breaks for ASCII-incompatible codecs such
    # as 'utf-16'.
    return only_ascii.decode('ASCII')


def to_unicode(text, encoding='utf8'):
    """Return `text` as unicode, decoding it with `encoding` unless it already is unicode."""
    already_unicode = isinstance(text, six.text_type)
    return text if already_unicode else text.decode(encoding)

In [4]:
# Normalize every Mac-Morpho sentence (each one a list of tokens) so that
# corpus terms and query terms are compared in the same lowercase,
# diacritic-free form.
news = [normalize_terms(sentence) for sentence in mac_morpho.sents()]
print(repr(news[0]))

# Only the first 1000 normalized sentences are indexed. The indexes returned
# by `ranked` are positions in that slice, which coincide with positions in `news`.
bm25 = BM25(news[:1000])
# The query goes through the same normalization as the corpus.
query = normalize_terms(nltk.word_tokenize('inflacao'))
# Print the 5 best-scoring sentences; `position` is the 0-based rank.
# Sentences that do not contain the query term score 0 and fill the remaining
# slots in corpus order.
for position, index in enumerate(bm25.ranked(query, 5)):
    print('{} - {}'.format(position, ' '.join(news[index])))

['jersei', 'atinge', 'media', 'de', 'cr$', '1,4', 'milhao', 'em', 'a', 'venda', 'de', 'a', 'pinhal', 'em', 'sao', 'paulo']
0 - " ainda ha inflacao e o aumento de prazo vem acompanhado de alguma correcao " , diz sastre
1 - a queda de a inflacao , segundo o empresario , da a o primeiro a chance de negociar o animal por o seu real valor
2 - jersei atinge media de cr$ 1,4 milhao em a venda de a pinhal em sao paulo
3 - programe sua viagem a a exposicao nacional do zebu , que comeca dia 25
4 - safra recorde e disponibilidade de credito ativam vendas de maquinas agricolas
