In [1]:
def load_docs():
    sents = [
        'aa bb cc',
        'aa bb cc dd',
        'aa bb cc ee',
        'aa bb cc ee ee',
    ]

    #分词,过滤停用词
    docs = [doc.split() for doc in sents]

    #词表
    words = []
    for doc in docs:
        words.extend(doc)
    words = list(set(words))

    return docs, words


docs, words = load_docs()

docs, words

([['aa', 'bb', 'cc'],
  ['aa', 'bb', 'cc', 'dd'],
  ['aa', 'bb', 'cc', 'ee'],
  ['aa', 'bb', 'cc', 'ee', 'ee']],
 ['cc', 'dd', 'ee', 'aa', 'bb'])

In [2]:
import numpy as np


def get_tf():
    #每个文档中的词频
    tf = np.zeros([len(docs), len(words)])

    for i, doc in enumerate(docs):
        for word in doc:
            j = words.index(word)
            tf[i, j] += 1

    return tf


tf = get_tf()

tf

array([[1., 0., 0., 1., 1.],
       [1., 1., 0., 1., 1.],
       [1., 0., 1., 1., 1.],
       [1., 0., 2., 1., 1.]])

In [3]:
def get_df():
    #每个词在几篇文档中出现
    df = np.empty(len(words))
    for i, word in enumerate(words):

        df[i] = sum([1 for doc in docs if word in doc])

    return df


df = get_df()

df

array([4., 1., 2., 4., 4.])

In [4]:
def get_idf():
    #计算idf,衡量了每个词的独特性,越是不独特的词,越接近无穷大,否则接近0
    idf = np.empty(len(words))

    for i in range(len(words)):
        #未出现该词的文档数
        fenzi = len(docs) - df[i]

        #出现该词的文档数
        fenmu = df[i]

        #平滑
        fenzi += 0.5
        fenmu += 0.5

        #商的取值是0到正无穷
        shang = fenzi / fenmu

        #加1,取对数后非负
        shang += 1

        idf[i] = np.log(shang)

    return idf


idf = get_idf()

idf

array([0.10536052, 1.2039728 , 0.69314718, 0.10536052, 0.10536052])

In [5]:
def get_score(query, i):

    #文档平均长度
    mean_len = np.mean([len(doc) for doc in docs])

    score = 0.0
    for word in query:
        #求两份文档词的交集
        if word not in docs[i]:
            continue

        j = words.index(word)

        k = 1.5

        #词独特性*词频
        fenzi = idf[j] * tf[i, j] * (k + 1)

        #目标句子的长度和平均长度的商,这个数的值域是0到正无穷
        #这意味着,目标句子越长,分母越大,分数也就越低.
        #这也是显而易见的,越长的句子内容越分散,权值越低.
        deviation = len(docs[i]) / mean_len

        #这里的b可以视为一个伸缩系数
        #当b=0时,这一项恒定为1
        #当b=1时,这一项恒定为deviation
        b = 0.75
        deviation = 1 - b + b * deviation

        fenmu = tf[i, j] + k * deviation

        #最终的分数是综合考虑了词的idf,词的数量,目标句子的长度
        score += fenzi / fenmu

    return score


query = 'aa bb cc dd ee ee'.split(' ')

for i in range(len(docs)):
    print(i, get_score(query, i))

0 0.35614822194194823
1 1.5200543512994154
2 1.7023759080933698
3 2.117234755042299
