In [21]:
# -*- coding: utf-8 -*-
import re
import math
import string
import jieba
import jieba.posseg as pseg

N = 193

**filter_punctuation:**去除单词中的符号。

In [22]:
def filter_punctuation(words):
    new_words=[]
    illegal_char = string.punctuation + u'-\n.,;《》？！“”‘’@#￥%…&×（）——+【】{};；●，。&～、|\s:：'
    pattern = re.compile('[%s]'%re.escape(illegal_char))
    for word in words:
        new_word = pattern.sub(u'',word)
        if not new_word == u'':
            new_words.append(new_word)
    return new_words

**load_word_list:**从文件中加载单词。

In [23]:
def load_word_list():
    word_list = [[] for i in range(0,N+1)]
    word_set = set()
    for i in range(1,N+1):
        with open('./page/'+ str(i) +'.txt',encoding='utf-8',errors='ignore') as f:
            for line in f.readlines():
                filter_punctuation_words = filter_punctuation(jieba.cut(line))
                for filter_punctuation_word in filter_punctuation_words:
                    word_set.add(filter_punctuation_word)
                word_list[i].extend(filter_punctuation_words)
                # print(word_list[i])
    print(len(word_set))
    return word_list, word_set


**build_inverted_index:**构造倒排索引。

In [24]:
def build_inverted_index(word_list, word_set):
    inverted_index = {}
    for word in word_set:
        indices = []
        for i in range(1,N+1):
            count = 0
            for w in word_list[i]:
                if w == word:
                    count+=1
            indices.append({
                "page": i,
                "count": count
            })
        inverted_index[word]=indices
    with open('inverted_index.txt', 'w', encoding='utf-8') as f:
        f.write(str(inverted_index))
    return inverted_index


**min_distance:**计算单词最短编辑距离。

In [25]:
def min_distance(word1, word2):
    m = len(word1) + 1
    n = len(word2) + 1
    dp = [[0 for j in range(n)] for i in range(m)]

    for i in range(m):
        dp[i][0]=i
    for j in range(n):
        dp[0][j]=j

    for i in range(1, m):
        for j in range(1, n):
            if word1[i-1] == word2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                # 插入、删除、替换
                dp[i][j] = min(dp[i][j-1], dp[i-1][j], dp[i-1][j-1]) + 1

    return dp[m-1][n-1]

def spell_check(search, word_set, inverted_index):
    minn = 99999999
    similar_word = ""
    for word in word_set:
        dis = min_distance(search, word)
        if dis < minn:
            minn = dis
            similar_word = word
    return similar_word

**tf_idf:**计算td/idf值。

In [26]:
def tf_idf(word_set, inverted_index):
    word_td_idf = {} 
    for word in word_set:
        wtd = []
        num = 0
        for file in inverted_index[word]:
            if file["count"] > 0:
                num+=1
        for file in inverted_index[word]:
            wtd.append({
                "page": file["page"],
                "wtd": math.log(1+file["count"], 10)*math.log(N/num,10)
            })
        word_td_idf[word]=wtd
    return word_td_idf
    

In [27]:
word_list, word_set = load_word_list()
print(word_set)

49628
{'CLI', '顶天立地', '应用软件', '非迹', '联合开发', '艰苦', '选点', '拉丁字母', 'Territorial', '龙纳', '渊博', '智商', '冬天', '刘易斯', '叩', 'A', '美墨', '7062', '恐龙', '推理方法', '被误', '书店', '李伟生', '王兆其', '打', 'Sb', '政府首脑', '渗', '批量生产', '具体方法', '21789', '探讨', '54100', '分批', '材料', '参众', '罢了', '顺畅', '载入', '信道编码', '辩证唯物主义', '士大夫', 'Public', '翻译', '氯仿', '物以类聚', '太高', '体育场馆', '姜澜', '君主政体', 'Patton', '811001', '内部', '有病', '风趣', '阿奇博尔德', '风疹', '细胞融合', '毕业典礼', '症候', '四强赛', '一州时', '布克奖', '钐', '庄园', '体总', '完好无损', '昆仑山', '项部', '高寒', '远近', '莫泽', '首创', '意在', 'had', '两女一男', '你会', '完全一致', '分计', '演变', '我校', '物等', '666', '神秘化', '核芯', '高大', '比为', 'complete', '1607', '及近', '拜', '战平', '鳔', '宁静', '立场', '阿默斯', '王泽山', '浙赣', '触须', '雄黄', '沈家', '交通事故', '短语', '百科全书式', 'unimaginative', '侵蚀作用', '磨难', '吕', '宇多田光', '舍勒', '174cm', '病菌', '1775', '730', 'TheGathering', '爱宝', '测毒', '毒力', '城乡居民', '登记器', '怎么', '女主人', '金涌', '该所', '球极', '武装', '直属', '670', 'TIM', '干得', '胜过', 'Intenive', 'Cole', '达美', '单纯形', '煤气化', '外科', 'Google', '墨西哥人', '直通', '契机', '清扫',

In [28]:
inverted_index = build_inverted_index(word_list, word_set)
# print(inverted_index)

In [29]:
word_td_idf = tf_idf(word_set, inverted_index)
# print(word_td_idf)

In [31]:
search = input("请输入你想查询的内容：")
search_words = jieba.cut(search)
page_scores = [0 for i in range(0,N+1)]
for search_word in search_words:
    similar_word = spell_check(search_word, word_set, inverted_index)
    for i in range(1,N+1):
        page_scores[i]+=word_td_idf[similar_word][i-1]["wtd"]
    
ranked_pages=[]
for i in range(1,N+1):
    ranked_pages.append({
        "page":i,
        "score":page_scores[i]
    })

ranked_pages = sorted(ranked_pages, key=lambda page: page["score"], reverse=True)
print(ranked_pages)
# ranked_pages = sorted(page_scores, reverse=True)
# print(similar_word, ":", ranked_pages[0:5])

请输入你想查询的内容：化学方向的科研成果
[{'page': 28, 'score': 1.7385759331221347}, {'page': 147, 'score': 1.6201516021392137}, {'page': 186, 'score': 1.487404374797843}, {'page': 91, 'score': 1.4872254519328267}, {'page': 21, 'score': 1.0531054529365669}, {'page': 117, 'score': 1.0531054529365669}, {'page': 89, 'score': 0.9323057815537351}, {'page': 97, 'score': 0.9077674383318927}, {'page': 64, 'score': 0.8990316710389047}, {'page': 156, 'score': 0.8705749855571069}, {'page': 68, 'score': 0.7997074228875658}, {'page': 62, 'score': 0.7995629295256994}, {'page': 159, 'score': 0.7978623246560825}, {'page': 170, 'score': 0.7641862107223044}, {'page': 188, 'score': 0.7188632746634531}, {'page': 98, 'score': 0.7108569618340719}, {'page': 74, 'score': 0.6905878758299913}, {'page': 67, 'score': 0.6833642000365661}, {'page': 54, 'score': 0.6645640611564319}, {'page': 3, 'score': 0.6458793424005238}, {'page': 168, 'score': 0.6326289480127465}, {'page': 115, 'score': 0.6055141384217795}, {'page': 155, 'score': 0.

实现方法中，自行实现拼写检查（spell_check），倒排索引（build_inverted_index），使用TF/IDF进行文档排序（tf_idf）。输入内容首先进行分词处理，然后使用spell_check来查找距离分词结果最近的单词，使用倒排索引求得tf/idf值，之后将输入所有分词后单词的td/idf值相加后进行排序，输出文档序号。其中使用的tf计算公式为：log10(1+tf)，idf计算公式为：log10(N/df)。