<a href="https://colab.research.google.com/github/linkToHeart/jupyter-notebook/blob/main/learn/SearchEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

定义基础类

In [3]:
class SearchEngineBase(object):
  """Base class for the search engines in this notebook.

  Subclasses override ``process_corups`` (to index one document) and
  ``search`` (to answer a query). ``add_corpus`` is the shared entry point
  that reads a file and hands its content to ``process_corups``.
  """

  def __init__(self):
    pass

  def add_corpus(self, file_path):
    """Read the text file at ``file_path`` and index it.

    The file is opened in text mode with the platform default encoding.
    The path itself is passed to ``process_corups`` as the document id.
    """
    with open(file_path, 'r') as fin:
      text = fin.read()
    # The file handle is already closed here; indexing only needs the text.
    self.process_corups(file_path, text)

  def process_corups(self, id, text):
    """Index ``text`` under the identifier ``id``; must be overridden.

    Raises:
      NotImplementedError: always, in this base class.
    """
    # NotImplementedError is still an Exception subclass, so callers that
    # catch Exception keep working.
    raise NotImplementedError('process_corpus not implemented.')

  def search(self, query):
    """Return the results matching ``query``; must be overridden.

    Raises:
      NotImplementedError: always, in this base class.
    """
    raise NotImplementedError('search not implemented.')


定义子类

In [17]:
class SimpleEngine(SearchEngineBase):
  """Search engine that stores each document's raw text and matches by substring."""

  def __init__(self):
    super().__init__()
    # Maps document id -> full document text.
    self.__id_to_texts = {}

  def process_corups(self, id, text):
    """Remember ``text`` under ``id``, replacing any earlier entry for that id."""
    self.__id_to_texts[id] = text

  def search(self, query):
    """Return the ids of all stored documents whose text contains ``query``."""
    return [
        doc_id
        for doc_id, body in self.__id_to_texts.items()
        if query in body
    ]

def main(search_engine,
         base_path='/content/drive/MyDrive/Colab Notebooks/colab data/',
         file_names=('1.txt', '2.txt', '3.txt', '4.txt', '5.txt')):
  """Load a set of files into ``search_engine`` and run an interactive query loop.

  Args:
    search_engine: object providing ``add_corpus(file_path)`` and
      ``search(query)``; ``search`` must return a sized iterable of results.
    base_path: directory containing the corpus files. Defaults to the
      notebook's Google Drive folder.
    file_names: names of the files under ``base_path`` to index.

  Each line read from standard input is used as a query and its results are
  printed. The loop ends, printing ``End``, when input is interrupted or
  exhausted. Any other error (for example a bug inside ``search``) is left
  to propagate instead of being silently reported as ``End``.
  """
  import os

  for file_name in file_names:
    search_engine.add_corpus(os.path.join(base_path, file_name))

  try:
    while True:
      query = input()
      results = search_engine.search(query)
      print('found {} result(s):'.format(len(results)))
      for result in results:
        print(result)
  except (KeyboardInterrupt, EOFError):
    # Only a user interrupt or end of input terminates the session normally.
    print('End')


# Run the interactive query loop with the substring-matching engine.
search_engine = SimpleEngine()
main(search_engine)


low
found 2 result(s):
/content/drive/MyDrive/Colab Notebooks/colab data/3.txt
/content/drive/MyDrive/Colab Notebooks/colab data/5.txt
End


Bag of Words 搜索模型

In [16]:
import re
class BOWEngine(SearchEngineBase):
  """Bag-of-words engine: each document is reduced to its set of distinct words.

  A document matches a query when at least one word of the query appears in
  the document's word set.
  """

  def __init__(self):
    super().__init__()
    # Maps document id -> set of lower-cased words found in that document.
    self.__id_to_words = {}

  def process_corups(self, id, text):
    """Tokenize ``text`` and store the resulting word set under ``id``."""
    self.__id_to_words[id] = self.parse_text_to_words(text)

  def search(self, query):
    """Return the ids of all documents sharing at least one word with ``query``."""
    wanted = self.parse_text_to_words(query)
    return [
        doc_id
        for doc_id, vocabulary in self.__id_to_words.items()
        if self.query_match(wanted, vocabulary)
    ]

  @staticmethod
  def query_match(query_words, words):
    """Return True if any element of ``query_words`` is contained in ``words``."""
    return any(candidate in words for candidate in query_words)

  @staticmethod
  def parse_text_to_words(text):
    """Split ``text`` into a set of lower-cased words.

    Every character that is neither a word character nor a space (which
    covers punctuation and line breaks) is replaced by a space before the
    text is lower-cased and split on spaces; empty pieces are dropped.
    """
    cleaned = re.sub(r'[^\w ]', ' ', text).lower()
    return {token for token in cleaned.split(' ') if token}

# Run the interactive query loop with the bag-of-words engine.
search_engine = BOWEngine()
main(search_engine)

i have a dream
found 4 result(s):
/content/drive/MyDrive/Colab Notebooks/colab data/1.txt
/content/drive/MyDrive/Colab Notebooks/colab data/2.txt
/content/drive/MyDrive/Colab Notebooks/colab data/3.txt
/content/drive/MyDrive/Colab Notebooks/colab data/4.txt
End
