In [1]:
from IPython.core.display import display, HTML
import bisect
from collections import defaultdict
import jieba
import pickle

class MySearchC4V0():
    """
    Keyword search engine over a list of documents, backed by a
    word -> doc-id-set cache.

    C3V0: Base class for Search Engine.
    C3V1: Data multiplication added.
    C3V2: Sorting optimization.
    C3V3: Add lowered version of docs.
    C3V4: For long doc.
    C3V5: Caching search results.
    C3V6: Pre-caching all words in docs.
    C3V7: Add Serialize/UnSerialize.
    ----------------C4V0-----------------

    Parameters
    ----------
    filename : str
        file name of doc data; a name ending in 'txt' is read as one
        document per line, a name ending in 'dat' is read as a pickle
        written by save_data.
    multi_factor : int
        data multiplication factor (default 1); only applied when loading
        a 'txt' file.

    Attributes
    ----------
    docs : list of str
        original documents.
    docs_lower : list of str
        lower-cased copy of docs, index-aligned with docs.
    search_cache : mapping of str -> set of int
        doc ids known to contain each word.

    Methods
    -------
    load_data(filename):
        load data from file.
    save_data(filename):
        save data to file
    pre_cache_all():
        Pre-caching all words in docs.
    highlight(text, keyword, ori_text):
        highlight keyword occurrences in ori_text.
    score(text, keyword):
        get score of text for a query.
    get_word_match(keyword):
        get doc set containing keyword.
    search(keyword, num=15):
        get top num search results of a query.
    render(result_list, keyword):
        output search results with highlight.
    """

    def __init__(self, filename, multi_factor=1):
        self.docs = []
        self.docs_lower = []
        self.search_cache = defaultdict(set)
        self.multi_factor = multi_factor
        self.load_data(filename)

    def highlight(self, text, keyword, ori_text):
        """
        Wrap every occurrence of keyword in a red <span>.

        Matches are located in ``text`` (the lower-cased doc) and the same
        index ranges are copied from ``ori_text`` so the original casing is
        kept. Returns ``ori_text`` unchanged when the keyword is empty or
        does not occur.
        """
        # An empty keyword "matches" at every position; nothing to highlight.
        if not keyword:
            return ori_text
        pieces = []
        cursor = 0
        idx = text.find(keyword)
        while idx >= 0:
            end = idx + len(keyword)
            pieces.append(ori_text[cursor:idx])
            pieces.append(
                '<span style="color:red">{}</span>'.format(ori_text[idx:end])
            )
            cursor = end
            idx = text.find(keyword, cursor)
        pieces.append(ori_text[cursor:])
        return ''.join(pieces)

    def score(self, text, keyword):
        """Score a doc as the number of non-overlapping keyword occurrences."""
        return text.count(keyword)

    def get_word_match(self, keyword):
        """
        Return the set of doc ids whose lower-cased text contains keyword.

        Uses search_cache when the keyword is already cached; otherwise
        scans every doc once and stores the result (even if empty).
        """
        if keyword in self.search_cache:
            return self.search_cache[keyword]
        result_set = {
            tid for tid, title in enumerate(self.docs_lower) if keyword in title
        }
        self.search_cache[keyword] = result_set
        return result_set

    def search(self, keyword, num=15):
        """
        Return up to ``num`` doc ids for keyword, best score first.

        Ties are broken by ascending doc id so the result is deterministic.
        A non-positive ``num`` yields an empty list.
        """
        if num <= 0:
            return []
        keyword_lower = keyword.lower()
        scored = []
        for tid in self.get_word_match(keyword_lower):
            doc = self.docs_lower[tid]
            # Guard kept from the original implementation: only docs that
            # really contain the keyword are scored.
            if keyword_lower in doc:
                scored.append((tid, self.score(doc, keyword_lower)))
        # Sorting the full candidate list keeps the result correct both when
        # there are fewer than ``num`` matches and when there are more.
        scored.sort(key=lambda pair: (-pair[1], pair[0]))
        return [tid for tid, _ in scored[:num]]

    def render(self, result_list, keyword):
        """Display each result as a numbered, highlighted HTML snippet."""
        keyword_lower = keyword.lower()
        for rank, doc_id in enumerate(result_list, start=1):
            snippet = self.highlight(
                self.docs_lower[doc_id],
                keyword_lower,
                self.docs[doc_id]
            ).replace('$$$', '<br/>')
            # NOTE(review): the 150-character cut is applied after markup is
            # inserted, so it can fall inside a <span>/<br/> tag.
            display(HTML("{}、{}......".format(rank, snippet[:150])))

    def pre_cache_all(self):
        """Segment every lower-cased doc with jieba and index each word."""
        for tid, doc in enumerate(self.docs_lower):
            for word in jieba.cut_for_search(doc):
                self.search_cache[word].add(tid)

    def load_data(self, filename):
        """
        Load documents from ``filename``.

        * name ending in 'txt': UTF-8 text, one doc per line; docs are
          repeated multi_factor times and the word cache is rebuilt.
        * name ending in 'dat': pickle produced by save_data; multi_factor
          is not applied.

        Raises
        ------
        ValueError
            if the file name ends in neither 'txt' nor 'dat'.
        """
        if filename.endswith('txt'):
            with open(filename, 'r', encoding='utf-8') as f:
                self.docs = f.read().split('\n')
            self.docs_lower = [doc.lower() for doc in self.docs]
            self.docs = self.docs * self.multi_factor
            self.docs_lower = self.docs_lower * self.multi_factor
            self.pre_cache_all()
        elif filename.endswith('dat'):
            # SECURITY: pickle.load can execute arbitrary code; only load
            # files that were written by save_data from a trusted source.
            with open(filename, 'rb') as f:
                self.docs, self.docs_lower, self.search_cache = pickle.load(f)
        else:
            raise ValueError(
                "Unsupported data file {!r}: expected a name ending in "
                "'txt' or 'dat'".format(filename)
            )

    def save_data(self, filename):
        """Pickle (docs, docs_lower, search_cache) to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump((self.docs, self.docs_lower, self.search_cache), f)

In [2]:
# Build the engine from the raw text file; this also pre-caches every word.
searcher = MySearchC4V0('c:/python data/titles_l.txt', multi_factor=1)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\01630\AppData\Local\Temp\jieba.cache
Loading model cost 0.587 seconds.
Prefix dict has been built successfully.


In [3]:
# Serialize docs, lowered docs and the word cache so later runs can skip segmentation.
searcher.save_data('c:/python data/titles.dat')

In [4]:
# Rebuild a second engine from the pickled file written above.
searcher123 = MySearchC4V0('c:/python data/titles.dat', multi_factor=1)

In [5]:
# Show the 12 best matches for a single keyword.
search_result = searcher123.search("手机", num=12)
searcher123.render(search_result, "手机")

------------
布尔查询是指利用 AND、OR、NOT 等操作符把多个检索关键词组合起来的复合查询；每个关键词对应一个文档编号集合，因此可以直接用集合（set）的交、并、差运算来实现。

In [12]:
# Three toy posting sets used to demonstrate boolean set algebra.
w1 = {2, 4, 8, 16, 64, 128}
w2 = {1, 2, 3, 5, 8, 16, 21, 34}
w3 = {13, 16}

In [13]:
# `-` binds tighter than `&`, so the bare expression parses like this.
w1 & (w2 - w3)

{2, 8}

In [17]:
# Doc ids cached for the word '手机'.
searcher.search_cache['手机']

{0,
 7,
 15,
 16,
 32,
 36,
 39,
 51,
 52,
 53,
 55,
 57,
 65,
 68,
 75,
 82,
 85,
 90,
 91,
 94,
 102,
 107,
 122,
 135,
 137,
 156,
 159,
 161,
 165,
 176,
 177,
 179,
 188,
 189,
 190,
 194,
 207,
 208,
 210,
 213,
 215,
 222,
 232,
 247,
 268,
 270,
 271,
 273,
 274,
 275,
 276,
 278,
 314}

In [18]:
# Doc ids cached for the word '电脑'.
searcher.search_cache['电脑']

{44,
 47,
 53,
 66,
 80,
 94,
 119,
 147,
 149,
 156,
 165,
 172,
 191,
 196,
 247,
 258,
 262,
 263}

In [21]:
# AND query: docs that contain both words.
searcher.search_cache['手机'] & searcher.search_cache['电脑']

{53, 94, 156, 165, 247}

In [23]:
# Print a 150-character preview of every doc that contains both words.
both_words = searcher.search_cache['手机'] & searcher.search_cache['电脑']
for doc_id in both_words:
    preview = searcher.docs[doc_id][:150]
    print(preview + '......')

芯片行业：正猛烈巨变$$$五年前，英特尔的市值相当于英伟达和AMD的总和。如今，英伟达的市值与另外两家公司相当。台积电的股价也在飙升，反映出半导体行业在过去10年发生了巨大的变化。就在不久前，英特尔公司还是美国芯片制造商中无可置疑的王者，也是市值最大的半导体公司。而且大多数计算都是通过个人电脑完成的......
一张仅6分钱！首款鸿蒙打印机便宜了：到手1749元$$$去年9月，全球首款搭载HarmonyOS的激光打印机——华为PixLab X1正式发布，打印机支持靠近配网、一碰打印、远程打印、身份证智能复印等一系列功能。今日，@华为终端公司 官微宣布，4月18日起至4月19日，作为首款搭载HarmonyOS......
5399元！华为MateBook 14非触屏版开售：11代酷睿+2K屏$$$前不久，华为发布MateBook 14非触屏版新款笔记本电脑，与此前发布的MateBook 14不同的是，显示方面，采用一块14英寸的2K全面屏，拥有3:2屏幕比例、90%屏占比，支持100%sRGB广色域、无频闪DC调光、......
电脑能玩手机App还能互传文件！快来试试这神器$$$想把手机屏幕投影到电脑屏幕上，还想着使用鼠标键盘甚至大触摸屏来控制手机？AnLink可以满足你。AnLink一款免费的电脑控制手机软件，对比单纯的投屏，还有把手机音频在电脑音箱中播放出来。也就是相当于把手机“摆”上了电脑屏幕，那样你不仅有了大屏，也......
峰米跃升中国家用投影第三：激光投影技术实现降维打击$$$中国家用投影机市场的格局已经变了，这一次不仅是销量、销售额的增长。国际知名研究机构IDC发布的《2021年第四季度中国投影机市场跟踪报告》显示，去年国内家用投影机出货量348万台，同比增长16%，销售额124亿元也增长了18,3%。在厂商排名中......


query='手机 + (电脑 not 飞机)' => '手机&(电脑-飞机)'（Python 中 `-` 的优先级高于 `&`，因此也可以写成 '手机&电脑-飞机'）

In [34]:
# A set expression held in a string can be evaluated with eval().
eval("searcher.search_cache['手机'] & searcher.search_cache['电脑']")

{53, 94, 156, 165, 247}

In [36]:
def get_word_match(word):
    """Return the doc-id set cached for ``word`` in the global ``searcher``."""
    posting_set = searcher.search_cache[word]
    return posting_set

In [37]:
# Same AND query, expressed through the get_word_match helper.
eval("get_word_match('手机') & get_word_match('电脑')")

{53, 94, 156, 165, 247}

操作符映射：AND / and / + => &；OR / or => |；NOT / not => -；括号 ( 和 ) 原样保留；空白（" "、''）只作为分隔符，不产生任何输出。

In [48]:
def query_to_set_expression(query, verbose=True):
    """
    Translate a boolean query into a Python set expression string.

    Tokens are separated by whitespace; parentheses are split off as their
    own tokens. Operator tokens are matched case-insensitively:
    ``and`` / ``+`` -> ``&``, ``or`` -> ``|``, ``not`` -> ``-``.
    Every other token is a search term; consecutive term tokens are
    concatenated into one term, which is emitted as
    ``get_word_match(<term>)``.

    Parameters
    ----------
    query : str
        the boolean query, e.g. ``'手机 AND 电脑 NOT 咖啡'``.
    verbose : bool
        when True (default) print the token list and each emitted part.

    Returns
    -------
    str
        the set expression, or ``''`` for a query without tokens.
    """
    operator_map = {'and': '&', '+': '&', 'or': '|', 'not': '-'}
    tokens = query.replace('(', ' ( ').replace(')', ' ) ').split()
    if verbose:
        print("seg result:", ' '.join(tokens))

    expression_parts = []

    def emit(part):
        expression_parts.append(part)
        if verbose:
            print('new_parts:{}'.format(part))

    pending_term = ''
    for token in tokens:
        if token in ('(', ')'):
            symbol = token
        else:
            symbol = operator_map.get(token.lower())
        if symbol is None:
            # Plain term token: keep accumulating until an operator,
            # a parenthesis, or the end of the query closes the term.
            pending_term += token
            continue
        if pending_term:
            # repr() quotes and escapes the term, so a quote character in
            # the query cannot break out of the string literal.
            emit("get_word_match({!r})".format(pending_term))
            pending_term = ''
        emit(symbol)
    if pending_term:
        emit("get_word_match({!r})".format(pending_term))
    return ''.join(expression_parts)

    
            


In [46]:
# Translate a sample boolean query; the trace of emitted parts is printed.
query_to_set_expression('手机 AND 电脑 NOT 咖啡')

seg result: 手机 AND 电脑 NOT 咖啡
new_parts:get_word_match('手机')
new_parts:&
new_parts:get_word_match('电脑')
new_parts:-
new_parts:get_word_match('咖啡')


"get_word_match('手机')&get_word_match('电脑')-get_word_match('咖啡')"

In [1]:
# An empty query translates to an empty expression, and eval('') raises
# SyntaxError, so only evaluate when there is something to evaluate.
# Requires the cells defining query_to_set_expression to have been run.
empty_expression = query_to_set_expression('')
eval(empty_expression) if empty_expression else set()

NameError: name 'query_to_set_expression' is not defined

In [63]:
from IPython.core.display import display, HTML
import bisect
from collections import defaultdict
import jieba
import pickle

class MySearchC4V1(MySearchC4V0):
    """
    Search engine with basic boolean queries (AND / OR / NOT, parentheses).

    C3V0: Base class for Search Engine.
    C3V1: Data multiplication added.
    C3V2: Sorting optimization.
    C3V3: Add lowered version of docs.
    C3V4: For long doc.
    C3V5: Caching search results.
    C3V6: Pre-caching all words in docs.
    C3V7: Add Serialize/UnSerialize.
    ----------------C4V0-----------------
    C4V1: Add basic bool query

    Attributes
    ----------
    filename : str
        file name of doc data
    multi_factor : int
        data multiplication factor(default 1)

    Methods
    -------
    get_word_match(word):
        get the cached doc-id set of a word.
    query_to_set_expression(query, verbose=True):
        translate a boolean query into a set expression string.
    search(query, num=15):
        get up to num doc ids matching a boolean query.

    All other methods are inherited from MySearchC4V0.
    """

    # Query operator token (lower-cased) -> Python set operator.
    _OPERATORS = {'and': '&', '+': '&', 'or': '|', 'not': '-'}

    def get_word_match(self, word):
        """
        Return the doc-id set cached for ``word`` (empty set if unknown).

        Uses ``.get`` so that looking up an unknown word does not insert an
        empty entry into the defaultdict cache.
        """
        return self.search_cache.get(word, set())

    def query_to_set_expression(self, query, verbose=True):
        """
        Translate a boolean query into a Python set expression string.

        Operator tokens are matched case-insensitively: ``and`` / ``+``
        -> ``&``, ``or`` -> ``|``, ``not`` -> ``-``; parentheses are kept.
        Consecutive non-operator tokens are concatenated into one term,
        emitted as ``get_word_match(<term>)``.

        Parameters
        ----------
        query : str
            the boolean query.
        verbose : bool
            when True (default) print the token list and each emitted part.

        Returns
        -------
        str
            the set expression, or ``''`` for a query without tokens.
        """
        tokens = query.replace('(', ' ( ').replace(')', ' ) ').split()
        if verbose:
            print("seg result:", ' '.join(tokens))

        expression_parts = []

        def emit(part):
            expression_parts.append(part)
            if verbose:
                print('new_parts:{}'.format(part))

        pending_term = ''
        for token in tokens:
            if token in ('(', ')'):
                symbol = token
            else:
                symbol = self._OPERATORS.get(token.lower())
            if symbol is None:
                # Plain term token: accumulate until the term is closed by
                # an operator, a parenthesis, or the end of the query.
                pending_term += token
                continue
            if pending_term:
                # repr() quotes and escapes the term so that user input
                # cannot break out of the string literal passed to eval.
                emit("get_word_match({!r})".format(pending_term))
                pending_term = ''
            emit(symbol)
        if pending_term:
            emit("get_word_match({!r})".format(pending_term))
        return ''.join(expression_parts)

    def search(self, query, num=15):
        """
        Return up to ``num`` doc ids matching a boolean query.

        Every match has the same score, so results are returned in
        ascending doc-id order. An empty query or a non-positive ``num``
        yields an empty list.

        Raises
        ------
        SyntaxError, TypeError
            propagated from evaluating a malformed expression, e.g. a
            trailing operator or a query that starts with NOT.
        """
        query_new = self.query_to_set_expression(query.lower())
        if not query_new or num <= 0:
            return []
        # Evaluate with an explicit namespace: the expression may only call
        # this instance's get_word_match, not notebook globals or builtins.
        matched = eval(
            query_new,
            {'__builtins__': {}},
            {'get_word_match': self.get_word_match},
        )
        return sorted(matched)[:num]

In [64]:
# Rebind `searcher` to the boolean-query engine, loaded from the pickle.
searcher = MySearchC4V1('c:/python data/titles.dat', multi_factor=1)

In [66]:
# Run a boolean query and render up to 10 matching docs.
query = '电脑 AND 手机 NOT 咖啡'
search_result = searcher.search(query, num=10)
searcher.render(search_result, query)

seg result: 电脑 and 手机 not 咖啡
new_parts:get_word_match('电脑')
new_parts:&
new_parts:get_word_match('手机')
new_parts:-
new_parts:get_word_match('咖啡')
