# 封装成类

In [2]:
from IPython.core.display import display,HTML

'''
搜索引擎基类
---------
属性：
filename:str
    数据的文档名
---------
方法：
load_data(filename):
    从文档中导入数据
hightlight(text,keyword)
    高亮文本中的关键词
score(text,keyword)
    从给查询的文本打分
search(keyword)
    获得查询结果
render(result_list,keyword)
    带高亮显示的输出结果
'''

class MySearchV0():
    def __init__(self,filename):
        self.docs=[]
        self.load_data(filename)
        
    def load_data(self,filename):
        with open(filename) as f:
            self.docs=f.read().split('\n')
            
    def highlight(self,text,keyword):
        idx=text.lower().find(keyword.lower())
        result=text
        if idx>=0:
            ori_keyword=text[idx:idx+len(keyword)]
            result=text.replace(ori_keyword,"<span style='color:red'>{}</span>".format(ori_keyword))
        return result
    def score(self,text,keyword):
        result=text.lower().count(keyword.lower())
        return result
    
    def search(self,keyword):
        result_list=[]
        for tid,title in enumerate(self.docs):
            if keyword.lower() in title.lower():
                result_list.append([tid,self.score(title,keyword)])
        result_list.sort(key=lambda x:x[1],reverse=True)
        return [doc_id for doc_id,_ in result_list]
    
    def render(self,result_list,keyword):
        count=1
        for item in result_list:
            display(HTML("{}、{}".format(count,self.highlight(self.docs[item],keyword))))
            count+=1

In [3]:
help(MySearchV0)

Help on class MySearchV0 in module __main__:

class MySearchV0(builtins.object)
 |  Methods defined here:
 |  
 |  __init__(self, filename)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  highlight(self, text, keyword)
 |  
 |  load_data(self, filename)
 |  
 |  render(self, result_list, keyword)
 |  
 |  score(self, text, keyword)
 |  
 |  search(self, keyword)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [3]:
searcher=MySearchV0('c:/python data/titles.txt')

In [4]:
keyword='手机'
search_result=searcher.search(keyword)

In [5]:
searcher.render(search_result,keyword)

In [7]:
from IPython.core.display import display,HTML

class MySearchV1(MySearchV0):
    '''
    V0: Base Class
    V1: Data vulunm added

    Attributes
    ---------
    filename:str
    multi_factor:int

    methods:
    ---------
    load_data(filename):load data from disk
    hightlight(text,keyword)
        高亮文本中的关键词
    score(text,keyword)
        从给查询的文本打分
    search(keyword)
        获得查询结果
    render(result_list,keyword)
        带高亮显示的输出结果
    '''
    def __init__(self,filename,multi_factor=1):
        self.docs=[]
        self.multi_factor=multi_factor
        self.load_data(filename)
    def load_data(self,filename):
        with open(filename) as f:
            self.docs=f.read().split('\n')
            self.docs=self.docs*self.multi_factor
        


In [8]:
searcher=MySearchV1('c:/python data/titles.txt',100000)

In [9]:
len(searcher.docs)

51100000

In [10]:
%time searcher_list=searcher.search('手机')

Wall time: 34.8 s


In [11]:
%prun searcher_list=searcher.search('手机')

 

In [15]:
from IPython.core.display import display,HTML
import bisect

class MySearchV2(MySearchV1):
    '''
    V0: Base Class
    V1: Data vulunm added
    V2：Sorting optimization

    Attributes
    ---------
    filename:str
    multi_factor:int

    methods:
    ---------
    load_data(filename):load data from disk
    hightlight(text,keyword)
        高亮文本中的关键词
    score(text,keyword)
        从给查询的文本打分
    search(keyword)
        get search result
    render(result_list,keyword)
        带高亮显示的输出结果
    '''
    def search(self,keyword,num=10):
        result_list=[]
        min_score=0
        for tid,title in enumerate(self.docs):
            if keyword.lower() in title.lower():
                score=self.score(title,keyword)
                if len(result_list)==num:
                    if score>min_score:
                        insert_idx=bisect.bisect([doc_score[1] for doc_score in result_list],score)
                        result_list=result_list[1:insert_idx]+[(tid,score)]+result_list[insert_idx:]
                        min_score=result_list[0][1]
                elif len(result_list)<num-1:
                    result_list.append((tid,score))                    
                elif len(result_list)==num-1:
                    result_list.append((tid,score))
                    result_list.sort(key=lambda x:x[1])
                    min_score=result_list[0][1]
        
        return [doc_id for doc_id,_ in result_list[::-1]]

In [16]:
searcher=MySearchV2('c:/python data/titles.txt',100000)

In [28]:
keyword='iphone'
searcher_result=searcher.search(keyword)

In [29]:
searcher.render(searcher_result,keyword)

In [22]:
keyword='手机'
%time search_result=searcher.search(keyword)

Wall time: 33 s


In [23]:
%prun search_result=searcher.search(keyword)

 

In [36]:
from IPython.core.display import display,HTML
import bisect

class MySearchV3(MySearchV2):
    '''
    V0: Base Class
    V1: Data vulunm added
    V2：Sorting optimization
    V3:Add a lowered version of docs

    Attributes
    ---------
    filename:str
    multi_factor:int

    methods:
    ---------
    load_data(filename):load data from disk
    hightlight(text,keyword)
        高亮文本中的关键词
    score(text,keyword)
        从给查询的文本打分
    search(keyword)
        get search result
    render(result_list,keyword)
        带高亮显示的输出结果
    '''
    def __init__(self,filename,multi_factor=1):
        self.docs=[]
        self.docs_lower=[]
        self.multi_factor=multi_factor
        self.load_data(filename)
    def load_data(self,filename):
        with open(filename) as f:
            self.docs=f.read().split('\n')
        self.docs_lower=[doc.lower() for doc in self.docs] #加载的时候小写化，查询不再需要小写化
        self.docs=self.docs*self.multi_factor
        self.docs_lower=self.docs_lower*self.multi_factor
    def highlight(self,text,keyword,ori_text):
        idx=text.find(keyword)
        result=text
        if idx>=0:
            ori_keyword=ori_text[idx:idx+len(keyword)]
            result=ori_text.replace(ori_keyword,"<span style='color:red'>{}</span>".format(ori_keyword))
        return result
    def score(self,text,keyword):    #text是小写版本参数
        result=text.count(keyword)
        return result
    def search(self,keyword,num=10):
        result_list=[]
        min_score=0
        keyword_lower=keyword.lower()
        for tid,title in enumerate(self.docs):
            if keyword_lower in title:     #title小写版本数据
                score=self.score(title,keyword_lower)
                if len(result_list)==num:
                    if score>min_score:
                        insert_idx=bisect.bisect([doc_score[1] for doc_score in result_list],score)
                        result_list=result_list[1:insert_idx]+[(tid,score)]+result_list[insert_idx:]
                        min_score=result_list[0][1]
                elif len(result_list)<num-1:
                    result_list.append((tid,score))                    
                elif len(result_list)==num-1:
                    result_list.append((tid,score))
                    result_list.sort(key=lambda x:x[1])
                    min_score=result_list[0][1]
        
        return [doc_id for doc_id,_ in result_list[::-1]]
    def render(self,result_list,keyword):
        count=1
        for item in result_list:
            result=self.highlight(self.docs_lower[item],keyword.lower(),self.docs[item])
            display(HTML("{}、{}".format(count,result)))
            count+=1

In [37]:
searcher=MySearchV3('c:/python data/titles.txt',100000)

In [38]:
keyword='手机'
search_result=searcher.search(keyword)

In [39]:
searcher.render(search_result,keyword)

In [40]:
keyword='手机'
%time search_result=searcher.search(keyword)

Wall time: 5.03 s


In [41]:
%prun search_result=searcher.search(keyword)

 