#### OOP (object oriented programming) 

* 积极锻炼直觉思考和快速类比的能力
* 类：一群有着相同属性和函数的对象的集合
* 对象：集合中的一个事物，这里对应由 class 生成的某一个 object
* 属性：对象的某个静态特征
* 函数：对象的某个动态能力
* 静态函数:与类没有什么关联，最明显的特征便是，第一个参数没有任何特殊性
    - 用来做一些简单独立的任务，既方便测试，也能优化代码结构
    - 可以通过在函数前一行加上 @staticmethod 来表示
    - 没有状态的，不涉及对象的私有变量（没有 self 作为参数），相同的输入能够得到完全相同的输出结果
* 类函数的第一个参数一般为 cls，表示必须传一个类进来
    - 最常用的功能是实现不同的 init 构造函数
* 继承
    - 每个类都有构造函数，继承类在生成对象的时候，是不会自动调用父类的构造函数的，因此必须在 init() 函数中显式调用父类的构造函数。执行顺序是 子类的构造函数 -> 父类的构造函数
    - 函数重写:使子类必须重新写一遍函数，来覆盖掉原有函数
    - 支持多重继承
        - 直接初始化该类的一个父类,要求继承链的最顶层父类必须要继承 object
        - 多个构造函数需要调用， 必须用传统的方法
    - super 来召唤父类的构造函数，而且 python 使用一种叫做方法解析顺序的算法（具体实现算法叫做 C3），来保证一个类只会被初始化一次
* 抽象类
    - 作为父类存在的，一旦对象化就会报错
    - 抽象函数:定义在抽象类之中，子类必须重写该函数才能使用。相应的抽象函数，则是使用装饰器 @abstractmethod 来表示
    - 定义接口
        + 功能方面名称
        + 主流程

In [3]:
class Document():
    """A book-like record with a title, an author and private text content.

    Demonstrates the three kinds of methods: instance methods (operate on
    ``self``), a class method used as an alternate constructor, and a static
    method that needs no instance state.
    """

    # Template used by get_welcome(); '{}' is replaced by the caller's text.
    WELCOME_STR = 'Welcome! The context for this book is {}.'

    def __init__(self, title, author, context):
        """Store title, author and content, and print a trace message.

        Args:
            title: Title of the document.
            author: Author of the document.
            context: The text content; kept in a name-mangled attribute.
        """
        print('init function called.')
        self.title = title
        self.author = author
        # The double underscore triggers name mangling, so the content is not
        # reachable as ``obj.__context`` from outside the class.
        self.__context = context

    @classmethod
    def create_empty_book(cls, title, author):
        """Alternate constructor: build an instance whose content is 'nothing'."""
        return cls(title=title, author=author, context='nothing')

    def get_context_length(self):
        """Return the number of characters in the stored content."""
        return len(self.__context)

    def get_context_lentgh(self):
        """Misspelled legacy name, kept so existing callers keep working.

        Delegates to get_context_length().
        """
        return self.get_context_length()

    def intercept_context(self, length):
        """Truncate the stored content to its first ``length`` characters."""
        self.__context = self.__context[:length]

    @staticmethod
    def get_welcome(context):
        """Return the welcome template filled in with ``context``."""
        return Document.WELCOME_STR.format(context)
  
# Build a book through the class-method constructor; its content is 'nothing'.
empty_book = Document.create_empty_book('What Every Man Thinks About Apart from Sex', 'Professor Sheridan Simove')
# Print the length of the stored content.
print(empty_book.get_context_lentgh())
# A static method can be called through an instance; it uses no instance state.
print(empty_book.get_welcome('indeed nothing'))

init function called.
7
Welcome! The context for this book is indeed nothing.


In [6]:
# 接口：公用方法接口声明，公共构造方法
class Entity():
    """Base class declaring the interface shared by all entity types.

    Subclasses are expected to set ``self.title`` and to override
    get_context_length().
    """

    def __init__(self, object_type):
        """Record the entity kind and print a trace message.

        Args:
            object_type: Label describing the kind of entity.
        """
        print('parent class init called')
        self.object_type = object_type

    def get_context_length(self):
        """Return the length of the entity's content; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class. It is a subclass
                of Exception, so callers catching Exception still work.
        """
        raise NotImplementedError('get_context_length not implemented')

    def print_title(self):
        """Print ``self.title``, which this class does not set itself."""
        print(self.title)

class Document(Entity):
    """Entity of type 'document' whose length is its character count."""

    def __init__(self, title, author, context):
        """Print a trace line, run the parent constructor, store the fields."""
        print('Document class init called')
        # The parent constructor is not invoked automatically; call it explicitly.
        Entity.__init__(self, 'document')
        self.title, self.author, self.__context = title, author, context

    def get_context_length(self):
        """Return the number of characters in the stored content."""
        content = self.__context
        return len(content)
    
class Video(Entity):
    """Entity of type 'video' whose length is the stored video length."""

    def __init__(self, title, author, video_length):
        """Print a trace line, run the parent constructor, store the fields."""
        print('Video class init called')
        # The parent constructor is not invoked automatically; call it explicitly.
        Entity.__init__(self, 'video')
        self.title, self.author, self.__video_length = title, author, video_length

    def get_context_length(self):
        """Return the video length exactly as it was passed to the constructor."""
        duration = self.__video_length
        return duration

# Each constructor prints its own trace line, then the parent's trace line.
harry_potter_book = Document('Harry Potter(Book)', 'J. K. Rowling', '... Forever Do not believe any thing is capable of thinking independently ...')
harry_potter_movie = Video('Harry Potter(Movie)', 'J. K. Rowling', 120)

# object_type is set by the shared parent constructor.
print(harry_potter_book.object_type)
print(harry_potter_movie.object_type)

# print_title() is inherited from Entity and reads the subclass's title.
harry_potter_book.print_title()
harry_potter_movie.print_title()

# Same method name, different implementation per subclass.
print(harry_potter_book.get_context_length())
print(harry_potter_movie.get_context_length())

Document class init called
parent class init called
Video class init called
parent class init called
document
video
Harry Potter(Book)
Harry Potter(Movie)
77
120


In [7]:
# 抽象函数
from abc import ABCMeta, abstractmethod

class Entity(metaclass=ABCMeta):
    """Abstract interface: anything that exposes a readable, writable title.

    The class cannot be instantiated; subclasses must override both methods.
    """

    @abstractmethod
    def get_title(self):
        """Return the title. Concrete subclasses must override this."""

    @abstractmethod
    def set_title(self, title):
        """Store ``title``. Concrete subclasses must override this."""

class Document(Entity):
    """Concrete Entity that keeps its title in a plain instance attribute."""

    def set_title(self, title):
        """Remember ``title`` on the instance."""
        self.title = title

    def get_title(self):
        """Return the title previously stored by set_title()."""
        return self.title

# Document overrides every abstract method, so it can be instantiated.
document = Document()
document.set_title('Harry Potter')
print(document.get_title())

# Left commented out: Entity still has abstract methods, so instantiating it
# raises TypeError.
# entity = Entity()

Harry Potter


##### 流程

* idea 提出之后，开发组和产品组首先会召开产品设计会
* PM（Product Manager，产品经理） 写出产品需求文档，然后迭代
* TL（Team Leader，项目经理）编写开发文档，开发文档中会定义不同模块的大致功能和接口、每个模块之间如何协作、单元测试和集成测试、线上灰度测试、监测和日志等等一系列开发流程

##### 文档

*  Launch Doc （上线文档）中要求用五个单词总结你的文档

In [9]:
class A():
    """Root of the diamond; its constructor only prints enter/leave markers."""

    def __init__(self):
        print('enter A')
        print('leave A')


class B(A):
    """Left branch of the diamond."""

    def __init__(self):
        print('enter B')
        # Delegates to the next class in the instance's MRO, not necessarily A.
        super(B, self).__init__()
        print('leave B')


class C(A):
    """Right branch of the diamond."""

    def __init__(self):
        print('enter C')
        # Delegates to the next class in the instance's MRO.
        super(C, self).__init__()
        print('leave C')


class D(B, C):
    """Bottom of the diamond; its MRO is D -> B -> C -> A -> object."""

    def __init__(self):
        print('enter D')
        # One cooperative super() chain runs each ancestor constructor once.
        super(D, self).__init__()
        print('leave D')

# Instantiate D to print the order in which the constructors are entered and left.
D()

enter D
enter B
enter C
enter A
leave A
leave C
leave B
leave D


<__main__.D at 0x7f15de7f8910>

##### 搜索引擎

* 搜索器:爬虫（crawler）在互联网上大量爬取各类网站的内容，送给索引器
* 索引器:拿到网页和内容后，会对内容进行处理，形成索引（index），存储于内部的数据库等待检索
* 检索器:高效检索后，再将结果返回给用户
* 用户接口:用户通过用户接口，向搜索引擎发出询问（query），询问解析后送达检索器
* BOW Model，即 Bag of Words Model，中文叫做词袋模型
* 实现
    - 文本路径 -> 内容
    - 文本路径 -> 单词
    - 单词 -> 路径

In [1]:
class SearchEngineBase(object):
    """Base class for the search engines in this file.

    Subclasses implement process_corpus() to index one document and search()
    to answer a query; add_corpus() handles reading the file.
    """

    def __init__(self):
        pass

    def add_corpus(self, file_path):
        """Read ``file_path`` as text and pass it to process_corpus().

        The file path itself is used as the document id.
        """
        with open(file_path, 'r') as fin:
            text = fin.read()
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Index the document ``text`` under ``id``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class. It is a subclass
                of Exception, so callers catching Exception still work.
        """
        raise NotImplementedError('process_corpus not implemented.')

    def search(self, query):
        """Return the ids of documents matching ``query``; must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented.')

def main(search_engine, file_paths=None):
    """Index a set of files, then answer queries read from standard input.

    Args:
        search_engine: Object providing ``add_corpus(file_path)`` and
            ``search(query)``; ``search`` must return a sized iterable.
        file_paths: Files to index. When omitted, the five sample files
            under ``../data/`` are used.

    Each input line is one query. The loop ends when standard input is
    exhausted (EOF) instead of failing with an unhandled EOFError.
    """
    if file_paths is None:
        file_paths = ['../data/1.txt', '../data/2.txt', '../data/3.txt', '../data/4.txt', '../data/5.txt']
    for file_path in file_paths:
        search_engine.add_corpus(file_path)
    while True:
        try:
            query = input()
        except EOFError:
            # No more queries (piped input ended or the user sent EOF).
            return
        results = search_engine.search(query)
        print('found {} result(s):'.format(len(results)))
        for result in results:
            print(result)

In [None]:

class SimpleEngine(SearchEngineBase):
    """Naive engine: keeps every document's full text and does substring search."""

    def __init__(self):
        super().__init__()
        # Maps document id -> raw document text.
        self.__id_to_texts = {}

    def process_corpus(self, id, text):
        """Store ``text`` under ``id``, replacing any earlier text for that id."""
        self.__id_to_texts[id] = text

    def search(self, query):
        """Return ids, in insertion order, of documents containing ``query``."""
        return [
            doc_id
            for doc_id, body in self.__id_to_texts.items()
            if query in body
        ]

# Run the query loop with the substring-matching engine.
search_engine = SimpleEngine()
main(search_engine)

sdfads
I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.
I have a dream that one day down in Alabama, with its vicious racists, . . . one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers. I have a dream today.
I have a dream that one day every valley shall be exalted, every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight, and the glory of the Lord shall be revealed, and all flesh shall see it together.
This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work togethe

In [None]:
# 所有的搜索关键词都要出现在同一篇文章中 
import re

class BOWEngine(SearchEngineBase):
    """Bag-of-words engine: a document matches when it contains every query word."""

    def __init__(self):
        super().__init__()
        # Maps document id -> set of lower-cased words found in the document.
        self.__id_to_words = {}

    def process_corpus(self, id, text):
        """Store the word set of ``text`` under ``id``."""
        self.__id_to_words[id] = self.parse_text_to_words(text)

    def search(self, query):
        """Return ids, in insertion order, of documents holding all query words."""
        wanted = self.parse_text_to_words(query)
        return [
            doc_id
            for doc_id, vocabulary in self.__id_to_words.items()
            if self.query_match(wanted, vocabulary)
        ]

    @staticmethod
    def query_match(query_words, words):
        """Return True when every element of ``query_words`` is in ``words``."""
        return all(term in words for term in query_words)

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of lower-cased words.

        Every character that is neither a word character nor a space
        (punctuation, newlines, ...) is treated as a separator.
        """
        normalized = re.sub(r'[^\w ]', ' ', text).lower()
        # Runs of spaces produce empty strings when splitting; drop them.
        return {token for token in normalized.split(' ') if token}

# Run the query loop with the bag-of-words engine.
search_engine = BOWEngine()
main(search_engine)

home
found 0 result(s):
little
found 2 result(s):
../data/1.txt
../data/2.txt
henry
found 0 result(s):


In [None]:

import re

class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words engine backed by an inverted index (word -> document ids).

    A document matches a query when it contains every query word. Matching
    is done by walking the per-word id lists in lockstep.

    NOTE(review): the lockstep walk compares ids with ``min``, so it appears
    to rely on documents being indexed in ascending id order -- confirm
    against callers.
    """

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps word -> list of document ids, in the order documents were added.
        self.inverted_index = {}

    def process_corpus(self, id, text):
        """Append ``id`` to the id list of every distinct word in ``text``."""
        for word in self.parse_text_to_words(text):
            self.inverted_index.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of documents that contain every word of ``query``.

        Returns an empty list when the query contains no words at all (empty
        string or punctuation only) or when any query word is not indexed.
        """
        query_words = list(self.parse_text_to_words(query))

        # Without this guard an empty word list makes ``all()`` below
        # vacuously true and ``current_ids[0]`` raises IndexError.
        if not query_words:
            return []

        # If any query word has no id list, nothing can match.
        if any(word not in self.inverted_index for word in query_words):
            return []

        id_lists = [self.inverted_index[word] for word in query_words]
        # One cursor per query word, pointing into that word's id list.
        cursors = [0] * len(id_lists)

        result = []
        while True:
            # Collect the id currently under each cursor.
            current_ids = []
            for cursor, id_list in zip(cursors, id_lists):
                # One list is exhausted: no further common id is possible.
                if cursor >= len(id_list):
                    return result
                current_ids.append(id_list[cursor])

            # All cursors agree: that document contains every query word.
            if all(doc_id == current_ids[0] for doc_id in current_ids):
                result.append(current_ids[0])
                cursors = [cursor + 1 for cursor in cursors]
                continue

            # Otherwise advance the cursor that sits on the smallest id.
            smallest_pos = current_ids.index(min(current_ids))
            cursors[smallest_pos] += 1

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of lower-cased words.

        Every character that is neither a word character nor a space
        (punctuation, newlines, ...) is replaced by a space first.
        """
        # Replace punctuation and line breaks with spaces.
        text = re.sub(r'[^\w ]', ' ', text)
        # Normalise case.
        text = text.lower()
        # Split on single spaces; consecutive spaces yield empty strings.
        word_list = text.split(' ')
        # Drop the empty strings.
        word_list = filter(None, word_list)
        # Return the distinct words.
        return set(word_list)

# Run the query loop with the inverted-index engine.
search_engine = BOWInvertedIndexEngine()
main(search_engine)

In [None]:
# 加缓存
import pylru

class LRUCache(object):
    """Thin has/get/set wrapper around a ``pylru.lrucache`` instance."""

    def __init__(self, size=32):
        """Create the underlying cache, passing ``size`` to ``pylru.lrucache``."""
        self.cache = pylru.lrucache(size)

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        self.cache[key] = value

    def get(self, key):
        """Return the value stored under ``key`` via item lookup on the cache."""
        return self.cache[key]

    def has(self, key):
        """Return whether ``key`` is currently in the cache."""
        return key in self.cache

class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that remembers the results of earlier queries.

    The raw query string is the cache key.
    """

    def __init__(self):
        # Start the cooperative chain through the engine side of the hierarchy.
        super().__init__()
        # The cache constructor is invoked explicitly as well.
        LRUCache.__init__(self)

    def search(self, query):
        """Return cached results for ``query`` or compute and cache them."""
        if not self.has(query):
            # Cache miss: call the overridden parent implementation.
            fresh = super().search(query)
            self.set(query, fresh)
            return fresh

        print('cache hit!')
        return self.get(query)

# Run the query loop with the cached inverted-index engine.
search_engine = BOWInvertedIndexEngineWithCache()
main(search_engine)