In [1]:
from xml.sax.handler import ContentHandler
from xml.sax import parse
import os
paper_tag = ('article','inproceedings','proceedings','book',
                   'incollection','phdthesis','mastersthesis')

class ResolveHandler(ContentHandler):
    
    def __init__(self, author_dict, titles):
        super().__init__()
        self.author_dict = author_dict
        self.titles = titles
        self.authors = ''
        self.title =''
        self.count = 0
        self.author_flag = False
        self.title_flag = False
        self.paper_flag = False
        
    def startElement(self, name, attrs):
        if self.paper_flag:
            if name == 'author':
                self.author_flag = True
            elif name == 'title':
                self.title_flag = True      
        elif name in paper_tag:
            self.paper_flag = True
                
    def endElement(self, name):
        if self.paper_flag:
            if name == 'author':
                self.authors += ','
                self.author_flag = False
            if name =='title':
                self.title_flag = False
            if name in paper_tag:
                i = 1            #位序
                for author in self.authors.split(','):
                    if author != '':
                        temp_str = str(self.count) + " " + str(i) #索引 位序
                        self.author_dict.setdefault(author, []).append(temp_str)
                        i = i + 1
                self.titles.append(self.title)
                if self.count % 100000 == 0:
                    print('.', end='')
                self.count += 1
                self.authors = ''
                self.title = ''
                self.paper_flag = False
        
    def characters(self, s):
        if self.paper_flag:
            if self.author_flag:
                self.authors += s
            elif self.title_flag:
                self.title += s

class DBLP:
    def __init__(self):
        self.authors = {}
        self.titles = []
        self.default_path = 'dblp_index'
        
    def load(self, path=None):
        if not path:
            path = self.default_path
        if not os.path.exists(path):
            print('Failed to find path ' + path)
            return
        author_path = os.path.join(path, 'author.dat')
        title_path = os.path.join(path, 'title.dat')
        
        author_file = open(author_path, encoding = 'utf-8')
        title_file = open(title_path, encoding = 'utf-8')
        for line in author_file:
            L = line.split(':')
            self.authors[L[0]] = [int(x.split(" ")[0]) for x in L[1].split(',') ]
        for line in title_file:
            self.titles.append(line)
        print('done.')
            
    def search(self, author_name):
        titles = []
        if author_name in self.authors:
            for idx in self.authors[author_name]:
                titles.append(self.titles[idx])
        return titles
        
            
    def creat_index(self, raw_file='dblp.xml', path=None):
        if not os.path.exists(raw_file):
            print(raw_file, 'is not found')
            return
        if not path:
            path = self.default_path
        if not os.path.exists(path):
            os.makedirs(path)
        author_path = os.path.join(path, 'author.dat')
        title_path = os.path.join(path, 'title.dat')
        print('parsing dblp.xml', end='')
        self.author_dict = {}
        self.titles = []
        parse(raw_file, ResolveHandler(self.author_dict, self.titles))
        print('done.')
        print('create index ...', end='')
        
        author_file = open(author_path, 'w+', encoding='utf-8')
        for author in self.author_dict:
            author_file.write(author.lower()+':'+ ','.join([str(x) for x in self.author_dict[author]]) + '\n')
        author_file.close()
        print('done.')
        
        title_file = open(title_path, 'w+', encoding='utf-8')
        for title in self.titles:
            title_file.write(title.lower()+'\n')
        title_file.close()

In [2]:
dblp = DBLP()
dblp.creat_index()  #创建索引

parsing dblp.xml................................................done.
create index ...done.


In [3]:
dblp.load()#读入索引

done.


In [4]:
name = input('please enter a name: ')
titles = dblp.search(name.strip().lower())    #函数返回一个放入所查作者的title的列表
print(str(len(titles)) + ' papers were found.')
print('-'*50)
for title in titles:
    print(title)

please enter a name: jiaying wang
11 papers were found.
--------------------------------------------------
State-of-the-art in string similarity search and join.
LS-Join: Local Similarity Join on String Collections.
A Novel Resource Allocation and Spectrum Defragmentation Mechanism for IoT-Based Big Data in Smart Cities.
Efficient direct search on compressed genomic data.
An Efficient Trip Planning Algorithm under Constraints.
Research on Diabetes Management Strategy Based on Deep Belief Network.
An improved AdaBoost face detection algorithm based on optimizing skin color model.
An Adaptive Approach of Approximate Substring Matching.
Reducing Extension Edges of Concurrent Programs for Reachability Analysis.
Memory-Aware BWT by Segmenting Sequences to Support Subsequence Search.
Cache-aware parallel approximate matching and join algorithms using BWT.
