-
Notifications
You must be signed in to change notification settings - Fork 517
/
TitleExtractor.py
35 lines (29 loc) · 1.05 KB
/
TitleExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import re
from lxml.html import HtmlElement
from gne.defaults import TITLE_HTAG_XPATH, TITLE_SPLIT_CHAR_PATTERN
class TitleExtractor:
    """Extract a page title from a parsed HTML element.

    `extract` tries three strategies in order and returns the first
    non-empty result: a caller-supplied XPath, the ``<title>`` tag text, and
    the XPath stored in ``TITLE_HTAG_XPATH``. Each strategy returns ``''``
    when it finds nothing, so the strategies can be chained with ``or``.
    """

    def extract_by_xpath(self, element, title_xpath):
        """Return the first result of ``title_xpath`` evaluated on ``element``.

        Returns ``''`` when ``title_xpath`` is empty or yields no results.
        """
        if not title_xpath:
            return ''
        title_list = element.xpath(title_xpath)
        return title_list[0] if title_list else ''

    def extract_by_title(self, element):
        """Return the leading segment of the document's ``<title>`` text.

        The first ``//title/text()`` result is split with
        ``TITLE_SPLIT_CHAR_PATTERN`` and the part before the first separator
        is returned. Returns ``''`` when there is no title text.
        """
        title_list = element.xpath('//title/text()')
        if not title_list:
            return ''
        # re.split always yields at least one item, so index 0 is safe.
        return re.split(TITLE_SPLIT_CHAR_PATTERN, title_list[0])[0]

    def extract_by_htag(self, element):
        """Return the first result of ``TITLE_HTAG_XPATH`` on ``element``.

        Returns ``''`` when the XPath yields no results.
        """
        title_list = element.xpath(TITLE_HTAG_XPATH)
        if not title_list:
            return ''
        # Use the first match, like the other strategies. Indexing [1]
        # raised IndexError whenever exactly one match was found and
        # skipped the first match otherwise.
        return title_list[0]

    def extract(self, element: HtmlElement, title_xpath: str = ''):
        """Return the title of ``element``, or ``''`` if none is found.

        Strategies are tried in order: ``title_xpath`` (if given), the
        ``<title>`` tag, then ``TITLE_HTAG_XPATH``. Later strategies are
        only evaluated when the earlier ones return an empty value.
        """
        return (
            self.extract_by_xpath(element, title_xpath)
            or self.extract_by_title(element)
            or self.extract_by_htag(element)
        )