Skip to content

Commit

Permalink
把配置文件独立出来,不要直接写在代码中
Browse files Browse the repository at this point in the history
  • Loading branch information
kingname committed Sep 11, 2019
1 parent 8f093d5 commit a688fe3
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 43 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ result = extractor.extract(html, noise_node_list=['//div[@class="comment-list"]'

## Todo

* 使用一个配置文件来存放常量数据,而不是直接 Hard Code 写在代码中。
* ~~使用一个配置文件来存放常量数据,而不是直接 Hard Code 写在代码中。~~
* 允许自定义时间、作者的提取Pattern
* 新闻文章列表页提取
* 对于多页的新闻,允许传入一个 HTML 列表,GNE 解析以后,自动拼接为完整的新闻正文
Expand Down
45 changes: 45 additions & 0 deletions gne/defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Default extraction constants shared by the extractors.
#
# Every regular expression is written as a raw string so that regex escapes
# such as ``\s`` and ``\d`` reach the ``re`` module untouched instead of being
# treated as (invalid) Python string escapes.  ``re`` resolves ``\uXXXX``
# itself, so the CJK range ``[\u4E00-\u9FA5]`` matches exactly as before.

# Author patterns: a byline keyword, one separator character, optional
# whitespace, then 2-5 CJK characters captured as group 1.  The trailing
# negated class requires the name to be followed by a character that is not
# CJK, '|' or ':'.
# NOTE(review): ':' appears twice in the separator classes; one of them was
# presumably a full-width colon lost in transcription -- confirm against the
# repository before relying on full-width colon support.
AUTHOR_PATTERN = [
    r"责编[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
    r"作者[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
    r"编辑[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
    r"文[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
    r"撰文[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
    r"来源[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
]


# Date/time patterns, ordered from most to least specific so that a consumer
# taking the first hit gets the most complete timestamp.  Each pattern has
# exactly one capturing group holding the whole match.
#
# The hour/minute entries written with 时 (hour) and 分 (minute) previously
# used ``[1-24]`` / ``[0-60]`` -- which are character classes, not numeric
# ranges -- and demanded a second ``时`` group after ``分``, so they could
# not match ordinary text.  They now accept hours 0-23 and minutes 0-59.
#
# NOTE: ``[-|/|.]`` also accepts a literal '|' as a date separator; that
# long-standing behaviour is kept unchanged.
DATETIME_PATTERN = [
    # yyyy-mm-dd / yyyy/mm/dd / yyyy.mm.dd followed by a time
    r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
    r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?(?:[0-1]?[0-9]|2[0-3])时[0-5]?[0-9]分)",
    # yy-mm-dd / yy/mm/dd / yy.mm.dd followed by a time
    r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
    r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?(?:[0-1]?[0-9]|2[0-3])时[0-5]?[0-9]分)",
    # yyyy年mm月dd日 followed by a time
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?(?:[0-1]?[0-9]|2[0-3])时[0-5]?[0-9]分)",
    # yy年mm月dd日 followed by a time
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?(?:[0-1]?[0-9]|2[0-3])时[0-5]?[0-9]分)",
    # mm月dd日 (no year) followed by a time
    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?(?:[0-1]?[0-9]|2[0-3])时[0-5]?[0-9]分)",
    # date only, used when no time of day is present
    r"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
    r"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
    r"(\d{4}年\d{1,2}月\d{1,2}日)",
    r"(\d{2}年\d{1,2}月\d{1,2}日)",
    r"(\d{1,2}月\d{1,2}日)",
]

# XPath selecting every text node under h1-h4 headings (title candidates).
TITLE_HTAG_XPATH = '//h1//text() | //h2//text() | //h3//text() | //h4//text()'

# Character class of the separators used to split the <title> text.
TITLE_SPLIT_CHAR_PATTERN = '[-_|]'
10 changes: 2 additions & 8 deletions gne/extractor/AuthorExtractor.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
import re
from lxml.html import HtmlElement
from gne.defaults import AUTHOR_PATTERN


class AuthorExtractor:
def __init__(self):
self.author_pattern = [
"责编[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
"作者[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
"编辑[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
"文[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
"撰文[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]",
"来源[:|:| |丨|/]\s*([\u4E00-\u9FA5]{2,5})[^\u4E00-\u9FA5|:|:]"
]
self.author_pattern = AUTHOR_PATTERN

def extractor(self, element: HtmlElement):
text = ''.join(element.xpath('.//text()'))
Expand Down
34 changes: 2 additions & 32 deletions gne/extractor/TimeExtractor.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,11 @@
import re
from lxml.html import HtmlElement
from gne.defaults import DATETIME_PATTERN


class TimeExtractor:
def __init__(self):
self.time_pattern = [
"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
"(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
"(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
"(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
"(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
"(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
"(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
"(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
"(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
"(\d{4}年\d{1,2}月\d{1,2}日)",
"(\d{2}年\d{1,2}月\d{1,2}日)",
"(\d{1,2}月\d{1,2}日)"
]
self.time_pattern = DATETIME_PATTERN

def extractor(self, element: HtmlElement):
text = ''.join(element.xpath('.//text()'))
Expand Down
5 changes: 3 additions & 2 deletions gne/extractor/TitleExtractor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
from lxml.html import HtmlElement
from gne.defaults import TITLE_HTAG_XPATH, TITLE_SPLIT_CHAR_PATTERN


class TitleExtractor:
Expand All @@ -16,14 +17,14 @@ def extract_by_title(self, element):
title_list = element.xpath('//title/text()')
if not title_list:
return ''
title = re.split('[-_|]', title_list[0])
title = re.split(TITLE_SPLIT_CHAR_PATTERN, title_list[0])
if title:
return title[0]
else:
return ''

def extract_by_htag(self, element):
    """Return the first heading text node found under *element*.

    Evaluates ``TITLE_HTAG_XPATH`` against *element* and returns the first
    result, or an empty string when the query yields nothing.
    """
    title_list = element.xpath(TITLE_HTAG_XPATH)
    if not title_list:
        return ''
    # Index 0, not 1: the guard above only guarantees one result, so
    # title_list[1] raised IndexError when exactly one text node matched
    # and skipped the first heading otherwise.
    return title_list[0]
Expand Down

0 comments on commit a688fe3

Please sign in to comment.