-
Notifications
You must be signed in to change notification settings - Fork 51
/
readability.ex
232 lines (187 loc) · 6.11 KB
/
readability.ex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
defmodule Readability do
  @moduledoc """
  Readability library for extracting & curating articles.

  ## Example

  ```elixir
  # `html` below is a binary containing the raw page markup.

  # Just pass a URL
  %Readability.Summary{title: title, authors: authors, article_html: article} =
    Readability.summarize(url)

  # Extract title
  Readability.title(html)

  # Extract authors
  Readability.authors(html)

  # Extract only text from article
  article =
    html
    |> Readability.article()
    |> Readability.readable_text()

  # Extract article with transformed html
  article =
    html
    |> Readability.article()
    |> Readability.raw_html()
  ```
  """

  alias Readability.ArticleBuilder
  alias Readability.AuthorFinder
  alias Readability.Helper
  alias Readability.Summary
  alias Readability.TitleFinder

  @default_options [
    retry_length: 250,
    min_text_length: 25,
    remove_unlikely_candidates: true,
    weight_classes: true,
    clean_conditionally: true,
    remove_empty_nodes: true,
    min_image_width: 130,
    min_image_height: 80,
    ignore_image_format: [],
    blacklist: nil,
    whitelist: nil,
    page_url: nil
  ]

  @regexes [
    unlikely_candidate:
      ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
    positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative:
      ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    replace_fonts: ~r/<(\/?)font[^>]*>/i,
    replace_xml_version: ~r/<\?xml.*\?>/i,
    normalize: ~r/\s{2,}/,
    video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
    protect_attrs: ~r/^(?!id|rel|for|summary|title|href|src|alt|srcdoc)/i,
    img_tag_src: ~r/(<img.*src=['"])([^'"]+)(['"][^>]*>)/Ui
  ]

  # Matches Content-Type values such as "text/html", "application/xhtml+xml" or
  # "text/xml; charset=utf-8" (an optional charset parameter is tolerated).
  @markup_mimes ~r/^(application|text)\/[a-z\-_\.\+]+ml(;\s*charset=.*)?$/i

  @type html_tree :: tuple | list
  @type raw_html :: binary
  @type url :: binary
  @type options :: list
  @type headers :: [tuple]

  @doc """
  Summarize the primary readable content of a webpage.

  Fetches `url` with `HTTPoison.get!/3`, passing the request options configured
  under `config :readability, :httpoison_options`. When the response
  `Content-Type` is a markup type the page is parsed and a fully populated
  `Readability.Summary` is returned; otherwise only `:article_text` is set,
  to the raw response body.

  `opts` are merged over `default_options/0`, and `:page_url` is always set
  to `url`.

  Raises whatever `HTTPoison.get!/3` raises when the request fails.
  """
  @spec summarize(url, options) :: Summary.t()
  def summarize(url, opts \\ []) do
    # Start from the defaults so this entry point extracts with the same
    # settings as `article/2`; caller options win, and `page_url` wins over both.
    opts =
      @default_options
      |> Keyword.merge(opts)
      |> Keyword.merge(page_url: url)

    httpoison_options = Application.get_env(:readability, :httpoison_options, [])
    %{status_code: _, body: raw, headers: headers} = HTTPoison.get!(url, [], httpoison_options)

    if is_response_markup(headers) do
      html_tree = Helper.normalize(raw, url: url)
      article_tree = ArticleBuilder.build(html_tree, opts)

      %Summary{
        title: title(html_tree),
        authors: authors(html_tree),
        article_html: readable_html(article_tree),
        article_text: readable_text(article_tree)
      }
    else
      %Summary{title: nil, authors: nil, article_html: nil, article_text: raw}
    end
  end

  @doc """
  Extract the `Content-Type` value from a list of `{name, value}` headers.

  The header name is matched case-insensitively. The value is returned as-is
  (including any parameters such as `; charset=utf-8`). Defaults to
  `"text/plain"` when no `Content-Type` header is present.

  ## Example

      iex> mime = Readability.mime(headers_list)
      "text/html"
  """
  @spec mime(headers) :: String.t()
  def mime(headers \\ []) do
    headers
    |> Enum.find(
      # default when the header is missing
      {"Content-Type", "text/plain"},
      fn {key, _} -> String.downcase(key) == "content-type" end
    )
    |> elem(1)
  end

  @doc """
  Returns true if Content-Type in provided headers list is a markup type,
  else false.

  ## Example

      iex> Readability.is_response_markup([{"Content-Type", "text/html"}])
      true
  """
  @spec is_response_markup(headers) :: boolean
  def is_response_markup(headers) do
    mime(headers) =~ @markup_mimes
  end

  @doc """
  Extract title.

  Accepts either a raw HTML binary (which is parsed first) or an already
  parsed HTML tree.

  ## Example

      iex> title = Readability.title(html_str)
      "Some title in html"
  """
  @spec title(binary | html_tree) :: binary
  def title(raw_html) when is_binary(raw_html) do
    # `parse_document!/1` yields the bare tree; the non-bang variant would hand
    # an `{:ok, tree}` tuple to the tree clause below.
    raw_html
    |> Floki.parse_document!()
    |> title()
  end

  def title(html_tree), do: TitleFinder.title(html_tree)

  @doc """
  Extract authors.

  Accepts either a raw HTML binary (which is parsed first) or an already
  parsed HTML tree.

  ## Example

      iex> authors = Readability.authors(html_str)
      ["José Valim", "chrismccord"]
  """
  @spec authors(binary | html_tree) :: [binary]
  def authors(html) when is_binary(html) do
    html
    |> Floki.parse_document!()
    |> authors()
  end

  def authors(html_tree), do: AuthorFinder.find(html_tree)

  @doc """
  Using a variety of metrics (content score, classname, element types), find the content that is
  most likely to be the stuff a user wants to read.

  `opts` are merged over `default_options/0`.

  ## Example

      iex> article_tree = Readability.article(html_str)
      # returns article that is tuple
  """
  @spec article(binary, options) :: html_tree
  def article(raw_html, opts \\ []) do
    opts = Keyword.merge(@default_options, opts)

    raw_html
    |> Helper.normalize()
    |> ArticleBuilder.build(opts)
  end

  @doc """
  Returns attributes, tags cleaned HTML.

  Attributes are filtered through `Helper.remove_attrs/2` using the
  `:protect_attrs` regex before the tree is serialised.
  """
  @spec readable_html(html_tree) :: binary
  def readable_html(html_tree) do
    html_tree
    |> Helper.remove_attrs(regexes(:protect_attrs))
    |> raw_html()
  end

  @doc """
  Returns only text binary from `html_tree`.

  A newline is inserted before every closing `p`, `div`, `article` and heading
  tag so block-level elements stay separated in the extracted text.
  """
  @spec readable_text(html_tree) :: binary
  def readable_text(html_tree) do
    # @TODO: Remove image caption when extract only text
    tags_to_br = ~r/<\/(p|div|article|h\d)/i
    html_str = raw_html(html_tree)

    tags_to_br
    |> Regex.replace(html_str, &"\n#{&1}")
    |> Floki.parse_fragment!()
    |> Floki.text()
    |> String.trim()
  end

  @doc """
  Returns raw HTML binary from `html_tree`.
  """
  @spec raw_html(html_tree) :: binary
  def raw_html(html_tree) do
    Floki.raw_html(html_tree, encode: false)
  end

  @doc """
  Parses a raw HTML binary into a document tree.

  Returns the parsed document on success; any non-`{:ok, _}` result from
  `Floki.parse_document/1` is returned unchanged.
  """
  @deprecated "Use `Floki.parse_document/1` or `Floki.parse_fragment/1` instead."
  def parse(raw_html) when is_binary(raw_html) do
    with {:ok, document} <- Floki.parse_document(raw_html) do
      document
    end
  end

  @doc """
  Returns the compiled regex registered under `key`, or `nil` for an unknown key.
  """
  @spec regexes(atom) :: Regex.t() | nil
  def regexes(key), do: @regexes[key]

  @doc """
  Returns the default extraction options as a keyword list.
  """
  @spec default_options() :: options
  def default_options, do: @default_options
end