/
page.rb
195 lines (155 loc) · 5.55 KB
/
page.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
require 'json'

module Wikipedia
  # Wraps the JSON text of a query response and exposes the fields of the
  # first entry found under `query.pages`.
  #
  # Readers return nil when the field they look up -- or the page entry
  # itself -- is missing from the response.
  class Page
    # Matches a redirect directive and captures the target title.
    REDIRECT_PATTERN = /\#REDIRECT\s*\[\[(.*?)\]\]/i

    # Image titles (namespace prefix plus a known extension) kept by
    # #image_metadata.
    IMAGE_TITLE_PATTERN = /:.+\.(jpg|jpeg|png|gif|svg)$/i

    # Ordered [pattern, replacement] pairs that turn known wiki templates
    # into text/HTML before every remaining template is stripped.
    TEMPLATE_SUBSTITUTIONS = [
      # Em dash (https://en.wikipedia.org/wiki/Template:Em_dash)
      [/\{\{(em dash|emdash)\}\}/i, '—'],
      # En dash (https://en.wikipedia.org/wiki/Template:En_dash)
      [/\{\{(en dash|ndash|nsndns)\}\}/i, '–'],
      # Spaced en dashes (https://en.wikipedia.org/wiki/Template:Spaced_en_dash_space)
      [/\{\{(spaced e?n\s?dash( space)?|snds?|spndsp|sndashs|spndashsp)\}\}/i, ' – '],
      # Bold middot
      [/\{\{(·|dot|middot|\,)\}\}/i, ' <b>·</b>'],
      # Bullets
      [/\{\{(•|bull(et)?)\}\}/i, ' •'],
      # Forward Slashes (https://en.wikipedia.org/wiki/Template:%5C)
      [/\{\{\\\}\}/i, ' /'],
      # Language specific blocks
      [/\{\{lang[\-\|]([a-z]+)\|([^\|\{\}]+)(\|[^\{\}]+)?\}\}/i, '<span lang="\1">\2</span>'],
      # Old Style Dates (https://en.wikipedia.org/wiki/Template:OldStyleDate)
      [/\{\{OldStyleDate\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i,
       '\1 [<abbr title="Old Style">O.S.</abbr> \3] \2'],
      # Old Style Dates with different years (https://en.wikipedia.org/wiki/Template:OldStyleDateDY)
      [/\{\{OldStyleDateDY\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i,
       '\1 \2 [<abbr title="Old Style">O.S.</abbr> \3]'],
      # Old Style Dates with no year (https://en.wikipedia.org/wiki/Template:OldStyleDateNY)
      [/\{\{OldStyleDateNY\|([^\|]*)\|([^\|]*)\}\}/i,
       '\1 [<abbr title="Old Style">O.S.</abbr> \2]']
    ].freeze

    # Any innermost template, plus one trailing ';' or ','.
    REMAINING_TEMPLATE = /\{\{[^\{\}]+?\}\}[\;\,]?/

    # A `{| ... |}` table block starting at the beginning of a line.
    INFO_BOX = /^\{\|[^\{\}]+?\n\|\}\n/

    # Ordered [pattern, replacement] pairs applied after templates are gone.
    MARKUP_SUBSTITUTIONS = [
      # Unwrap internal links. File:/Image: targets are skipped here so the
      # two patterns below can remove them entirely instead of leaving the
      # file name or "thumb" behind as text.
      [/\[\[(?!(?:Image|File):)([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2'],
      [/\[\[(?!(?:Image|File):)([^\]\|]+?)\]\]/, '\1'],
      # Strip images and file links
      [/\[\[Image:(.*?(?=\]\]))??\]\]/, ''],
      [/\[\[File:(.*?(?=\]\]))??\]\]/, ''],
      # Convert bold/italic to html
      [/'''''(.+?)'''''/, '<b><i>\1</i></b>'],
      [/'''(.+?)'''/, '<b>\1</b>'],
      [/''(.+?)''/, '<i>\1</i>'],
      # References. The `(?<!\/)` keeps a self-closing `<ref ... />` from
      # being treated as an opening tag, which would swallow all the text
      # up to the next `</ref>`.
      [/(\d)<ref[^<>]*(?<!\/)>[\s\S]*?<\/ref>(\d)/, '\1 – \2'],
      [/<ref[^<>]*(?<!\/)>[\s\S]*?<\/ref>/, ''],
      [/<ref(.*?(?=\/>))??\/>/, ''],
      # HTML comments (may contain '>')
      [/<!--[\s\S]*?-->/, ''],
      # Whitespace right after an opening parenthesis
      [/\(\s+/, '('],
      # Runs of spaces, e.g. left behind by removed templates
      [/[ ]{2,}/, ' ']
    ].freeze

    # @return [String] the JSON text this page was built from
    attr_reader :json

    # @param json [String] JSON text of the query response
    # @raise [JSON::ParserError] if +json+ is not valid JSON
    def initialize(json)
      @json = json
      @data = JSON.parse(json)
    end

    # @return [Hash, nil] the first entry of `query.pages`, or nil when the
    #   response has no `query` or no `pages`
    def page
      pages = (@data['query'] || {})['pages']
      pages.values.first if pages
    end

    # @return [String, nil] the '*' field of the first revision
    def content
      first_entry_value('revisions', '*')
    end

    # @return [String, nil] #content converted by Page.sanitize; the stored
    #   content itself is left untouched
    def sanitized_content
      self.class.sanitize(content)
    end

    # @return [MatchData, nil] match of the redirect directive in #content
    #   (truthy when the page is a redirect)
    def redirect?
      wikitext = content
      wikitext && wikitext.match(REDIRECT_PATTERN)
    end

    # @return [String, nil] the redirect target title, nil if not a redirect
    def redirect_title
      match = redirect?
      match[1] if match
    end

    # @return [String, nil] the page's 'title' field
    def title
      page_value('title')
    end

    # @return [String, nil] the page's 'fullurl' field
    def fullurl
      page_value('fullurl')
    end

    # @return [String, nil] the page's 'editurl' field
    def editurl
      page_value('editurl')
    end

    # @return [String, nil] the page's 'extract' field
    def text
      page_value('extract')
    end

    # @return [String, nil] the extract up to the first '==' (the first
    #   section heading marker), stripped; nil when the extract is missing
    #   or empty
    def summary
      extract = text
      return if extract.nil? || extract == ''

      # Limit 2 guarantees at least one element even for an extract of "==".
      extract.split('==', 2).first.strip
    end

    # @return [Array<String>, nil] titles of the page's 'categories'
    def categories
      values_of('categories', 'title')
    end

    # @return [Array<String>, nil] titles of the page's 'links'
    def links
      values_of('links', 'title')
    end

    # @return [Array<String>, nil] '*' fields of the page's 'extlinks'
    def extlinks
      values_of('extlinks', '*')
    end

    # @return [Hash{String=>String}, nil] 'lang' => '*' for each langlink
    def langlinks
      entries = page_value('langlinks')
      return unless entries

      entries.each_with_object({}) { |link, by_lang| by_lang[link['lang']] = link['*'] }
    end

    # @return [Array<String>, nil] titles of the page's 'images'
    def images
      values_of('images', 'title')
    end

    # @return [String, nil] 'url' of the first 'imageinfo' entry
    def image_url
      first_entry_value('imageinfo', 'url')
    end

    # @return [String, nil] 'thumburl' of the first 'imageinfo' entry
    def image_thumburl
      first_entry_value('imageinfo', 'thumburl')
    end

    # @return [String, nil] 'descriptionurl' of the first 'imageinfo' entry
    def image_descriptionurl
      first_entry_value('imageinfo', 'descriptionurl')
    end

    # @return [Array, nil] #image_url of every object from #image_metadata
    def image_urls
      metadata = image_metadata
      metadata.map(&:image_url) if metadata
    end

    # @param width [Integer, nil] when given, passed on as :iiurlwidth
    # @return [Array, nil] #image_thumburl of every object from #image_metadata
    def image_thumburls(width = nil)
      options = width.nil? ? {} : { iiurlwidth: width }
      metadata = image_metadata(options)
      metadata.map(&:image_thumburl) if metadata
    end

    # @return [Array, nil] #image_descriptionurl of every object from
    #   #image_metadata
    def image_descriptionurls
      metadata = image_metadata
      metadata.map(&:image_descriptionurl) if metadata
    end

    # @return [String, nil] the thumbnail source with its '/thumb' path part
    #   and its last path segment removed
    def main_image_url
      source = main_image_thumburl
      source.sub(%r{/thumb}, '').sub(%r{/[^/]*$}, '') if source
    end

    # @return [String, nil] 'source' of the page's 'thumbnail'
    def main_image_thumburl
      thumbnail = page_value('thumbnail')
      thumbnail['source'] if thumbnail
    end

    # @return [Array, nil] values of the first 'coordinates' entry
    def coordinates
      entry = first_entry('coordinates')
      entry.values if entry
    end

    # @return [Object] the parsed JSON document
    def raw_data
      @data
    end

    # Looks up every image of the page through Wikipedia.find_image.
    #
    # Results are memoized per +options+, so a lookup made with one set of
    # options is never reused for a different one.
    #
    # @param options [Hash] passed through to Wikipedia.find_image
    # @return [Array, nil] one result per accepted image title; nil when the
    #   page lists no images
    def image_metadata(options = {})
      titles = images
      return if titles.nil?

      cache = (@cached_image_metadata ||= {})
      cache.fetch(options) do
        # Key on a copy so later mutation of the caller's hash cannot
        # corrupt the cache.
        cache[options.dup] = titles.select { |title| image_title?(title) }
                                   .map { |title| Wikipedia.find_image(title, options) }
      end
    end

    # @return [Array<String>, nil] titles of the page's 'templates'
    def templates
      values_of('templates', 'title')
    end

    # Converts wikitext into simple HTML: known templates become text,
    # every other template, `{| |}` table, file/image link, reference and
    # HTML comment is removed, internal links are unwrapped, bold/italic
    # quotes become <b>/<i>, and the result is wrapped in <p> paragraphs
    # (split on blank lines).
    #
    # The argument is not modified; frozen strings are accepted.
    #
    # @param s [String, nil] wikitext
    # @return [String, nil] HTML, or nil when +s+ is nil/false
    def self.sanitize(s)
      return unless s

      html = s.dup
      apply_substitutions(html, TEMPLATE_SUBSTITUTIONS)
      # Templates nest; removing the innermost ones exposes the next layer,
      # so repeat until a pass changes nothing (gsub! then returns nil).
      loop { break unless html.gsub!(REMAINING_TEMPLATE, '') }
      html.sub!(INFO_BOX, '')
      apply_substitutions(html, MARKUP_SUBSTITUTIONS)
      html.strip!

      paragraphs = html.split("\n\n")
      return "<p>#{html}</p>" unless paragraphs.size > 1

      paragraphs.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n")
    end

    # Applies each [pattern, replacement] pair to +text+ in place, in order.
    def self.apply_substitutions(text, substitutions)
      substitutions.each { |pattern, replacement| text.gsub!(pattern, replacement) }
    end
    private_class_method :apply_substitutions

    private

    # Field +key+ of the page entry, or nil when there is no page entry.
    def page_value(key)
      entry = page
      entry[key] if entry
    end

    # First element of the list stored under +list_key+, or nil.
    def first_entry(list_key)
      entries = page_value(list_key)
      entries.first if entries
    end

    # Field +key+ of the first element of the list under +list_key+, or nil.
    def first_entry_value(list_key, key)
      entry = first_entry(list_key)
      entry[key] if entry
    end

    # Field +key+ of every element of the list under +list_key+, or nil
    # when the list is absent.
    def values_of(list_key, key)
      entries = page_value(list_key)
      entries.map { |entry| entry[key] } if entries
    end

    # True for titles with a known image extension, except the
    # 'LinkFA-star' icon.
    def image_title?(title)
      title =~ IMAGE_TITLE_PATTERN && !title.include?('LinkFA-star')
    end
  end
end