-
Notifications
You must be signed in to change notification settings - Fork 76
/
page.rb
109 lines (87 loc) · 2.28 KB
/
page.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
module Wikipedia
  # Read-only wrapper around the JSON body of a Wikipedia API query response.
  #
  # All accessors read from the first entry found under "query" -> "pages" of
  # the parsed document. Accessors for optional sections return nil when the
  # section is absent from the response.
  class Page
    # Redirect directive with its target captured in group 1. The separator
    # between the keyword and the link is optional so that "#REDIRECT[[X]]"
    # and "#REDIRECT: [[X]]" are recognized as well as "#REDIRECT [[X]]".
    REDIRECT_PATTERN = /\#REDIRECT\s*:?\s*\[\[(.*?)\]\]/i

    # Titles that #image_urls is willing to resolve to a URL.
    IMAGE_TITLE_PATTERN = /^file:.+\.(jpg|jpeg|png|gif)$/i

    # @return [String] the unparsed JSON handed to the constructor
    attr_reader :json

    # @param json [String] raw JSON body of an API response
    # @raise [JSON::ParserError] if +json+ is not valid JSON
    def initialize(json)
      require 'json' # kept lazy so loading this file has no side effects
      @json = json
      # The body comes from a remote server: never let it instantiate Ruby
      # objects through "json_class" additions.
      @data = JSON.load(json, nil, create_additions: false)
    end

    # @return [Hash] the first page entry of the response
    def page
      @data['query']['pages'].values.first
    end

    # @return [String, nil] wikitext of the first revision, or nil when the
    #   response carries no revisions
    def content
      revisions = page['revisions']
      revisions.first.fetch('*') if revisions
    end

    # @return [String, nil] #content passed through Page.sanitize
    def sanitized_content
      self.class.sanitize(content)
    end

    # Tells whether the page content is a redirect directive.
    #
    # @return [MatchData, nil] truthy match (target in group 1) for a
    #   redirect, nil otherwise
    def redirect?
      text = content
      text && text.match(REDIRECT_PATTERN)
    end

    # @return [String, nil] title the page redirects to, or nil
    def redirect_title
      matches = redirect?
      matches[1] if matches
    end

    # @return [String, nil] title of the page
    def title
      page['title']
    end

    # @return [Array<String>, nil] titles listed under "categories"
    def categories
      titles_for('categories')
    end

    # @return [Array<String>, nil] titles listed under "links"
    def links
      titles_for('links')
    end

    # @return [Array<String>, nil] titles listed under "images"
    def images
      titles_for('images')
    end

    # @return [String, nil] URL of the first "imageinfo" entry
    def image_url
      info = page['imageinfo']
      info.first['url'] if info
    end

    # Resolves every jpg/jpeg/png/gif file title of the page to its URL by
    # calling Wikipedia.find_image (defined elsewhere in the library) once per
    # title. Titles containing "LinkFA-star" are skipped.
    #
    # @return [Array<String>, nil] image URLs, or nil when the page lists no
    #   images
    def image_urls
      titles = images
      return unless titles

      wanted = titles.select do |name|
        name =~ IMAGE_TITLE_PATTERN && !name.include?('LinkFA-star')
      end
      wanted.map { |name| Wikipedia.find_image(name).image_url }
    end

    # @return [Hash, nil] the parsed response document
    def raw_data
      @data
    end

    # Reduces wikitext to lightly formatted text: templates, the info box,
    # file/image links, references and HTML comments are removed, internal
    # links are replaced by their label, bold/italic quotes become <b>/<i>
    # tags and blank-line separated sections are wrapped in <p> tags.
    #
    # The argument is never modified; a copy is edited.
    #
    # @param s [String, nil] wikitext
    # @return [String, nil] sanitized text, or nil when +s+ is nil
    def self.sanitize(s)
      return unless s

      text = s.dup

      # Templates nest, so peel the innermost {{...}} until none is left.
      template = /\{\{[^\{\}]+?\}\}/
      text.gsub!(template, '') while text =~ template

      # Info box table.
      text.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

      # Internal links become their label. File:/Image: links are skipped
      # here (negative lookahead) so they are removed below instead of
      # leaking "thumb" or "Image:foo.png" into the text. This step still
      # runs first so links nested in a file caption are flattened before
      # the bracket-free file pattern is applied.
      text.gsub!(/\[\[(?!(?:Image|File):)([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
      text.gsub!(/\[\[(?!(?:Image|File):)([^\]\|]+?)\]\]/, '\1')

      # Images and file links.
      text.gsub!(/\[\[(?:Image|File):[^\[\]]+?\]\]/, '')

      # Bold/italic quotes to HTML, longest marker first.
      text.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
      text.gsub!(/'''(.+?)'''/, '<b>\1</b>')
      text.gsub!(/''(.+?)''/, '<i>\1</i>')

      # Self-closing references go first; otherwise <ref ... /> would be
      # read as an opening tag and swallow the text up to the next </ref>.
      text.gsub!(/<ref\b[^<>]*\/>/, '')
      text.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')

      # HTML comments, including ones that contain ">" or span lines.
      text.gsub!(/<!--[\s\S]*?-->/, '')

      # Non-breaking spaces (entity or character) become plain spaces, then
      # the runs of spaces left behind by stripped markup are collapsed.
      text.gsub!(/&nbsp;|\u00A0/, ' ')
      text.squeeze!(' ')
      text.strip!

      # Wrap in paragraphs only when there is more than one section.
      sections = text.split("\n\n")
      return text unless sections.size > 1

      sections.map { |section| "<p>#{section.strip}</p>" }.join("\n")
    end

    private

    # Collects the "title" of every entry in the page's +key+ list.
    #
    # @param key [String] name of a list section of the page hash
    # @return [Array<String>, nil] titles, or nil when the section is absent
    def titles_for(key)
      entries = page[key]
      entries.map { |entry| entry['title'] } if entries
    end
  end
end