-
Notifications
You must be signed in to change notification settings - Fork 320
/
Copy pathtumblr.rb
303 lines (284 loc) · 11.3 KB
/
tumblr.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
module JekyllImport
module Importers
class Tumblr < Importer
# Requires every library the Tumblr importer depends on by handing the
# full list to JekyllImport.require_with_fallback.
def self.require_deps
  dependencies = %w(rubygems fileutils open-uri nokogiri json uri time jekyll)
  JekyllImport.require_with_fallback(dependencies)
end
# Registers the importer's command-line options on the given command.
#
# c - command object responding to #option(name, switch, description).
#
# Returns whatever the last #option call returns.
def self.specify_options(c)
  option_table = [
    ["url", "--url URL", "Tumblr URL"],
    ["format", "--format FORMAT", 'Output format (default: "html")'],
    ["grab_images", "--grab_images", "Whether to grab images (default: false)"],
    ["add_highlights", "--add_highlights", "Whether to add highlights (default: false)"],
    ["rewrite_urls", "--rewrite_urls", "Whether to rewrite URLs (default: false)"],
  ]
  option_table.map { |key, switch, description| c.option(key, switch, description) }.last
end
# Imports a Tumblr blog into "_posts/tumblr" by paging through the blog's
# JSON read API.
#
# options - Hash of importer options:
#           "url"            - base URL of the Tumblr blog (required; a
#                              trailing slash is tolerated).
#           "format"         - extension of the generated files; "md" also
#                              turns on HTML-to-markdown conversion
#                              (default: "html").
#           "grab_images"    - download photos locally (default: false).
#           "add_highlights" - add highlight tags to markdown posts
#                              (default: false).
#           "rewrite_urls"   - rewrite Tumblr URLs to Jekyll URLs and create
#                              redirect pages (default: false).
#
# Raises KeyError when "url" is missing.
def self.process(options)
  url            = options.fetch("url")
  format         = options.fetch("format", "html")
  grab_images    = options.fetch("grab_images", false)
  add_highlights = options.fetch("add_highlights", false)
  rewrite_urls   = options.fetch("rewrite_urls", false)

  # Read later by save_photo to decide whether photos are downloaded.
  @grab_images = grab_images

  FileUtils.mkdir_p "_posts/tumblr"
  api_url = url.chomp("/") + "/api/read/json/"
  per_page = 50
  posts = []
  current_page = 0

  # Two passes are required so that we can rewrite URLs.
  # First pass builds up an array of each post as a hash.
  loop do
    feed_url = "#{api_url}?num=#{per_page}&start=#{current_page * per_page}"
    puts "Fetching #{feed_url}"

    # URI#open (from open-uri) instead of Kernel#open: Kernel#open no longer
    # accepts URLs on Ruby 3.0+, and the block form closes the stream.
    contents = URI.parse(feed_url).open(&:read)
    blog = extract_json(contents)
    fetched = blog["posts"]
    puts "Page: #{current_page + 1} - Posts: #{fetched.size}"
    batch = fetched.map { |post| post_to_hash(post, format) }

    # If we're rewriting, save the posts for later. Otherwise, go ahead and
    # dump these to disk now
    if rewrite_urls
      posts.concat(batch)
    else
      batch.each { |post| write_post(post, format == "md", add_highlights) }
    end

    # A short page means the feed is exhausted.
    break if fetched.size < per_page

    current_page += 1
  end

  # Rewrite URLs, create redirects and write out posts if necessary
  if rewrite_urls
    posts = rewrite_urls_and_redirects posts
    posts.each { |post| write_post(post, format == "md", add_highlights) }
  end
end
private
class << self
# Parses the JSON object embedded in a Tumblr JSONP response.
#
# contents - String response body, e.g. "var tumblr_api_read = {...};".
#
# Returns the parsed object (a Hash for a Tumblr feed).
# Raises JSON::ParserError when the body contains no "{...}" span or the
# span is not valid JSON.
def extract_json(contents)
  opening = contents.index("{")
  closing = contents.rindex("}")
  if opening.nil? || closing.nil? || closing < opening
    raise JSON::ParserError, "no JSON object found in Tumblr response"
  end

  # Strip Tumblr's JSONP chars around the outermost braces.
  JSON.parse(contents[opening..closing])
end
# Writes a single post to "_posts/tumblr/<name>" as YAML front matter
# followed by the post body. Posts without content are skipped.
#
# post           - Hash with :content, :name, :header and :slug.
# use_markdown   - when truthy, the body goes through html_to_markdown.
# add_highlights - when truthy (and use_markdown is set), the body also
#                  goes through add_syntax_highlights.
def write_post(post, use_markdown, add_highlights)
  body = post[:content]
  return unless body

  if use_markdown
    body = html_to_markdown(body)
    if add_highlights
      slug_path = URI.parse(post[:slug]).path
      redirect_dir = slug_path.sub(%r!\/!, "") + "/"
      FileUtils.mkdir_p(redirect_dir)
      body = add_syntax_highlights(body, redirect_dir)
    end
  end

  front_matter = post[:header].to_yaml + "---\n"
  File.open("_posts/tumblr/#{post[:name]}", "w") do |file|
    file.puts front_matter + body
  end
end
# Converts each type of Tumblr post to a hash with all required
# data for Jekyll.
#
# post   - Hash for one post as parsed from the Tumblr JSON feed. Optional
#          fields (slug, photos, captions, descriptions) may be missing.
# format - String extension used for the generated file name.
#
# The strings held by `post` are never modified; new strings are built
# for the content instead.
#
# Returns a Hash with:
#   :name    - "<date>-<slug>.<format>" file name.
#   :header  - front matter Hash (layout, title, date, tags, tumblr_url).
#   :content - HTML String for the post body (nil for unknown post types
#              or when the feed supplied no body).
#   :url     - the post's "url" field.
#   :slug    - the post's "url-with-slug" field.
def post_to_hash(post, format)
  title = nil
  content = nil

  case post["type"]
  when "regular"
    title = post["regular-title"]
    content = post["regular-body"]
  when "link"
    title = post["link-text"] || post["link-url"]
    content = "<a href=\"#{post["link-url"]}\">#{title}</a>"
    content = "#{content}<br/>#{post["link-description"]}" unless post["link-description"].nil?
  when "photo"
    # The slug may be absent (the slug fallback below allows for that), so
    # do not assume it is a String here.
    title = post["slug"].to_s.tr("-", " ")
    photos = Array(post["photos"])
    content = if photos.size > 1
                photos.map { |photo| "#{fetch_photo(photo)}<br/>#{photo["caption"]}" }.join
              else
                fetch_photo(post)
              end
    content = "#{content}<br/>#{post["photo-caption"]}"
  when "audio"
    if post["id3-title"].nil?
      title = post["audio-caption"]
      content = post["audio-player"]
    else
      title = post["id3-title"]
      content = "#{post["audio-player"]}<br/>#{post["audio-caption"]}"
    end
  when "quote"
    title = post["quote-text"]
    content = "<blockquote>#{post["quote-text"]}</blockquote>"
    content = "#{content}—#{post["quote-source"]}" unless post["quote-source"].nil?
  when "conversation"
    title = post["conversation-title"]
    exchanges = post["conversation"].map do |line|
      "<dt>#{line["label"]}</dt><dd>#{line["phrase"]}</dd>"
    end
    content = "<section><dialog>#{exchanges.join}</dialog></section>"
  when "video"
    title = post["video-title"]
    content = post["video-player"]
    unless post["video-caption"].nil?
      content = content ? "#{content}<br/>#{post["video-caption"]}" : post["video-caption"]
    end
  when "answer"
    title = post["question"]
    content = post["answer"]
  end

  date = Date.parse(post["date"]).to_s

  # Titles may contain markup; keep only the text.
  title = Nokogiri::HTML(title).text
  title = "no title" if title.empty?

  # Slug preference: Tumblr's own slug, then one derived from a real title,
  # then the post id.
  slug = if !post["slug"].to_s.strip.empty?
           post["slug"]
         elsif title.downcase.gsub(%r![^a-z0-9\-]!, "") != "" && title != "no title"
           # 0..200 keeps at most 201 characters, as before.
           title.downcase.strip.tr(" ", "-").gsub(%r![^a-z0-9\-]!, "").slice(0..200)
         else
           post["id"]
         end

  {
    :name    => "#{date}-#{slug}.#{format}",
    :header  => {
      "layout"     => "post",
      "title"      => title,
      "date"       => Time.parse(post["date"]).xmlschema,
      "tags"       => (post["tags"] || []),
      "tumblr_url" => post["url-with-slug"],
    },
    :content => content,
    :url     => post["url"],
    :slug    => post["url-with-slug"],
  }
end
# Attempts to fetch the largest version of a photo available for a post.
# If that file fails, it tries the next smaller size until all available
# photo URLs are exhausted. If they all fail, the import is aborted.
#
# post - Hash with String keys; photo URLs live under "photo-url" and/or
#        "photo-url-<width>".
#
# The file extension handed to save_photo is taken from the first
# "photo-url-*" value whose last path segment contains a dot; when no URL
# has one, an empty extension is used instead of crashing.
#
# Returns an "<img>" tag String pointing at whatever save_photo returns.
def fetch_photo(post)
  _ext_key, ext_val = post.find do |key, value|
    key =~ %r!^photo-url-! && value.to_s.split("/").last.to_s =~ %r!\.!
  end
  ext = ext_val ? "." + ext_val.split(".").last : ""

  # Unsized URL first (it always took precedence), then sized URLs from the
  # widest to the narrowest; each distinct URL is tried once.
  sized = post.select { |key, value| key =~ %r!\Aphoto-url-\d+\z! && value }
  widest_first = sized.sort_by { |key, _| -key[%r!\d+\z!].to_i }.map { |_, value| value }
  candidates = ([post["photo-url"]] + widest_first).compact.uniq

  candidates.each do |url|
    begin
      return "<img src=\"#{save_photo(url, ext)}\"/>"
    rescue OpenURI::HTTPError
      puts "Failed to grab photo"
    end
  end

  abort "Failed to fetch photo for post #{post["url"]}"
end
# Create a Hash of old urls => new urls, for rewriting and
# redirects, and replace urls in each post. Instantiate Jekyll
# site/posts to get the correct permalink format.
#
# posts - Array of post Hashes (as built by post_to_hash); :name and :slug
#         are required, :content may be nil.
#
# For every post this writes an empty placeholder under "_posts/tumblr" and
# a redirect page at "<tumblr path>/index.html" relative to the current
# directory.
#
# Returns the same post Hashes with Tumblr paths in :content replaced by
# the matching Jekyll URLs.
def rewrite_urls_and_redirects(posts)
  site = Jekyll::Site.new(Jekyll.configuration({}))

  # URI.encode no longer exists on Ruby 3.0+; the RFC 2396 parser provides
  # the same escaping.
  escaper = URI::RFC2396_Parser.new

  urls = posts.map do |post|
    # Create an initial empty file for the post so that
    # we can instantiate a post object.
    File.write("_posts/tumblr/#{post[:name]}", "")
    tumblr_url = URI.parse(escaper.escape(post[:slug])).path
    jekyll_url = if Jekyll.const_defined? :Post
                   Jekyll::Post.new(site, Dir.pwd, "", "tumblr/" + post[:name]).url
                 else
                   Jekyll::Document.new(
                     File.expand_path("_posts/tumblr/#{post[:name]}"),
                     :site => site, :collection => site.posts
                   ).url
                 end

    # Drop the leading slash so the redirect lands under the current directory.
    redirect_dir = tumblr_url.sub(%r!\/!, "") + "/"
    FileUtils.mkdir_p redirect_dir
    File.open(redirect_dir + "index.html", "w") do |f|
      f.puts "<html><head><link rel=\"canonical\" href=\"" \
             "#{jekyll_url}\"><meta http-equiv=\"refresh\" content=\"0; " \
             "url=#{jekyll_url}\"></head><body></body></html>"
    end
    [tumblr_url, jekyll_url]
  end.to_h

  # Match the Tumblr path literally: unescaped, "." and friends would act
  # as regexp metacharacters.
  rewrites = urls.map do |tumblr_url, jekyll_url|
    [%r!#{Regexp.escape(tumblr_url)}!i, jekyll_url]
  end

  posts.map do |post|
    # Posts without a body (write_post skips them too) have nothing to rewrite.
    if post[:content]
      post[:content] = rewrites.inject(post[:content]) do |text, (pattern, jekyll_url)|
        # Block form keeps backslashes in the replacement literal.
        text.gsub(pattern) { jekyll_url }
      end
    end
    post
  end
end
# Convert preserving HTML tables as per the markdown docs.
#
# Table tags (table, tr, th, td) are swapped for "$$tag" / "||tag"
# placeholders, the remaining markup is reduced to its text through
# Nokogiri, and the placeholders are then turned back into tags.
# Apostrophes are doubled before the text extraction.
#
# content - String of HTML. It is not modified, so frozen or shared
#           strings are safe to pass.
#
# Returns a new String.
def html_to_markdown(content)
  preserve = %w(table tr th td)

  shielded = preserve.inject(content) do |html, tag|
    html.gsub(%r!<#{tag}!i, "$$" + tag).gsub(%r!<\/#{tag}!i, "||" + tag)
  end

  text = Nokogiri::HTML(shielded.gsub("'", "''")).text

  preserve.inject(text) do |restored, tag|
    restored.gsub("$$" + tag, "<" + tag).gsub("||" + tag, "</" + tag)
  end
end
# Adds pygments highlight tags to code blocks in posts that use
# markdown format. This doesn't guess the language of the code
# block, so you should modify this to suit your own content.
# For example, the original author's code blocks only contain Python and
# JavaScript, so a block starts out as Python and is switched to
# JavaScript once a line examined inside it ends with a semi-colon.
#
# content      - String post body; it is split on "\n" and scanned line
#                by line.
# redirect_dir - String directory prefix; "index.html" under it is copied
#                one level up for every line handled inside a block.
#
# Returns the processed lines joined with "\n".
def add_syntax_highlights(content, redirect_dir)
lines = content.split("\n")
# Truthy while the scan is inside a run of indented lines.
block = false
# NOTE(review): as written this matches a single leading space. Markdown
# code blocks are conventionally indented by four spaces - confirm the
# pattern has not lost whitespace.
indent = %r!^ !
lang = nil
# Index of the line that opened the current block.
start = nil
lines.each_with_index do |line, i|
if !block && line =~ indent
# First indented line: open a block and assume Python until told otherwise.
block = true
lang = "python"
start = i
elsif block
lang = "javascript" if line =~ %r!;$!
# The block continues only while lines stay indented and this is not the
# final line of the content.
block = line =~ indent && i < lines.size - 1 # Also handle EOF
unless block
# NOTE(review): these assignments overwrite the block's first line and
# the line before the current one rather than inserting new lines, so
# the original text of those two lines is replaced by the tags -
# confirm this is intended.
lines[start] = "{% highlight #{lang} %}"
lines[i - 1] = "{% endhighlight %}"
end
# NOTE(review): runs once per line handled in this branch; the reason for
# copying the redirect page to the parent directory is not evident from
# this method, and presumably it fails when the source file is missing -
# verify against the callers.
FileUtils.cp(redirect_dir + "index.html", redirect_dir + "../" + "index.html")
# Remove the leading indent from the current line.
lines[i] = lines[i].sub(indent, "")
end
end
lines.join("\n")
end
# Optionally downloads a photo into "tumblr_files/" and returns the URL the
# post should reference.
#
# url - String URL of the photo.
# ext - String file extension (with leading dot, or "") appended to the
#       local file name when the name does not already end with it.
#
# When @grab_images is falsy nothing is downloaded and `url` is returned
# unchanged. Otherwise the photo is stored under "tumblr_files/" (skipping
# the download when a non-empty file is already there) and the
# site-relative path "/tumblr_files/<file>" is returned.
#
# Raises OpenURI::HTTPError when the download fails.
def save_photo(url, ext)
  return url unless @grab_images

  path = "tumblr_files/#{url.split("/").last}"
  # Literal suffix check; a regexp built from ext would treat "." as a wildcard.
  path += ext unless path.end_with?(ext)
  FileUtils.mkdir_p "tumblr_files"

  # Don't fetch if we've already cached this file
  unless File.size? path
    puts "Fetching photo #{url}"
    # URI#open (open-uri) rather than Kernel#open, which no longer handles
    # URLs on Ruby 3.0+. Download before opening the target so a failed
    # request does not leave an empty file behind.
    data = URI.parse(url).open(&:read)
    File.open(path, "wb") { |f| f.write(data) }
  end

  "/" + path
end
end
end
end
end