-
Notifications
You must be signed in to change notification settings - Fork 334
Expand file tree
/
Copy pathwordpressdotcom.rb
More file actions
233 lines (200 loc) · 7.61 KB
/
wordpressdotcom.rb
File metadata and controls
233 lines (200 loc) · 7.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# frozen_string_literal: true
module JekyllImport
module Importers
class WordpressDotCom < Importer
# Loads every library this importer relies on by handing the full list of
# names to JekyllImport.require_with_fallback in a single call.
def self.require_deps
  dependencies = %w[rubygems fileutils safe_yaml nokogiri time open-uri open_uri_redirections]
  JekyllImport.require_with_fallback(dependencies)
end
# Declares the command-line switches understood by this importer.
#
# c - command object; each switch is registered through
#     c.option(key, switch, description), in the order listed below.
#
# Returns whatever the final c.option call returns.
def self.specify_options(c)
  switches = [
    ["source", "--source FILE", "WordPress export XML file (default: 'wordpress.xml')"],
    ["no_fetch_images", "--no-fetch-images", "Do not fetch the images referenced in the posts (default: false)"],
    ["assets_folder", "--assets_folder FOLDER", "Folder where assets such as images will be downloaded to (default: 'assets')"],
  ]
  switches.map { |key, switch, description| c.option(key, switch, description) }.last
end
# Downloads the images referenced by a post into assets_folder and points
# each <img> at its local copy.
#
# Will modify post DOM tree: every processed image gets its "src" replaced
# with "{{site.baseurl}}/<assets_folder>/<basename>", whether or not the
# download itself succeeds.
#
# title         - post title, used only in log output.
# post_doc      - parsed post document; must respond to css("img").
# assets_folder - directory the image files are written to (created on demand).
#
# Download errors are logged and do not stop the remaining images.
def self.download_images(title, post_doc, assets_folder)
  images = post_doc.css("img")
  return if images.empty?

  Jekyll.logger.info "Downloading:", "images for #{title}"
  images.each do |i|
    src = i["src"]
    # An <img> without a src has nothing to fetch; skipping it keeps a single
    # malformed tag from raising here and failing the whole post.
    next if src.nil? || src.empty?

    uri = URI::DEFAULT_PARSER.escape(src)
    dst = File.join(assets_folder, File.basename(uri))
    i["src"] = File.join("{{site.baseurl}}", dst)
    Jekyll.logger.info uri
    if File.exist?(dst)
      Jekyll.logger.info "Already in cache. Clean assets folder if you want a redownload."
      next
    end
    begin
      FileUtils.mkdir_p assets_folder
      OpenURI.open_uri(uri, :allow_redirections => :safe) do |f|
        File.open(dst, "wb") do |out|
          # write, not puts: puts appends a newline whenever the data does not
          # already end in one, which corrupts binary image files.
          out.write f.read
        end
      end
      Jekyll.logger.info "OK!"
    rescue StandardError => e
      Jekyll.logger.error "Error: #{e.message}"
      Jekyll.logger.error e.backtrace.join("\n")
    end
  end
end
# Wraps a single <item> node of the export and exposes the fields the
# importer needs. Readers cache their value after the first access.
class Item
  # node - the parsed <item> node to read from.
  #
  # Raises RuntimeError when node is nil.
  def initialize(node)
    raise "Node is nil" if node.nil?

    @node = node
  end

  # Returns the text of the child element called path, or "" when the item
  # has no such child.
  #
  # Three lookups are tried in order: a relative XPath, node.at, and a plain
  # name comparison against the direct children.
  def text_for(path)
    subnode = @node.at_xpath("./#{path}") || @node.at(path) || @node.children.find { |child| child.name == path }
    # A missing element is reported as empty text instead of raising
    # NoMethodError on nil, so optional fields do not abort the import.
    subnode ? subnode.text : ""
  end

  # Returns the item title with surrounding whitespace removed.
  def title
    @title ||= text_for("title").strip
  end

  # Returns the slug used in the output file name.
  def permalink_title
    @permalink_title ||= begin
      post_name = text_for("wp:post_name")
      # Fallback to "prettified" title if post_name is empty (can happen)
      post_name.empty? ? WordpressDotCom.sluggify(title) : post_name
    end
  end

  # Returns the path component of the item's link.
  def permalink
    @permalink ||= begin
      uri = text_for("link")
      # When <link> itself has no text, the URL is taken from the node that
      # follows it; nil-safe so an item without any <link> yields "".
      uri = @node.at("link")&.next_sibling&.text if uri.empty?
      URI(uri.to_s.strip).path
    end
  end

  # Returns the parsed wp:post_date for published items, nil otherwise.
  def published_at
    @published_at ||= Time.parse(text_for("wp:post_date")) if published?
  end

  # Returns the raw wp:status text.
  def status
    @status ||= text_for("wp:status")
  end

  # Returns the raw wp:post_password text.
  def post_password
    @post_password ||= text_for("wp:post_password")
  end

  # Returns the raw wp:post_type text.
  def post_type
    @post_type ||= text_for("wp:post_type")
  end

  # Returns the raw wp:post_parent text.
  def parent_id
    @parent_id ||= text_for("wp:post_parent")
  end

  # Returns the output file name: "YYYY-MM-DD-<slug>.html" for published
  # items, "<slug>.html" otherwise.
  def file_name
    @file_name ||= if published?
                     "#{published_at.strftime("%Y-%m-%d")}-#{permalink_title}.html"
                   else
                     "#{permalink_title}.html"
                   end
  end

  # Returns the output directory: "_drafts" for unpublished posts,
  # "_<post_type>s" for everything else.
  def directory_name
    @directory_name ||= if !published? && post_type == "post"
                          "_drafts"
                        else
                          "_#{post_type}s"
                        end
  end

  # Returns true when the status is exactly "publish".
  def published?
    # defined? instead of ||= so that a cached false is honoured too.
    return @published if defined?(@published)

    @published = (status == "publish")
  end

  # Returns the plain-text excerpt, or nil when it is empty.
  def excerpt
    @excerpt ||= begin
      text = Nokogiri::HTML(text_for("excerpt:encoded")).text
      text.empty? ? nil : text
    end
  end
end
# Imports every <item> of a WordPress.com export file as a Jekyll document.
#
# options - Hash of importer options:
#           "source"          - path of the export XML (default: "wordpress.xml").
#           "no_fetch_images" - when truthy, images are not downloaded (default: false).
#           "assets_folder"   - directory images are downloaded to (default: "assets").
#
# Each item is written to "<directory_name>/<file_name>" as YAML front matter
# followed by its content. An item that fails is logged and skipped; the number
# of imported items per post type is logged at the end.
def self.process(options)
  source = options.fetch("source", "wordpress.xml")
  fetch = !options.fetch("no_fetch_images", false)
  assets_folder = options.fetch("assets_folder", "assets")
  FileUtils.mkdir_p(assets_folder)

  import_count = Hash.new(0)
  doc = Nokogiri::XML(File.read(source))

  # Fetch authors data from header. Author data is best-effort: if it cannot
  # be read, the import continues with no authors, but the reason is logged
  # rather than silently discarded.
  authors = begin
    doc.xpath("//channel/wp:author").map do |author|
      login = author.xpath("./wp:author_login").text.strip
      [
        login,
        {
          "login"        => login,
          "email"        => author.xpath("./wp:author_email").text,
          "display_name" => author.xpath("./wp:author_display_name").text,
          "first_name"   => author.xpath("./wp:author_first_name").text,
          "last_name"    => author.xpath("./wp:author_last_name").text,
        },
      ]
    end.to_h
  rescue StandardError => e
    Jekyll.logger.warn "Authors:", "could not be read (#{e.message}); importing without author data"
    {}
  end

  doc.css("channel > item").each do |node|
    item = Item.new(node)

    # Everything that reads from the item lives inside this begin block, so a
    # single malformed item is reported and skipped instead of aborting the
    # whole import.
    begin
      categories = node.css('category[domain="category"]').map(&:text).reject { |c| c == "Uncategorized" }.uniq
      tags = node.css('category[domain="post_tag"]').map(&:text).uniq

      metas = {}
      node.xpath("./wp:postmeta").each do |meta|
        key = meta.at_xpath("./wp:meta_key").text
        value = meta.at_xpath("./wp:meta_value").text
        metas[key] = value
      end

      author_login = item.text_for("dc:creator").strip

      header = {
        "layout"     => item.post_type,
        "title"      => item.title,
        "date"       => item.published_at,
        "type"       => item.post_type,
        "parent_id"  => item.parent_id,
        "published"  => item.published?,
        "password"   => item.post_password,
        "status"     => item.status,
        "categories" => categories,
        "tags"       => tags,
        "meta"       => metas,
        "author"     => authors[author_login],
        "permalink"  => item.permalink,
      }

      content = Nokogiri::HTML(item.text_for("content:encoded"))
      header["excerpt"] = item.excerpt if item.excerpt

      if fetch
        # Put the images into a /yyyy/mm/ subfolder to reduce clashes
        assets_dir_path = if item.published_at
                            File.join(assets_folder, item.published_at.strftime("/%Y/%m"))
                          else
                            assets_folder
                          end

        download_images(item.title, content, assets_dir_path)
      end

      FileUtils.mkdir_p item.directory_name
      File.open(File.join(item.directory_name, item.file_name), "w") do |f|
        f.puts header.to_yaml
        f.puts "---"
        f.puts Util.wpautop(content.to_html)
      end
    rescue StandardError => e
      Jekyll.logger.error "Couldn't import post!"
      Jekyll.logger.error "Title: #{item.title}"
      Jekyll.logger.error "Name/Slug: #{item.file_name}\n"
      Jekyll.logger.error "Error: #{e.message}"
      next
    end

    import_count[item.post_type] += 1
  end

  import_count.each do |key, value|
    Jekyll.logger.info "Imported", "#{value} #{Util.pluralize(key, value)}"
  end
end
# Builds a lowercase slug from a title: each run of non-alphanumeric
# characters becomes a single "-", then the result is downcased.
def self.sluggify(title)
  hyphenated = title.gsub(%r![^[:alnum:]]+!, "-")
  hyphenated.downcase
end
end
end
end