Skip to content

Commit

Permalink
url fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Sanath Kumar committed Apr 8, 2009
1 parent a40e703 commit 563d68a
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 0 deletions.
51 changes: 51 additions & 0 deletions fix_broken_url.rb
@@ -0,0 +1,51 @@
# Returns the corrected href for +href+, or nil when no repair applies.
#
# A href that contains one of +known_urls+ (typically with stray characters
# around it) is replaced by that canonical URL; otherwise a href that starts
# with "www" gets an "http://" scheme prepended. A href that is already
# correct yields nil so that callers do not rewrite untouched content.
def broken_url_replacement(href, known_urls)
  original = href.to_s
  cleaned = original.strip

  # Plain substring test: the URLs are literals, not patterns, so "." must
  # not be treated as a regexp wildcard.
  canonical = known_urls.find { |url| cleaned.include?(url) }
  replacement =
    if canonical
      canonical
    elsif cleaned =~ /\Awww/ # \A anchors at string start; ^ would match any line
      "http://#{cleaned}"
    end

  replacement unless replacement.nil? || replacement == original
end

# Rewrites every repairable <a href> in +html_doc+ in place and returns the
# list of new href values (empty when nothing had to change).
def repair_broken_links(html_doc, known_urls)
  repaired = []
  (html_doc / 'a').each do |link|
    href_attr = link.attributes['href']
    next unless href_attr # anchors without a href have nothing to repair

    replacement = broken_url_replacement(href_attr.to_s, known_urls)
    next unless replacement

    href_attr.value = replacement
    repaired << replacement
  end
  repaired
end

# Repairs broken hyperlinks in the imported WordPress posts and comments.
#
# Posts: hrefs containing one of the known canonical URLs are reset to that
# URL, and hrefs starting with "www" get an "http://" scheme.
# Comments: only the "www" repair is applied.
#
# Every repaired link is echoed to stdout and written to broken_url_fix.log.
# A record is re-serialised and saved once, and only if one of its links
# actually changed.
#
# NOTE(review): content is parsed with Nokogiri::HTML and written back with
# inner_html; for a full-document parse this may wrap the stored content in
# <html>/<body> markup — confirm against the Nokogiri version in use.
def fix_broken_url
  logger = Logger.new('broken_url_fix.log')

  known_urls = [
    'http://en.wikipedia.org/wiki/Hawthorne_effect',
    'http://www.nytimes.com/library/review/120698science-myths-review.html'
  ]

  WpPost.find(:all).each do |post|
    html_doc = Nokogiri::HTML(post.post_content)
    repaired = repair_broken_links(html_doc, known_urls)
    next if repaired.empty?

    repaired.each do |href|
      message = "Updating post content for -#{post.post_name} - #{href}"
      puts message
      logger.error message
    end
    post.post_content = html_doc.inner_html
    post.save
  end

  WpComment.find(:all).each do |comment|
    html_doc = Nokogiri::HTML(comment.comment_content)
    # Comments only receive the missing-scheme repair, hence no known URLs.
    repaired = repair_broken_links(html_doc, [])
    next if repaired.empty?

    repaired.each do |href|
      message = "Updating comment content for -#{comment.comment_ID} - #{href}"
      puts message
      logger.error message
    end
    comment.comment_content = html_doc.inner_html
    comment.save
  end
end
5 changes: 5 additions & 0 deletions global_settings_example.rb
Expand Up @@ -52,6 +52,11 @@ class WpUser < ActiveRecord::Base
set_table_name "wp_users"
end

# ActiveRecord model mapped to the "wp_comments" table, using the
# "comment_ID" column as its primary key.
class WpComment < ActiveRecord::Base
  set_table_name "wp_comments"
  set_primary_key 'comment_ID'
end

# Monkey patch to avoid timeout errors
module Net
class BufferedIO
Expand Down
4 changes: 4 additions & 0 deletions process_script.rb
Expand Up @@ -5,6 +5,7 @@
require 'update_links.rb'
require 'configure_wordpress.rb'
require 'update_users.rb'
require 'fix_broken_url.rb'

# Report progress, then call split with the export filename, the split-file
# path and the allowed length (all read from globals). The return value is
# kept in dest_files (presumably the generated files — confirm against split).
puts "Starting import file split"
dest_files = split($wordpress_export_filename, $split_file_path, $allowed_length)
Expand Down Expand Up @@ -33,3 +34,6 @@
# Report progress and run the WordPress configuration step.
puts "Configuring wordpress"
configure_wordpress

# Report progress and run the broken-URL repair step (fix_broken_url).
puts "Fixing broken urls"
fix_broken_url

0 comments on commit 563d68a

Please sign in to comment.