Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

New script to export bbPress content to pure WordPress objects.

  • Loading branch information...
commit 701d5dfa160aca642fc0ff68eccf584f7a08d958 1 parent d1094b7
Kevin Deldycke authored

Showing 1 changed file with 241 additions and 0 deletions. Show diff stats Hide diff stats

  1. +241 0 bbpress-to-wordpress.py
241 bbpress-to-wordpress.py
... ... @@ -0,0 +1,241 @@
  1 +#!/usr/bin/python
  2 +# -*- coding: UTF-8 -*-
  3 +
  4 +"""
  5 +This script allows you to export bbPress content (forums, topics & replies) to pure WordPress objects (pages & comments).
  6 +
  7 +It browses the MySQL database of a bbPress instance and generates an XML file. The XML produced is a WXR file (WordPress eXtended RSS), which can be imported into a plain WordPress site.
  8 +
  9 +A bbPress thread is imported as an empty page with the thread's title. All its replies are imported as comments of that page.
  10 +A top-level page is then created for each forum, and all its threads are linked from that parent page.
  11 +
  12 +The script requires the following python modules:
  13 + * lxml
  14 + * PyMySQL
  15 +
  16 +These can easily be installed on Debian with the following commands:
  17 + $ aptitude install python-pip python-lxml
  18 + $ pip install PyMySQL
  19 +"""
  20 +
  21 +import time
  22 +import random
  23 +import pymysql
  24 +import operator
  25 +import email.utils
  26 +import unicodedata
  27 +from lxml import etree
  28 +from datetime import datetime
  29 +
  30 +
## Configuration
#
# Edit the values below to match your own installation before running the
# script.

# MySQL server holding the bbPress data.
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''

# Database name, and the prefix prepended to every table name that is queried.
BBPRESS_DB = 'mysite'
BBPRESS_TABLE_PREFIX = 'wp_'

# Base URL used to build the links and GUIDs written to the export.
WORDPRESS_ROOT_URL = 'http://mysite.example.com'

# Path of the XML file produced by the script.
XML_FILEPATH = './bbpress-export.xml'

# List of user IDs which can create a page
REGISTERED_USER_IDS = [1, 3, 4, 5, 6, 7, 76, 77]

## End of configuration
  49 +
  50 +
# Namespace URIs declared on the root element of the generated document.
NS_EXCERPT = "http://wordpress.org/export/1.2/excerpt/"
NS_CONTENT = "http://purl.org/rss/1.0/modules/content/"
NS_WFW = "http://wellformedweb.org/CommentAPI/"
NS_DC = "http://purl.org/dc/elements/1.1/"
NS_WP = "http://wordpress.org/export/1.2/"

# "{uri}" prefixes: concatenated with a local tag name they give the
# namespace-qualified tag passed to etree (e.g. WP + "post_id").
EXCERPT = "{" + NS_EXCERPT + "}"
CONTENT = "{" + NS_CONTENT + "}"
WFW = "{" + NS_WFW + "}"
DC = "{" + NS_DC + "}"
WP = "{" + NS_WP + "}"

# Prefix -> URI mapping handed to the root element as its nsmap.
NSMAP = dict(
    excerpt=NS_EXCERPT,
    content=NS_CONTENT,
    wfw=NS_WFW,
    dc=NS_DC,
    wp=NS_WP,
)
  70 +
# Open the connection to the bbPress database and grab a cursor. Both are
# module-level globals: query() runs its statements on `cr`, and both are
# closed at the very end of the script.
conn = pymysql.connect(
    host=MYSQL_HOST,
    port=MYSQL_PORT,
    user=MYSQL_USER,
    passwd=MYSQL_PASSWORD,
    db=BBPRESS_DB,
)
cr = conn.cursor()
  73 +
def query(table_name, columns, extra=''):
    """
    Utility method to query the database.

    Runs ``SELECT <columns> FROM <BBPRESS_TABLE_PREFIX><table_name> <extra>``
    on the module-level cursor ``cr`` and returns every row.

    :param table_name: name of the table, without the configured prefix.
    :param columns: list of column names to select; they also become the keys
        of each returned dict.
    :param extra: raw SQL appended verbatim after the table name (for example
        a ``WHERE`` clause).
    :return: a list with one ``{column: value}`` dict per row.

    The statement is assembled by plain string interpolation, so all three
    arguments must come from trusted code, never from user input.
    """
    q = "SELECT %s FROM %s%s %s" % (
        ', '.join(["`%s`" % c for c in columns]),
        BBPRESS_TABLE_PREFIX,
        table_name,
        extra,
    )
    cr.execute(q)

    results = []
    for row in cr.fetchall():
        cleaned_row_values = []
        for value in row:
            # Test against `bytes`, not `str`: on Python 2 the two names are
            # the same type, while on Python 3 only bytes objects have a
            # decode() method (calling it on a str raises AttributeError).
            if isinstance(value, bytes):
                try:
                    value = value.decode('UTF-8')
                except UnicodeDecodeError:
                    # Not valid UTF-8: fall back to latin-1, which accepts
                    # any byte sequence.
                    value = value.decode('latin-1')
            cleaned_row_values.append(value)
        results.append(dict(zip(columns, cleaned_row_values)))
    return results
  97 +
def clean_text(s):
    """
    Utility method to clean up multi-line HTML text.

    Converts Windows line endings to ``\\n``, strips leading and trailing
    whitespace, then turns every remaining newline into ``<br />``.
    """
    return s.replace('\r\n', '\n').strip().replace('\n', "<br />")


def rfc822_date(d):
    """
    Format the datetime ``d`` as an RFC 822 style date string.

    ``d`` is converted to a timestamp with ``time.mktime()``, so it is
    interpreted in the local timezone of the machine running the script.
    """
    return email.utils.formatdate(time.mktime(d.timetuple()))


def normalize_url(s):
    """
    Build a URL slug from the unicode string ``s``.

    The text is decomposed (NFKD) and every non-ASCII character is dropped,
    which strips accents from letters. The result is lower-cased, each run of
    non-alphanumeric characters becomes a single ``-``, and leading/trailing
    dashes are removed.
    """
    # Decode back to text after the ASCII round-trip: iterating over a bytes
    # object yields integers on Python 3, which have no isalnum() method.
    ascii_text = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii').lower()
    dashed = ''.join([c if c.isalnum() else '-' for c in ascii_text])
    # Splitting on '-' and dropping empty pieces collapses repeated dashes.
    return '-'.join([w for w in dashed.split('-') if w])
  102 +
# XML items: every <item> element that will end up in the exported channel.
items = []

# Database extraction: published topics and replies, plus all user accounts.
topics = query(
    'posts',
    ['ID', 'post_title', 'post_content', 'post_author', 'post_date', 'post_date_gmt', 'post_status'],
    "WHERE post_type='topic' AND post_status='publish'",
)
replies = query(
    'posts',
    ['ID', 'post_title', 'post_content', 'post_author', 'post_date', 'post_date_gmt', 'post_parent'],
    "WHERE post_type='reply' AND post_status='publish'",
)

# Index the user rows by their ID for direct lookup from a post's author.
users = {}
for user_row in query('users', ['ID', 'user_login', 'display_name', 'user_email', 'user_url']):
    users[user_row['ID']] = user_row

# Add an anonymous user
users[0] = {'ID': 0, 'display_name': "Anonymous", 'user_email': "", 'user_url': ""}
  112 +
# Utility to get a new ID while avoiding collisions.
# Every topic, reply and user ID already in use is reserved up front. A set
# keeps the membership test in get_new_id() constant-time however many IDs
# have been reserved.
reserved_ids = set(t['ID'] for t in topics)
reserved_ids.update(r['ID'] for r in replies)
reserved_ids.update(users)


def get_new_id():
    """
    Return a random integer ID that is not reserved yet.

    Candidates are drawn between 0 and 99999999 (inclusive) until one is found
    that is absent from ``reserved_ids``; it is then reserved, so later calls
    can never return it again.
    """
    while True:
        new_id = random.randint(0, 99999999)
        if new_id not in reserved_ids:
            reserved_ids.add(new_id)
            return new_id
  123 +
# Create a top-level page to be the common parent of all threads.
#
# Both dates are derived from one whole-second timestamp:
#  * `now` is local time, `now_gmt` is the same instant in UTC, so
#    wp:post_date_gmt really holds a GMT value instead of a copy of the
#    local time;
#  * there are no microseconds, so isoformat(' ') yields
#    "YYYY-MM-DD HH:MM:SS" rather than a value with a fractional-second
#    suffix.
_now_ts = int(time.time())
now = datetime(*time.localtime(_now_ts)[:6])
now_gmt = datetime(*time.gmtime(_now_ts)[:6])

forum_page_id = str(get_new_id())
forum_url = "%s/?p=%s" % (WORDPRESS_ROOT_URL, forum_page_id)
forum_slug = "forum"
forum_link = "%s/%s/" % (WORDPRESS_ROOT_URL, forum_slug)

forum_page = etree.Element("item")
etree.SubElement(forum_page, "title").text = "Forum Archive"
etree.SubElement(forum_page, "link").text = forum_link
etree.SubElement(forum_page, WP + "post_name").text = forum_slug
etree.SubElement(forum_page, "guid", attrib={"isPermaLink": "false"}).text = forum_url
etree.SubElement(forum_page, "pubDate").text = rfc822_date(now)
etree.SubElement(forum_page, DC + "creator").text = 'admin'
etree.SubElement(forum_page, WP + "post_id").text = forum_page_id
etree.SubElement(forum_page, WP + "post_date").text = now.isoformat(' ')
etree.SubElement(forum_page, WP + "post_date_gmt").text = now_gmt.isoformat(' ')
etree.SubElement(forum_page, WP + "status").text = "publish"
etree.SubElement(forum_page, WP + "post_type").text = "page"
# The archive page itself takes no comments or pings; its content (the list
# of topics) is attached later, once all topics have been processed.
etree.SubElement(forum_page, WP + "comment_status").text = "closed"
etree.SubElement(forum_page, WP + "ping_status").text = "closed"
  144 +
# Summary (link, title, date) of each exported topic, used afterwards to build
# the content of the top-level forum page.
topic_list = []

# Group the replies by parent topic once, instead of scanning the whole reply
# list again for every topic. Replies keep the order in which the database
# returned them.
replies_by_topic = {}
for reply in replies:
    replies_by_topic.setdefault(reply['post_parent'], []).append(reply)

for topic in topics:
    # Prepare content
    topic_author = users[topic['post_author']]
    # Work on a copy: a synthetic first comment may be inserted below.
    topic_replies = list(replies_by_topic.get(topic['ID'], []))
    topic_url = "%s/?p=%s" % (WORDPRESS_ROOT_URL, topic['ID'])
    topic_slug = normalize_url(topic['post_title'])
    topic_link = "%s%s/" % (forum_link, topic_slug)

    # Save topics as pages
    page = etree.Element("item")
    etree.SubElement(page, "title").text = topic['post_title']
    etree.SubElement(page, "link").text = topic_link
    etree.SubElement(page, WP + "post_name").text = topic_slug
    etree.SubElement(page, "guid", attrib={"isPermaLink": "false"}).text = topic_url
    etree.SubElement(page, "pubDate").text = rfc822_date(topic['post_date'])
    etree.SubElement(page, WP + "post_id").text = str(topic['ID'])
    etree.SubElement(page, WP + "post_date").text = topic['post_date'].isoformat(' ')
    etree.SubElement(page, WP + "post_date_gmt").text = topic['post_date_gmt'].isoformat(' ')
    etree.SubElement(page, WP + "status").text = topic['post_status']
    etree.SubElement(page, WP + "post_type").text = "page"
    etree.SubElement(page, WP + "post_parent").text = forum_page_id
    etree.SubElement(page, WP + "comment_status").text = "open"
    etree.SubElement(page, WP + "ping_status").text = "closed"

    # If the user is allowed to create a page, then the content of the topic
    # is put in the page itself, else we create a new comment.
    if topic_author['ID'] in REGISTERED_USER_IDS:
        etree.SubElement(page, DC + "creator").text = topic_author['user_login']
        etree.SubElement(page, CONTENT + "encoded").text = etree.CDATA(clean_text(topic['post_content']))
    else:
        # The page is attributed to 'admin' and left without content; the
        # topic's text becomes the first comment, under a freshly generated
        # ID and with the original author and dates.
        etree.SubElement(page, DC + "creator").text = 'admin'
        topic_replies.insert(0, {
            'post_author': topic['post_author'],
            'ID': get_new_id(),
            'post_date': topic['post_date'],
            'post_date_gmt': topic['post_date_gmt'],
            'post_content': topic['post_content'],
        })

    # Save replies as comments
    for reply in topic_replies:
        author = users[reply['post_author']]
        comment = etree.Element(WP + "comment")
        etree.SubElement(comment, WP + "comment_id").text = str(reply['ID'])
        etree.SubElement(comment, WP + "comment_approved").text = "1"
        etree.SubElement(comment, WP + "comment_author").text = etree.CDATA(author['display_name'])
        etree.SubElement(comment, WP + "comment_author_email").text = author['user_email']
        etree.SubElement(comment, WP + "comment_author_url").text = author['user_url']
        etree.SubElement(comment, WP + "comment_author_IP").text = "127.0.0.1"
        etree.SubElement(comment, WP + "comment_type").text = ""
        etree.SubElement(comment, WP + "comment_parent").text = ""
        etree.SubElement(comment, WP + "comment_date").text = reply['post_date'].isoformat(' ')
        etree.SubElement(comment, WP + "comment_date_gmt").text = reply['post_date_gmt'].isoformat(' ')
        etree.SubElement(comment, WP + "comment_content").text = etree.CDATA(clean_text(reply['post_content']))
        page.append(comment)

    items.append(page)

    # Save some topic information for later
    topic_list.append({
        'link': topic_link,
        'title': topic['post_title'],
        'date': topic['post_date_gmt'],
    })
  211 +
# As content of top-level forum page, list all topics: newest first, grouped
# under one <h2> heading per year, each year followed by a <ul> of links.
current_year = None
forum_chunks = []
# Ascending sort followed by reverse() gives the newest-first order.
topic_list.sort(key=operator.itemgetter('date'))
topic_list.reverse()
for t in topic_list:
    if current_year != t['date'].year:
        # Entering a new year: close the previous year's list, if any, then
        # open a new heading and list.
        if current_year is not None:
            forum_chunks.append("</ul>\n\n")
        current_year = t['date'].year
        forum_chunks.append("<h2>%s</h2>\n<ul>\n" % current_year)
    forum_chunks.append("<li><a href='%s'>%s</a></li>\n" % (t['link'], t['title']))
# Only close a list that was actually opened: with no topic at all the page
# content stays empty instead of holding a stray "</ul>".
if current_year is not None:
    forum_chunks.append("</ul>\n")
forum_content = "".join(forum_chunks)
etree.SubElement(forum_page, CONTENT + "encoded").text = etree.CDATA(forum_content)
items.append(forum_page)
  227 +
# Generate the final XML document: <rss version="2.0"> wrapping a single
# <channel> that holds the WXR version marker followed by every item.
channel = etree.Element("channel")
etree.SubElement(channel, WP + "wxr_version").text = "1.2"
for item in items:
    channel.append(item)
root = etree.Element("rss", attrib={"version": "2.0"}, nsmap=NSMAP)
root.append(channel)

# tostring() called with an explicit encoding returns an encoded byte string,
# so the file is opened in binary mode: a text-mode file rejects bytes on
# Python 3. The `with` block closes the file even if the write fails.
xml_bytes = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding='UTF-8')
with open(XML_FILEPATH, 'wb') as f:
    f.write(xml_bytes)

# Release the database resources opened at the top of the script.
cr.close()
conn.close()

0 comments on commit 701d5df

Please sign in to comment.
Something went wrong with that request. Please try again.