Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge pull request #56 from markheath/blogger_import

Blogger import
  • Loading branch information...
commit 958cb116c9828c4edae693b366f588f64e05571a 2 parents 7bcd0e1 + 0233ecc
Paul Jenkins authored
View
125 src/Pretzel.Logic/Import/BloggerImport.cs
@@ -0,0 +1,125 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.IO.Abstractions;
+using System.Xml.Linq;
+using Pretzel.Logic.Extensions;
+using System.IO;
+using System.Xml;
+using System.Text.RegularExpressions;
+
+namespace Pretzel.Logic.Import
+{
+ /// <summary>
+ /// Imports the posts of a Blogger Atom export file into the <c>_posts</c> folder of a site,
+ /// writing one markdown file (YAML header followed by the converted content) per post.
+ /// </summary>
+ public class BloggerImport
+ {
+     // Value of the "term" attribute on the category that marks an entry as a blog post.
+     private const string PostKindTerm = "http://schemas.google.com/blogger/2008/kind#post";
+
+     // Scheme of the categories that carry the labels attached to a post.
+     private const string LabelScheme = "http://www.blogger.com/atom/ns#";
+
+     private static readonly XNamespace Atom = "http://www.w3.org/2005/Atom";
+
+     private readonly IFileSystem fileSystem;
+     private readonly string pathToSite;
+     private readonly string pathToImportFile;
+
+     /// <summary>
+     /// Creates an importer.
+     /// </summary>
+     /// <param name="fileSystem">File system used both to read the export and to write the posts.</param>
+     /// <param name="pathToSite">Root folder of the site; posts are written to its <c>_posts</c> sub-folder.</param>
+     /// <param name="pathToImportFile">Path of the Blogger Atom export file.</param>
+     public BloggerImport(IFileSystem fileSystem, string pathToSite, string pathToImportFile)
+     {
+         this.fileSystem = fileSystem;
+         this.pathToSite = pathToSite;
+         this.pathToImportFile = pathToImportFile;
+     }
+
+     /// <summary>
+     /// Reads the export file and writes every entry that is tagged as a post.
+     /// Entries of other kinds are skipped.
+     /// </summary>
+     public void Import()
+     {
+         var xml = fileSystem.File.ReadAllText(pathToImportFile);
+         var root = XElement.Parse(xml);
+
+         // key bits of the atom xml format:
+         // <feed>
+         //   <category /> - can be many
+         //   <title /> = title of blog
+         //   <author><name /></author>
+         //   <entry>
+         //     <id/>
+         //     <category scheme='http://schemas.google.com/g/2005#kind' term='http://schemas.google.com/blogger/2008/kind#post'/>
+         //     <category scheme='http://www.blogger.com/atom/ns#' term='A Category'/>
+         //     <published/> formatted like this: 2007-02-01T14:01:23.326Z
+         //     <updated/>
+         //     <title/>
+         //     <content type='html'/>
+         //     <author><name /></author>
+         //   </entry>
+
+         var posts = from e in root.Descendants(Atom + "entry")
+                     where IsPost(e)
+                     select new BloggerPost
+                     {
+                         // The explicit string conversion yields null for a missing element
+                         // instead of throwing, so an entry without a title still imports.
+                         Title = (string)e.Element(Atom + "title") ?? string.Empty,
+                         Published = ParseTimestamp(e.Element(Atom + "published")),
+                         Updated = ParseTimestamp(e.Element(Atom + "updated")),
+                         Content = ConvertToMarkdown((string)e.Element(Atom + "content") ?? string.Empty),
+                         // blogger categories are more like tags
+                         Tags = ReadLabels(e)
+                     };
+
+         foreach (var p in posts)
+         {
+             ImportPost(p);
+         }
+     }
+
+     // True when the entry carries the category whose term marks it as a blog post.
+     private static bool IsPost(XElement entry)
+     {
+         return entry.Elements(Atom + "category")
+                     .Any(c => (string)c.Attribute("term") == PostKindTerm);
+     }
+
+     // Terms of the label categories of an entry; categories lacking either attribute are ignored.
+     private static IEnumerable<string> ReadLabels(XElement entry)
+     {
+         return from c in entry.Elements(Atom + "category")
+                where (string)c.Attribute("scheme") == LabelScheme
+                let term = (string)c.Attribute("term")
+                where term != null
+                select term;
+     }
+
+     // Parses an Atom timestamp, keeping the wall-clock time written in the feed.
+     // Convert.ToDateTime would shift the value into the local zone of the machine
+     // running the import, so the post date (and file name) could change by a day
+     // depending on where the import is run.
+     private static DateTime ParseTimestamp(XElement element)
+     {
+         return DateTimeOffset.Parse((string)element, System.Globalization.CultureInfo.InvariantCulture).DateTime;
+     }
+
+     // Converts the HTML body of a post to markdown.
+     private string ConvertToMarkdown(string content)
+     {
+         var converter = new HtmlToMarkdownConverter();
+         return converter.Convert(content);
+     }
+
+     // Writes a single post to <site>/_posts/<yyyy-MM-dd>-<title>.md.
+     // A failure to write is traced and swallowed so the remaining posts are still imported.
+     private void ImportPost(BloggerPost post)
+     {
+         var header = new
+         {
+             title = post.Title,
+             date = post.Published,
+             layout = "post",
+             // Never populated by Import, so this is always null here.
+             categories = post.Categories,
+             tags = post.Tags
+         };
+
+         var yamlHeader = string.Format("---\r\n{0}---\r\n\r\n", header.ToYaml());
+         var postContent = yamlHeader + post.Content;
+
+         // Invariant culture keeps the date prefix Gregorian whatever calendar the current culture uses.
+         var datePrefix = post.Published.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
+         string fileName = string.Format(@"{0}-{1}.md", datePrefix, post.Title); //not sure about post name
+         foreach (char c in System.IO.Path.GetInvalidFileNameChars())
+         {
+             fileName = fileName.Replace(c, '_');
+         }
+         // replace some valid ones too
+         fileName = fileName.Replace(' ', '-');
+         fileName = fileName.Replace('\u00A0', '-');
+
+         try
+         {
+             fileSystem.File.WriteAllText(Path.Combine(pathToSite, Path.Combine("_posts", fileName)), postContent);
+         }
+         catch (Exception e)
+         {
+             Tracing.Info(String.Format("Failed to write out {0}", fileName));
+             Tracing.Debug(e.ToString());
+         }
+     }
+
+     /// <summary>
+     /// The parts of a Blogger entry that end up in the generated post file.
+     /// </summary>
+     protected class BloggerPost
+     {
+         public string Title { get; set; }
+         public DateTime Updated { get; set; }
+         public DateTime Published { get; set; }
+         public string Content { get; set; }
+         public IEnumerable<string> Tags { get; set; }
+         public IEnumerable<string> Categories { get; set; }
+     }
+ }
+}
View
133 src/Pretzel.Logic/Import/HtmlToMarkdownConverter.cs
@@ -0,0 +1,133 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using HtmlAgilityPack;
+using System.Text.RegularExpressions;
+using Pretzel.Logic.Extensions;
+
+namespace Pretzel.Logic.Import
+{
+ /// <summary>
+ /// Converts an HTML fragment to markdown using the HTML agility pack.
+ /// Headings, paragraphs, unordered lists, bold/italic, links, line breaks and
+ /// pre/code blocks are translated; images, block quotes, tables, divs, spans and
+ /// embedded objects are passed through as raw HTML.
+ /// </summary>
+ public class HtmlToMarkdownConverter
+ {
+     /// <summary>
+     /// Converts <paramref name="html"/> to markdown.
+     /// </summary>
+     /// <param name="html">The HTML fragment to convert.</param>
+     /// <returns>The markdown text.</returns>
+     public string Convert(string html)
+     {
+         HtmlDocument doc = new HtmlDocument();
+         StringBuilder markdown = new StringBuilder();
+         doc.LoadHtml(html);
+         // Start every conversion from a clean state so a stray <li> in an earlier
+         // document cannot change the indentation of lists in a later one.
+         listNestingLevel = 0;
+         ProcessNodes(markdown, doc.DocumentNode.ChildNodes);
+         return markdown.ToString();
+     }
+
+     // Matches <br>, <br/> and <br /> in any letter case.
+     private static readonly Regex regexBr = new Regex(@"\<br\s*/?\>",
+         RegexOptions.IgnoreCase
+         | RegexOptions.CultureInvariant
+         | RegexOptions.IgnorePatternWhitespace
+         | RegexOptions.Compiled);
+
+     // Depth of the list currently being written; 0 when outside any list.
+     private int listNestingLevel;
+
+     // Appends the markdown for each node (recursing into children where needed) to the builder.
+     private void ProcessNodes(StringBuilder markdown, IEnumerable<HtmlNode> htmlNodes)
+     {
+         foreach (var htmlNode in htmlNodes)
+         {
+             switch (htmlNode.Name)
+             {
+                 case "#comment":
+                     break;
+                 case "#text":
+                     markdown.Append(htmlNode.InnerText);
+                     break;
+                 case "h1":
+                 case "h2":
+                 case "h3":
+                 case "h4":
+                 case "h5":
+                 case "h6":
+                     // The digit of the tag name is the number of '#' characters to emit.
+                     string hashes = new string('#', htmlNode.Name[1] - '0');
+                     markdown.AppendLine();
+                     markdown.AppendFormat("{0} {1}", hashes, htmlNode.InnerText);
+                     markdown.AppendLine();
+                     break;
+                 case "ul":
+                     markdown.AppendLine();
+                     listNestingLevel++;
+                     ProcessNodes(markdown, htmlNode.ChildNodes);
+                     listNestingLevel--;
+                     if (listNestingLevel < 0) listNestingLevel = 0;
+                     markdown.AppendLine();
+                     break;
+                 case "li":
+                     markdown.AppendLine();
+                     if (listNestingLevel == 0) // missing ul
+                         listNestingLevel = 1;
+                     markdown.AppendFormat("{0}* ", new string(' ', 4 * (listNestingLevel - 1)));
+                     // An <li> nested directly inside this one is indented one level deeper.
+                     listNestingLevel++;
+                     ProcessNodes(markdown, htmlNode.ChildNodes);
+                     listNestingLevel--;
+                     if (listNestingLevel < 0) listNestingLevel = 0;
+                     break;
+                 case "p":
+                     markdown.AppendLine();
+                     ProcessNodes(markdown, htmlNode.ChildNodes);
+                     markdown.AppendLine();
+                     break;
+                 case "b":
+                 case "strong":
+                     var boldText = htmlNode.InnerText;
+                     bool addSpace = false;
+                     // Move a trailing space outside the closing "**" so the emphasis
+                     // markers sit directly against the text.
+                     if (boldText.EndsWith(" "))
+                     {
+                         boldText = boldText.Substring(0, boldText.Length - 1);
+                         addSpace = true;
+                     }
+                     markdown.AppendFormat("**{0}**{1}", boldText, addSpace ? " " : "");
+                     break;
+                 case "i":
+                 case "em":
+                     markdown.AppendFormat("*{0}*", htmlNode.InnerText);
+                     break;
+                 case "br":
+                     markdown.AppendLine();
+                     break;
+                 case "a":
+                     var hrefAttribute = htmlNode.Attributes["href"];
+                     if (hrefAttribute == null)
+                     {
+                         // Anchors without a target (e.g. <a name="...">) have no markdown
+                         // link form; keep them as raw HTML rather than failing.
+                         markdown.Append(htmlNode.OuterHtml);
+                     }
+                     else
+                     {
+                         markdown.AppendFormat("[{0}]({1})", htmlNode.InnerText, hrefAttribute.Value);
+                     }
+                     break;
+                 case "img":
+                 case "blockquote":
+                     // leave html unchanged for now, maybe revisit later
+                     markdown.Append(htmlNode.OuterHtml);
+                     break;
+                 case "object":
+                 case "table":
+                 case "div":
+                 case "span":
+                 case "iframe":
+                 case "embed":
+                     // leave html unchanged
+                     markdown.Append(htmlNode.OuterHtml);
+                     break;
+                 case "pre":
+                 case "code":
+                     var code = htmlNode.InnerText;
+                     // a bit hacky, but we need to sort out where lines of code end
+                     // NOTE(review): InnerText presumably no longer contains <br> elements,
+                     // so the regex below may only match escaped/literal text - confirm.
+                     code = code.Replace("\r\n", "\n");
+                     code = code.Replace("\r", "\n");
+                     code = regexBr.Replace(code, "\n");
+                     var lines = code.Split('\n');
+                     // NOTE(review): markdown code blocks need a four-space indent; the literal
+                     // here is a single space in this view - verify against the repository.
+                     markdown.Append(Environment.NewLine + " ");
+                     markdown.Append(string.Join(Environment.NewLine + " ", lines));
+                     break;
+                 default:
+                     // Unknown element: keep its content and trace the markup that was dropped.
+                     ProcessNodes(markdown, htmlNode.ChildNodes);
+                     Tracing.Info(String.Format("{0}", htmlNode.OuterHtml));
+                     break;
+             }
+         }
+     }
+ }
+}
View
2  src/Pretzel.Logic/Pretzel.Logic.csproj
@@ -83,6 +83,8 @@
<Compile Include="Extensions\StringExtensions.cs" />
<Compile Include="Extensions\Tracing.cs" />
<Compile Include="Extensions\YamlExtensions.cs" />
+ <Compile Include="Import\BloggerImport.cs" />
+ <Compile Include="Import\HtmlToMarkdownConverter.cs" />
<Compile Include="Import\WordpressImport.cs" />
<Compile Include="Minification\FileSystemExtensions.cs" />
<Compile Include="Minification\LessTransform.cs" />
View
83 src/Pretzel.Logic/Templating/Context/SiteContextGenerator.cs
@@ -88,38 +88,69 @@ private void BuildPosts(Dictionary<string, object> config, SiteContext context)
{
foreach (var file in fileSystem.Directory.GetFiles(postsFolder, "*.*", SearchOption.AllDirectories))
{
- var contents = SafeReadContents(file);
- var header = contents.YamlHeader();
- var post = new Page
- {
- Title = header.ContainsKey("title") ? header["title"].ToString() : "this is a post",
- // NOTE: should this be the Site title?
- Date =
- header.ContainsKey("date")
- ? DateTime.Parse(header["date"].ToString())
- : file.Datestamp(),
- Content = Markdown.Transform(contents.ExcludeHeader()),
- Filepath = GetPathWithTimestamp(context.OutputFolder, file),
- File = file,
- Bag = header,
- };
-
- if (header.ContainsKey("permalink"))
- post.Url = EvaluatePermalink(header["permalink"].ToString(), post);
- else if (config.ContainsKey("permalink"))
- post.Url = EvaluatePermalink(config["permalink"].ToString(), post);
-
- if (string.IsNullOrEmpty(post.Url))
- {
- Tracing.Info("whaaa");
- }
- context.Posts.Add(post);
+ BuildPost(config, context, file);
}
context.Posts = context.Posts.OrderByDescending(p => p.Date).ToList();
}
}
+ /// <summary>
+ /// Builds the page for one post file and adds it to <paramref name="context"/>.
+ /// Any exception raised while building is traced and swallowed, so one bad
+ /// file does not stop the remaining posts from being processed.
+ /// </summary>
+ private void BuildPost(Dictionary<string, object> config, SiteContext context, string file)
+ {
+     try
+     {
+         var rawText = SafeReadContents(file);
+         var frontMatter = rawText.YamlHeader();
+
+         var page = new Page();
+
+         // Title from the header when present, otherwise a placeholder.
+         // NOTE: should this be the Site title?
+         if (frontMatter.ContainsKey("title"))
+         {
+             page.Title = frontMatter["title"].ToString();
+         }
+         else
+         {
+             page.Title = "this is a post";
+         }
+
+         // Date from the header when present, otherwise derived from the file itself.
+         if (frontMatter.ContainsKey("date"))
+         {
+             page.Date = DateTime.Parse(frontMatter["date"].ToString());
+         }
+         else
+         {
+             page.Date = file.Datestamp();
+         }
+
+         page.Content = GetContent(file, rawText);
+         page.Filepath = GetPathWithTimestamp(context.OutputFolder, file);
+         page.File = file;
+         page.Bag = frontMatter;
+
+         // A permalink in the post header takes precedence over the site-wide one.
+         if (frontMatter.ContainsKey("permalink"))
+         {
+             page.Url = EvaluatePermalink(frontMatter["permalink"].ToString(), page);
+         }
+         else if (config.ContainsKey("permalink"))
+         {
+             page.Url = EvaluatePermalink(config["permalink"].ToString(), page);
+         }
+
+         if (string.IsNullOrEmpty(page.Url))
+         {
+             Tracing.Info("whaaa");
+         }
+
+         context.Posts.Add(page);
+     }
+     catch (Exception error)
+     {
+         Tracing.Info("Failed to build post from File: " + file);
+         Tracing.Info(error.Message);
+         Tracing.Debug(error.ToString());
+     }
+ }
+
+ /// <summary>
+ /// Renders the markdown of a post (the text after the YAML header) to HTML.
+ /// </summary>
+ /// <param name="file">Path of the post file; only used in trace messages.</param>
+ /// <param name="contents">Full text of the post file, header included.</param>
+ /// <returns>
+ /// The rendered HTML, or - when the transform throws - an error block that shows
+ /// the raw file text so the failure is visible in the generated site.
+ /// </returns>
+ private static string GetContent(string file, string contents)
+ {
+     try
+     {
+         return Markdown.Transform(contents.ExcludeHeader());
+     }
+     catch (Exception e)
+     {
+         Tracing.Info(String.Format("Error ({0}) converting {1}", e.Message, file));
+         Tracing.Debug(e.ToString());
+         // The raw text is shown inside <pre>, so it has to be HTML-encoded:
+         // unescaped '<' or '&' in the source would otherwise be interpreted as
+         // markup and corrupt the page.
+         return String.Format(
+             "<p><b>Error converting markdown</b></p><pre>{0}</pre>",
+             System.Net.WebUtility.HtmlEncode(contents));
+     }
+ }
+
private string SafeReadLine(string file)
{
string postFirstLine;
View
91 src/Pretzel.Tests/Import/BloggerImportTests.cs
@@ -0,0 +1,91 @@
+using System;
+using System.Collections.Generic;
+using Xunit;
+using System.IO.Abstractions.TestingHelpers;
+using Pretzel.Logic.Import;
+using Pretzel.Logic.Extensions;
+
+namespace Pretzel.Tests.Import
+{
+ public class BloggerImportTests
+ {
+ const string BaseSite = @"c:\site\";
+ const string ImportFile = @"c:\import.xml";
+ // test data based on feed from helloworld.blogspot.com
+ const string ImportContent = @"<?xml version='1.0' encoding='UTF-8'?>
+<?xml-stylesheet href=""http://www.blogger.com/styles/atom.css"" type=""text/css""?>
+<feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'>
+ <id>tag:blogger.com,1999:blog-786740</id>
+ <updated>2012-02-06T23:20:56.200+08:00</updated>
+ <title type='text'>Hello, world</title>
+ <subtitle type='html'>Testing blogger</subtitle>
+ <link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://helloworld.blogspot.com/feeds/posts/default'/>
+ <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/786740/posts/default'/>
+ <link rel='alternate' type='text/html' href='http://helloworld.blogspot.com/'/>
+ <author><name>Trevor</name><uri>http://www.blogger.com/profile/11478756950805515790</uri><email>noreply@blogger.com</email>
+ <gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author>
+ <generator version='7.00' uri='http://www.blogger.com'>Blogger</generator>
+ <openSearch:totalResults>2</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex>
+ <openSearch:itemsPerPage>25</openSearch:itemsPerPage>
+ <entry>
+ <id>tag:blogger.com,1999:blog-786740.post-786756</id>
+ <category scheme='http://schemas.google.com/g/2005#kind' term='http://schemas.google.com/blogger/2008/kind#post'/>
+ <published>2000-09-07T13:25:00.000+08:00</published>
+ <updated>2000-09-07T13:25:05.590+08:00</updated>
+ <title type='text'>Hello World 1</title>
+ <content type='html'>hello again&lt;div class=""blogger-post-footer""&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/786740-786756?l=helloworld.blogspot.com' alt='' /&gt;&lt;/div&gt;</content>
+ <link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/786740/posts/default/786756'/>
+ <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/786740/posts/default/786756'/>
+ <link rel='alternate' type='text/html' href='http://helloworld.blogspot.com/2000_09_03_archive.html#786756' title=''/>
+ <author>
+ <name>Trevor</name>
+ <uri>http://www.blogger.com/profile/11478756950805515790</uri>
+ <email>noreply@blogger.com</email>
+ <gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/>
+ </author>
+ </entry>
+ <entry>
+ <id>tag:blogger.com,1999:blog-786740.post-786751</id>
+ <category scheme='http://schemas.google.com/g/2005#kind' term='http://schemas.google.com/blogger/2008/kind#post'/>
+ <published>2000-09-07T13:24:00.000+08:00</published>
+ <updated>2000-09-07T13:24:23.890+08:00</updated>
+ <title type='text'>Hello World 2</title>
+ <content type='html'>hello world&lt;div class=""blogger-post-footer""&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/786740-786751?l=helloworld.blogspot.com' alt='' /&gt;&lt;/div&gt;</content>
+ <link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/786740/posts/default/786751'/>
+ <link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/786740/posts/default/786751'/>
+ <link rel='alternate' type='text/html' href='http://helloworld.blogspot.com/2000_09_03_archive.html#786751' title=''/>
+ <author>
+ <name>Trevor</name>
+ <uri>http://www.blogger.com/profile/11478756950805515790</uri>
+ <email>noreply@blogger.com</email>
+ <gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/>
+ </author>
+ </entry>
+</feed>";
+
+ // Intentionally empty: the tests run against the ImportContent constant.
+ // The commented-out line is a leftover hook for loading a real export file.
+ // NOTE(review): ImportContent is declared const, so it would have to become a
+ // field before the line below could be re-enabled.
+ public BloggerImportTests()
+ {
+ //ImportContent = System.IO.File.ReadAllText(@"path-to-test-data.xml");
+ }
+
+ // Importing the sample feed must create the post file for "Hello World 1"
+ // with a YAML header carrying the original title.
+ [Fact]
+ public void Posts_Are_Imported()
+ {
+     // Arrange: a mock file system that contains only the export file.
+     var files = new Dictionary<string, MockFileData>();
+     files.Add(ImportFile, new MockFileData(ImportContent));
+     var fileSystem = new MockFileSystem(files);
+     var importer = new BloggerImport(fileSystem, BaseSite, ImportFile);
+
+     // Act
+     importer.Import();
+
+     // Assert
+     var postPath = BaseSite + @"_posts\2000-09-07-Hello-World-1.md";
+     Assert.True(fileSystem.File.Exists(postPath));
+
+     var header = fileSystem.File.ReadAllText(postPath).YamlHeader();
+     Assert.Equal("Hello World 1", header["title"].ToString());
+ }
+ }
+}
View
91 src/Pretzel.Tests/Import/HtmlToMarkdownConverterTests.cs
@@ -0,0 +1,91 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Xunit;
+using Pretzel.Logic.Import;
+
+namespace Pretzel.Tests.Import
+{
+ /// <summary>
+ /// Checks the markdown produced by <see cref="HtmlToMarkdownConverter"/> for
+ /// small HTML fragments, one element type per test.
+ /// </summary>
+ public class HtmlToMarkdownConverterTests
+ {
+     // Shorthand for the platform line terminator used in the expected output.
+     private static readonly string NewLine = Environment.NewLine;
+
+     private HtmlToMarkdownConverter converter = new HtmlToMarkdownConverter();
+
+     [Fact]
+     public void Plain_text_is_copied_straight_through()
+     {
+         var result = converter.Convert("hello world");
+
+         Assert.Equal("hello world", result);
+     }
+
+     [Fact]
+     public void H1_headings_are_converted()
+     {
+         var expected = NewLine + "# hello world" + NewLine;
+
+         var result = converter.Convert("<h1>hello world</h1>");
+
+         Assert.Equal(expected, result);
+     }
+
+     [Fact]
+     public void H2_headings_are_converted()
+     {
+         var expected = NewLine + "## heading 2" + NewLine;
+
+         var result = converter.Convert("<h2>heading 2</h2>");
+
+         Assert.Equal(expected, result);
+     }
+
+     [Fact]
+     public void Paragraphs_have_a_new_line_before_and_after()
+     {
+         var expected = NewLine + "Paragraph 1" + NewLine;
+
+         var result = converter.Convert("<p>Paragraph 1</p>");
+
+         Assert.Equal(expected, result);
+     }
+
+     [Fact]
+     public void Links_are_converted()
+     {
+         var result = converter.Convert("<a href=\"http://foo.com/bar\">Link text</a>");
+
+         Assert.Equal("[Link text](http://foo.com/bar)", result);
+     }
+
+     [Fact]
+     public void Images_are_left_unconverted()
+     {
+         const string html = "<img src=\"http://foo.com/bar\">";
+
+         var result = converter.Convert(html);
+
+         Assert.Equal("<img src=\"http://foo.com/bar\">", result);
+     }
+
+     [Fact]
+     public void Code_blocks_are_converted()
+     {
+         var expected = NewLine + " hello";
+
+         var result = converter.Convert("<pre>hello</pre>");
+
+         Assert.Equal(expected, result);
+     }
+
+     [Fact]
+     public void Unordered_lists_are_converted()
+     {
+         var expected = NewLine + NewLine + "* first" + NewLine + "* second" + NewLine;
+
+         var result = converter.Convert("<ul><li>first</li><li>second</li></ul>");
+
+         Assert.Equal(expected, result);
+     }
+
+     [Fact]
+     public void Unordered_lists_can_be_nested()
+     {
+         var expectedFragment = NewLine + " * second nested" + NewLine;
+
+         var result = converter.Convert("<ul><li>first</li><li>second</li><ul><li>second nested</li></ul></ul>");
+
+         Assert.Contains(expectedFragment, result);
+     }
+
+     [Fact]
+     public void Unordered_lists_can_be_nested_inside_li()
+     {
+         var expectedFragment = NewLine + " * second nested" + NewLine;
+
+         var result = converter.Convert("<ul><li>first</li><li>second<li>second nested</li></li></ul>");
+
+         Assert.Contains(expectedFragment, result);
+     }
+
+     [Fact]
+     public void Code_blocks_with_new_lines_are_converted()
+     {
+         var html = "<pre>hello" + NewLine + "world</pre>";
+         var expected = NewLine + " hello" + NewLine + " world";
+
+         var result = converter.Convert(html);
+
+         Assert.Equal(expected, result);
+     }
+ }
+}
View
2  src/Pretzel.Tests/Pretzel.Tests.csproj
@@ -55,6 +55,8 @@
<ItemGroup>
<Compile Include="CommandParameterOutputTests.cs" />
<Compile Include="CommandParameterTests.cs" />
+ <Compile Include="Import\BloggerImportTests.cs" />
+ <Compile Include="Import\HtmlToMarkdownConverterTests.cs" />
<Compile Include="Import\WordpressImportTests.cs" />
<Compile Include="Minification\LessTransformTests.cs" />
<Compile Include="Minification\JsMinificationTests.cs" />
View
7 src/Pretzel/Commands/ImportCommand.cs
@@ -14,7 +14,7 @@ namespace Pretzel.Commands
[CommandInfo(CommandName = "import")]
class ImportCommand : ICommand
{
- readonly static List<string> Importers = new List<string>(new[] { "wordpress" });
+ readonly static List<string> Importers = new List<string>(new[] { "wordpress", "blogger" });
#pragma warning disable 649
[Import] IFileSystem fileSystem;
@@ -37,6 +37,11 @@ public void Execute(IEnumerable<string> arguments)
{
var wordpressImporter = new WordpressImport(fileSystem, parameters.Path, parameters.ImportPath);
wordpressImporter.Import();
+ }
+ else if (string.Equals("blogger", parameters.ImportType, StringComparison.InvariantCultureIgnoreCase))
+ {
+ var bloggerImporter = new BloggerImport(fileSystem, parameters.Path, parameters.ImportPath);
+ bloggerImporter.Import();
}
Tracing.Info("Import complete");
Please sign in to comment.
Something went wrong with that request. Please try again.