Skip to content

Commit

Permalink
fix: failing html parsing
Browse files (browse the repository at this point in the history)
  • Loading branch information
edenstrom committed Oct 6, 2021
1 parent a867b11 commit 2a2259a
Showing 1 changed file with 26 additions and 23 deletions.
49 changes: 26 additions & 23 deletions libs/api-skolplattformen/lib/parseHtml.ts
@@ -1,7 +1,7 @@
import * as h2m from 'h2m'
import { htmlDecode } from 'js-htmlencode'
import h2m from 'h2m'
import { decode } from 'he'
import { parse, HTMLElement, TextNode } from 'node-html-parser'
import { htmlDecode } from 'js-htmlencode'
import { HTMLElement, parse, TextNode } from 'node-html-parser'

const noChildren = ['strong', 'b', 'em', 'i', 'u', 's']
const trimNodes = [...noChildren, 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'a']
Expand Down Expand Up @@ -40,45 +40,48 @@ const deepClean = (node: HTMLElement): HTMLElement => {
return cleaned
}

const rearrangeWhitespace = (html: string = ''): string => {
const rearrangeWhitespace = (html = ''): string => {
let content = html
.replace(/<span[^>]*>/gm, '')
.split('</span>').join('')
.replace(/<div[^>]*>/gm, '')
.split('</div>').join('')
.split('&#160;').join('&amp;nbsp;')

.replace(/<span[^>]*>/gm, '')
.split('</span>')
.join('')
.replace(/<div[^>]*>/gm, '')
.split('</div>')
.join('')
.split('&#160;')
.join('&amp;nbsp;')

// FIXME: Make a loop that doesn't break linting
trimNodes.forEach((trimNode) => {
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`<${trimNode}>&amp;nbsp;`).join(` <${trimNode}>`)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
})

trimNodes.forEach((trimNode) => {
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`<${trimNode}>&amp;nbsp;`).join(` <${trimNode}>`)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
})
trimNodes.forEach((trimNode) => {
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`<${trimNode}>&amp;nbsp;`).join(` <${trimNode}>`)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
})
trimNodes.forEach((trimNode) => {
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`<${trimNode}>&amp;nbsp;`).join(` <${trimNode}>`)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
content = content.split(`&amp;nbsp;</${trimNode}>`).join(`</${trimNode}> `)
})

return content
}

export const clean = (html: string = ''): string =>
export const clean = (html = ''): string =>
deepClean(parse(decode(html))).outerHTML

interface Node {
Expand All @@ -93,15 +96,15 @@ const overides = {
img: (node: Node) => `![${node.attrs.title || ''}](${node.attrs.src})`,
i: (node: Node) => `*${node.md}*`,
b: (node: Node) => `**${node.md}**`,
'h1': (node: Node) => `# ${node.md}\n`,
'h2': (node: Node) => `## ${node.md}\n`,
'h3': (node: Node) => `### ${node.md}\n`,
'h4': (node: Node) => `#### ${node.md}\n`,
'h5': (node: Node) => `##### ${node.md}\n`,
'h6': (node: Node) => `###### ${node.md}\n`,
h1: (node: Node) => `# ${node.md}\n`,
h2: (node: Node) => `## ${node.md}\n`,
h3: (node: Node) => `### ${node.md}\n`,
h4: (node: Node) => `#### ${node.md}\n`,
h5: (node: Node) => `##### ${node.md}\n`,
h6: (node: Node) => `###### ${node.md}\n`,
}

export const toMarkdown = (html: string): string => {
export const toMarkdown = (html?: string): string => {
const rearranged = rearrangeWhitespace(html)
const trimmed = clean(rearranged)
const markdown = h2m(trimmed, { overides, converter })
Expand Down

0 comments on commit 2a2259a

Please sign in to comment.