Skip to content

Commit

Permalink
enable web parser to return plaintext directly for matching headers
Browse files Browse the repository at this point in the history
  • Loading branch information
nsarrazin committed Feb 10, 2024
1 parent a200884 commit 59066f2
Showing 1 changed file with 28 additions and 21 deletions.
49 changes: 28 additions & 21 deletions src/lib/server/websearch/parseWeb.ts
Expand Up @@ -3,30 +3,37 @@
import { JSDOM, VirtualConsole } from "jsdom";
/**
 * Fetches `url` and returns its textual content.
 *
 * - `text/html` responses are parsed with JSDOM; the text of every `<p>`
 *   element is joined with single spaces and then newlines and double spaces
 *   are stripped.
 * - `text/plain` and `text/markdown` responses are returned verbatim.
 *
 * The request (including reading the body) is aborted after 10 seconds.
 *
 * @param url - Address of the resource to download.
 * @returns The extracted or raw text of the response body.
 * @throws Error if the HTML document contains no `<p>` element.
 * @throws Error if the content type is missing or is not one of the
 *   supported types listed above.
 * @throws Whatever `fetch` rejects with (network failure, abort on timeout).
 */
export async function parseWeb(url: string): Promise<string> {
  const abortController = new AbortController();
  const timeout = setTimeout(() => abortController.abort(), 10000);

  try {
    // Failures are deliberately not caught here: the caller decides how to
    // handle a page that could not be downloaded.
    const r = await fetch(url, { signal: abortController.signal });

    // A missing header is treated like an unsupported type.
    const contentType = r.headers.get("content-type") ?? "";

    if (contentType.includes("text/html")) {
      const virtualConsole = new VirtualConsole();
      virtualConsole.on("error", () => {
        // No-op to skip console errors.
      });

      // put the html string into a DOM
      const dom = new JSDOM(await r.text(), {
        virtualConsole,
      });

      const { document } = dom.window;
      const textElTags = "p";
      const paragraphs = document.querySelectorAll(textElTags);
      if (!paragraphs.length) {
        throw new Error(`webpage doesn't have any "${textElTags}" element`);
      }
      const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);

      // combine text contents from paragraphs and then remove newlines and multiple spaces
      // (note: a run of two spaces is removed entirely, not collapsed to one)
      return paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
    } else if (contentType.includes("text/plain") || contentType.includes("text/markdown")) {
      // Await inside the try so the timer stays armed while the body streams.
      return await r.text();
    } else {
      throw new Error("Unsupported content type");
    }
  } finally {
    // Always release the timer so it cannot keep the event loop alive or fire
    // an abort after the work is finished.
    clearTimeout(timeout);
  }
}

0 comments on commit 59066f2

Please sign in to comment.