Enable the web parser to return the response body directly as plain text when the Content-Type header is text/plain or text/markdown

huggingface · Feb 10, 2024 · 59066f2 · 59066f2
1 parent a200884
commit 59066f2
Showing 1 changed file with 28 additions and 21 deletions.
diff --git a/src/lib/server/websearch/parseWeb.ts b/src/lib/server/websearch/parseWeb.ts
@@ -3,30 +3,37 @@ import { JSDOM, VirtualConsole } from "jsdom";
 export async function parseWeb(url: string) {
 	const abortController = new AbortController();
 	setTimeout(() => abortController.abort(), 10000);
-	const htmlString = await fetch(url, { signal: abortController.signal })
-		.then((response) => response.text())
-		.catch();
+	const r = await fetch(url, { signal: abortController.signal }).catch();
 
-	const virtualConsole = new VirtualConsole();
-	virtualConsole.on("error", () => {
-		// No-op to skip console errors.
-	});
+	if (r.headers.get("content-type")?.includes("text/html")) {
+		const virtualConsole = new VirtualConsole();
+		virtualConsole.on("error", () => {
+			// No-op to skip console errors.
+		});
 
-	// put the html string into a DOM
-	const dom = new JSDOM(htmlString ?? "", {
-		virtualConsole,
-	});
+		// put the html string into a DOM
+		const dom = new JSDOM((await r.text()) ?? "", {
+			virtualConsole,
+		});
 
-	const { document } = dom.window;
-	const textElTags = "p";
-	const paragraphs = document.querySelectorAll(textElTags);
-	if (!paragraphs.length) {
-		throw new Error(`webpage doesn't have any "${textElTags}" element`);
-	}
-	const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
+		const { document } = dom.window;
+		const textElTags = "p";
+		const paragraphs = document.querySelectorAll(textElTags);
+		if (!paragraphs.length) {
+			throw new Error(`webpage doesn't have any "${textElTags}" element`);
+		}
+		const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
 
-	// combine text contents from paragraphs and then remove newlines and multiple spaces
-	const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
+		// combine text contents from paragraphs and then remove newlines and multiple spaces
+		const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
 
-	return text;
+		return text;
+	} else if (
+		r.headers.get("content-type")?.includes("text/plain") ||
+		r.headers.get("content-type")?.includes("text/markdown")
+	) {
+		return r.text();
+	} else {
+		throw new Error("Unsupported content type");
+	}
 }