From 558efc08a720ca1744c1e9b46d3e08bf9db2c087 Mon Sep 17 00:00:00 2001 From: Josh Terrill Date: Sun, 7 Aug 2022 22:59:09 -0700 Subject: [PATCH] added support for bloomberg via search API, added autofocus to article page, added additional article text sanitization, reformatted mapping file --- .env.example | 4 ++- README.md | 16 +++++++-- index.js | 4 +-- news-source-map.json | 19 ++++++----- package.json | 2 +- parser.js | 72 ++++++++++++++++++++++++++++++---------- views/article.handlebars | 2 +- views/home.handlebars | 42 ++++++++++++----------- views/read.handlebars | 2 +- 9 files changed, 108 insertions(+), 55 deletions(-) diff --git a/.env.example b/.env.example index 5350574..c1b3260 100644 --- a/.env.example +++ b/.env.example @@ -1 +1,3 @@ -PORT=3030 \ No newline at end of file +PORT=3030 +GOOGLE_SEARCH_ID=xxxxxxxxxx +GOOGLE_SEARCH_KEY=yyyyyyyyyy \ No newline at end of file diff --git a/README.md b/README.md index 5c67081..284c4d9 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,19 @@ URL: https://reader.dangerous.dev * LA Times (latimes.com) * The Athletic (theathletic.com) * Business Insider (businessinsider.com) +* Bloomberg (bloomberg.com) * Vogue (vogue.com) -### Unsupported Sites -* Wall Street Journal (wsj.com) +### Prerequisites + +1. Register a custom Google search engine by going to https://developers.google.com/custom-search/v1/introduction and pressing "Get A Key" +2. As per their site: + +>Once it is created, you can find the engine's ID in the Setup > Basics > Search Engine ID section of the Control Panel + +This is where you will configure what sites your search will search in. For the purposes of this app in its current state, we will just use bloomberg.com + +Take note of your API key and Search Engine ID. 
### Installation @@ -24,6 +33,7 @@ URL: https://reader.dangerous.dev git clone https://github.com/joshterrill/paywall-reader cd paywall-reader/ cp .env.example .env +# replace GOOGLE_SEARCH_KEY and GOOGLE_SEARCH_ID with values from prerequisites section npm i npm start ``` @@ -37,4 +47,4 @@ Pull requests would gladly be accepted for adding support for more sites (as lon ### Todo * Fix relative and absolute links in embedded html to point to domain they should be coming from * Add dom sanitization for incoming HTML -* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file \ No newline at end of file +* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file diff --git a/index.js b/index.js index a8c648f..078f450 100644 --- a/index.js +++ b/index.js @@ -34,8 +34,8 @@ app.get('/read', async (req, res) => { if (!source || !url) { throw new Error('Source or URL not provided'); } - const direct = newsSourceMapping[source].direct; - const { articleText, articleHeadline } = await parse.getContent(source, url, direct); + const sourceMapping = newsSourceMapping[source]; + const { articleText, articleHeadline } = await parse.getContent(source, url, sourceMapping.method); res.render('read', {source, sourceText: newsSourceMapping[source].name, articleText, articleHeadline}); } catch (error) { console.log(error); diff --git a/news-source-map.json b/news-source-map.json index 1c0bf02..3a9cb78 100644 --- a/news-source-map.json +++ b/news-source-map.json @@ -1,11 +1,12 @@ { - "nyt": {"name": "New York Times", "url": "nytimes.com"}, - "nytcooking": {"name": "New York Times Cooking", "url": "cooking.nytimes.com"}, - "newyorker": {"name": "The New Yorker", "url": "newyorker.com"}, - "economist": {"name": "The Economist", "url": "economist.com"}, - "washingtonpost": {"name": "Washington Post", "url": "washingtonpost.com"}, - "latimes": {"name": "LA Times", "url": 
"latimes.com", "direct": true}, - "theathletic": {"name": "The Athletic", "url": "theathletic.com", "direct": true}, - "businessinsider": {"name": "Business Insider", "url": "businessinsider.com"}, - "vogue": {"name": "Vogue", "url": "vogue.com"} + "nytimes.com": {"name": "New York Times", "method": "ARCHIVE"}, + "cooking.nytimes.com": {"name": "New York Times Cooking", "method": "ARCHIVE"}, + "newyorker.com": {"name": "The New Yorker", "method": "ARCHIVE"}, + "economist.com": {"name": "The Economist", "method": "ARCHIVE"}, + "washingtonpost.com": {"name": "Washington Post", "method": "ARCHIVE"}, + "latimes.com": {"name": "LA Times", "method": "DIRECT"}, + "theathletic.com": {"name": "The Athletic", "method": "DIRECT"}, + "businessinsider.com": {"name": "Business Insider", "method": "ARCHIVE"}, + "bloomberg.com": {"name": "Bloomberg", "method": "GOOGLE"}, + "vogue.com": {"name": "Vogue", "method": "ARCHIVE"} } \ No newline at end of file diff --git a/package.json b/package.json index 2630f5f..4946d92 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "paywall-reader", "description": "A web app that lets you read articles on popular news sites that get hidden behind paywalls.", - "version": "1.0.0", + "version": "1.1.2", "main": "index.js", "scripts": { "start": "node index.js", diff --git a/parser.js b/parser.js index 4beed3c..61cc886 100644 --- a/parser.js +++ b/parser.js @@ -6,7 +6,7 @@ function sanitizeUrl(url) { return url.split('?')[0]; } -async function checkUrl(url) { +async function checkUrlArchive(url) { url = sanitizeUrl(url); const data = await fetch(`https://archive.org/wayback/available?url=${url}`); if (!data) { @@ -19,6 +19,19 @@ async function checkUrl(url) { return json.archived_snapshots.closest.url; } +async function checkUrlGoogle(url, site) { + url = sanitizeUrl(url); + const searchTerm = url.split('/')[url.split('/').length - 1]; + const res = await 
fetch(`https://content-customsearch.googleapis.com/customsearch/v1?cx=${process.env.GOOGLE_SEARCH_ID}&key=${process.env.GOOGLE_SEARCH_KEY}&q=${searchTerm}`) + const json = await res.json(); + if (!json?.items?.length) { + throw new Error('Unable to get result from search engine'); + } + const { cacheId } = json.items[0]; + const webCacheUrl = `http://webcache.googleusercontent.com/search?q=cache:${cacheId}:${site}`; + return webCacheUrl; +} + async function nyt(url) { const rawHtml = await fetch(url); const html = await rawHtml.text(); @@ -41,10 +54,10 @@ async function newyorker(url) { const rawHtml = await fetch(url); const html = await rawHtml.text(); const $ = cheerio.load(html); - const scriptTag = $('script[type="application/ld+json"]').text().split(',\'keywords\':'); - const badJsonFixer = JSON.parse(`${scriptTag[0]}}`); // wtf - const articleText = marked.parse(badJsonFixer.articleBody); - const articleHeadline = badJsonFixer.headline; + const scriptTag = $('script[type="application/ld+json"]').first().text().split(',\'keywords\':'); + const json = JSON.parse(scriptTag[0]);; + const articleText = marked.parse(json.articleBody); + const articleHeadline = json.headline; return { articleText, articleHeadline }; } @@ -118,11 +131,23 @@ async function businessInsider(url) { const rootUrl = Object.keys(imageJson)[0]; image.parent().html(``); }); + $('.inline-newsletter-signup').remove(); const articleHtml = $('.content-lock-content').html(); const articleText = articleHtml; return { articleText, articleHeadline }; } +async function bloomberg(url) { + const rawHtml = await fetch(url); + const html = await rawHtml.text(); + const $ = cheerio.load(html); + const scriptTag = $('script[data-component-props="ArticleBody"]').text(); + const json = JSON.parse(scriptTag); + const articleText = json.story.body.replace(/60x-1/g, '1200x-1'); // replace low res images with higher res + const articleHeadline = json.story.seoHeadline; + return { articleText, articleHeadline }; 
+} + async function vogue(url) { const rawHtml = await fetch(url); const html = await rawHtml.text(); @@ -138,59 +163,70 @@ async function vogue(url) { return { articleText, articleHeadline }; } -async function getContent(source, url, direct) { +async function getContent(source, url, method) { + console.log(source, url, method); let articleText = null; let articleHeadline = null; - if (!direct) { - url = await checkUrl(url); + if (method === 'ARCHIVE') { + url = await checkUrlArchive(url); + } else if (method === 'GOOGLE') { + url = await checkUrlGoogle(url, source); } + console.log(url); switch(source) { - case 'nyt': + case 'nytimes.com': const nytRes = await nyt(url); articleText = nytRes.articleText; articleHeadline = nytRes.articleHeadline; break; - case 'nytcooking': + case 'cooking.nytimes.com': const nytCookingRes = await nytCooking(url); articleText = nytCookingRes.articleText; articleHeadline = nytCookingRes.articleHeadline; break; - case 'newyorker': + case 'newyorker.com': const newyorkerRes = await newyorker(url); articleText = newyorkerRes.articleText; articleHeadline = newyorkerRes.articleHeadline; break; - case 'economist': + case 'economist.com': const economistRes = await economist(url); articleText = economistRes.articleText; articleHeadline = economistRes.articleHeadline; break; - case 'washingtonpost': + case 'washingtonpost.com': const washingtonPostRes = await washingtonPost(url); articleText = washingtonPostRes.articleText; articleHeadline = washingtonPostRes.articleHeadline; break; - case 'latimes': + case 'latimes.com': const laTimesRes = await latimes(url); articleText = laTimesRes.articleText; articleHeadline = laTimesRes.articleHeadline; break; - case 'theathletic': + case 'theathletic.com': const theAthleticRes = await theAthletic(url); articleText = theAthleticRes.articleText; articleHeadline = theAthleticRes.articleHeadline; break; - case 'businessinsider': + case 'businessinsider.com': const businessInsiderRes = await 
businessInsider(url); articleText = businessInsiderRes.articleText; articleHeadline = businessInsiderRes.articleHeadline; break; - case 'vogue': + case 'bloomberg.com': + const bloombergRes = await bloomberg(url); + articleText = bloombergRes.articleText; + articleHeadline = bloombergRes.articleHeadline; + break; + case 'vogue.com': const vogueRes = await vogue(url); articleText = vogueRes.articleText; articleHeadline = vogueRes.articleHeadline; break; - + default: + articleText = 'No article found'; + articleHeadline = '404'; } return {articleText, articleHeadline}; } diff --git a/views/article.handlebars b/views/article.handlebars index 67eb4de..990de01 100644 --- a/views/article.handlebars +++ b/views/article.handlebars @@ -2,6 +2,6 @@
- +
\ No newline at end of file diff --git a/views/home.handlebars b/views/home.handlebars index 746d7b4..3e02d1a 100644 --- a/views/home.handlebars +++ b/views/home.handlebars @@ -1,18 +1,8 @@ 

A web app that lets you read articles on popular news sites that get hidden behind paywalls.

- {{!--
-
-
-
Card title
-

This is another card with title and supporting text below. This card has some - additional content to make it slightly taller overall.

-

Last updated 3 mins ago

-
-
-
--}}