diff --git a/.env.example b/.env.example
index 5350574..c1b3260 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,3 @@
-PORT=3030
\ No newline at end of file
+PORT=3030
+GOOGLE_SEARCH_ID=xxxxxxxxxx
+GOOGLE_SEARCH_KEY=yyyyyyyyyy
\ No newline at end of file
diff --git a/README.md b/README.md
index 5c67081..284c4d9 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,19 @@ URL: https://reader.dangerous.dev
* LA Times (latimes.com)
* The Athletic (theathletic.com)
* Business Insider (businessinsider.com)
+* Bloomberg (bloomberg.com)
* Vogue (vogue.com)
-### Unsupported Sites
-* Wall Street Journal (wsj.com)
+### Prerequisites
+
+1. Register a custom Google search engine by going to https://developers.google.com/custom-search/v1/introduction and pressing "Get A Key"
+2. As per their site:
+
+>Once it is created, you can find the engine's ID in the Setup > Basics > Search Engine ID section of the Control Panel
+
+This is where you will configure what sites your search will search in. For the purposes of this app in its current state, we will just use bloomberg.com
+
+Take note of your API key and Search Engine ID.
### Installation
@@ -24,6 +33,7 @@ URL: https://reader.dangerous.dev
git clone https://github.com/joshterrill/paywall-reader
cd paywall-reader/
cp .env.example .env
+# replace GOOGLE_SEARCH_KEY and GOOGLE_SEARCH_ID with values from prerequisites section
npm i
npm start
```
@@ -37,4 +47,4 @@ Pull requests would gladly be accepted for adding support for more sites (as lon
### Todo
* Fix relative and absolute links in embedded html to point to domain they should be coming from
* Add dom sanitization for incoming HTML
-* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file
\ No newline at end of file
+* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file
diff --git a/index.js b/index.js
index a8c648f..078f450 100644
--- a/index.js
+++ b/index.js
@@ -34,8 +34,8 @@ app.get('/read', async (req, res) => {
if (!source || !url) {
throw new Error('Source or URL not provided');
}
- const direct = newsSourceMapping[source].direct;
- const { articleText, articleHeadline } = await parse.getContent(source, url, direct);
+ const sourceMapping = newsSourceMapping[source];
+ const { articleText, articleHeadline } = await parse.getContent(source, url, sourceMapping.method);
res.render('read', {source, sourceText: newsSourceMapping[source].name, articleText, articleHeadline});
} catch (error) {
console.log(error);
diff --git a/news-source-map.json b/news-source-map.json
index 1c0bf02..3a9cb78 100644
--- a/news-source-map.json
+++ b/news-source-map.json
@@ -1,11 +1,12 @@
{
- "nyt": {"name": "New York Times", "url": "nytimes.com"},
- "nytcooking": {"name": "New York Times Cooking", "url": "cooking.nytimes.com"},
- "newyorker": {"name": "The New Yorker", "url": "newyorker.com"},
- "economist": {"name": "The Economist", "url": "economist.com"},
- "washingtonpost": {"name": "Washington Post", "url": "washingtonpost.com"},
- "latimes": {"name": "LA Times", "url": "latimes.com", "direct": true},
- "theathletic": {"name": "The Athletic", "url": "theathletic.com", "direct": true},
- "businessinsider": {"name": "Business Insider", "url": "businessinsider.com"},
- "vogue": {"name": "Vogue", "url": "vogue.com"}
+ "nytimes.com": {"name": "New York Times", "method": "ARCHIVE"},
+ "cooking.nytimes.com": {"name": "New York Times Cooking", "method": "ARCHIVE"},
+ "newyorker.com": {"name": "The New Yorker", "method": "ARCHIVE"},
+ "economist.com": {"name": "The Economist", "method": "ARCHIVE"},
+ "washingtonpost.com": {"name": "Washington Post", "method": "ARCHIVE"},
+ "latimes.com": {"name": "LA Times", "method": "DIRECT"},
+ "theathletic.com": {"name": "The Athletic", "method": "DIRECT"},
+ "businessinsider.com": {"name": "Business Insider", "method": "ARCHIVE"},
+ "bloomberg.com": {"name": "Bloomberg", "method": "GOOGLE"},
+ "vogue.com": {"name": "Vogue", "method": "ARCHIVE"}
}
\ No newline at end of file
diff --git a/package.json b/package.json
index 2630f5f..4946d92 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
{
"name": "paywall-reader",
"description": "A web app that lets you read articles on popular news sites that get hidden behind paywalls.",
- "version": "1.0.0",
+ "version": "1.1.2",
"main": "index.js",
"scripts": {
"start": "node index.js",
diff --git a/parser.js b/parser.js
index 4beed3c..61cc886 100644
--- a/parser.js
+++ b/parser.js
@@ -6,7 +6,7 @@ function sanitizeUrl(url) {
return url.split('?')[0];
}
-async function checkUrl(url) {
+async function checkUrlArchive(url) {
url = sanitizeUrl(url);
const data = await fetch(`https://archive.org/wayback/available?url=${url}`);
if (!data) {
@@ -19,6 +19,19 @@ async function checkUrl(url) {
return json.archived_snapshots.closest.url;
}
+async function checkUrlGoogle(url, site) {
+ url = sanitizeUrl(url);
+ const searchTerm = url.split('/')[url.split('/').length - 1];
+ const res = await fetch(`https://content-customsearch.googleapis.com/customsearch/v1?cx=${process.env.GOOGLE_SEARCH_ID}&key=${process.env.GOOGLE_SEARCH_KEY}&q=${searchTerm}`)
+ const json = await res.json();
+ if (!json?.items?.length) {
+ throw new Error('Unable to get result from search engine');
+ }
+ const { cacheId } = json.items[0];
+ const webCacheUrl = `http://webcache.googleusercontent.com/search?q=cache:${cacheId}:${site}`;
+ return webCacheUrl;
+}
+
async function nyt(url) {
const rawHtml = await fetch(url);
const html = await rawHtml.text();
@@ -41,10 +54,10 @@ async function newyorker(url) {
const rawHtml = await fetch(url);
const html = await rawHtml.text();
const $ = cheerio.load(html);
- const scriptTag = $('script[type="application/ld+json"]').text().split(',\'keywords\':');
- const badJsonFixer = JSON.parse(`${scriptTag[0]}}`); // wtf
- const articleText = marked.parse(badJsonFixer.articleBody);
- const articleHeadline = badJsonFixer.headline;
+ const scriptTag = $('script[type="application/ld+json"]').first().text().split(',\'keywords\':');
+    const json = JSON.parse(scriptTag[0]);
+ const articleText = marked.parse(json.articleBody);
+ const articleHeadline = json.headline;
return { articleText, articleHeadline };
}
@@ -118,11 +131,23 @@ async function businessInsider(url) {
const rootUrl = Object.keys(imageJson)[0];
image.parent().html(``);
});
+ $('.inline-newsletter-signup').remove();
const articleHtml = $('.content-lock-content').html();
const articleText = articleHtml;
return { articleText, articleHeadline };
}
+async function bloomberg(url) {
+ const rawHtml = await fetch(url);
+ const html = await rawHtml.text();
+ const $ = cheerio.load(html);
+ const scriptTag = $('script[data-component-props="ArticleBody"]').text();
+ const json = JSON.parse(scriptTag);
+ const articleText = json.story.body.replace(/60x-1/g, '1200x-1'); // replace low res images with higher res
+ const articleHeadline = json.story.seoHeadline;
+ return { articleText, articleHeadline };
+}
+
async function vogue(url) {
const rawHtml = await fetch(url);
const html = await rawHtml.text();
@@ -138,59 +163,70 @@ async function vogue(url) {
return { articleText, articleHeadline };
}
-async function getContent(source, url, direct) {
+async function getContent(source, url, method) {
+ console.log(source, url, method);
let articleText = null;
let articleHeadline = null;
- if (!direct) {
- url = await checkUrl(url);
+ if (method === 'ARCHIVE') {
+ url = await checkUrlArchive(url);
+ } else if (method === 'GOOGLE') {
+ url = await checkUrlGoogle(url, source);
}
+ console.log(url);
switch(source) {
- case 'nyt':
+ case 'nytimes.com':
const nytRes = await nyt(url);
articleText = nytRes.articleText;
articleHeadline = nytRes.articleHeadline;
break;
- case 'nytcooking':
+ case 'cooking.nytimes.com':
const nytCookingRes = await nytCooking(url);
articleText = nytCookingRes.articleText;
articleHeadline = nytCookingRes.articleHeadline;
break;
- case 'newyorker':
+ case 'newyorker.com':
const newyorkerRes = await newyorker(url);
articleText = newyorkerRes.articleText;
articleHeadline = newyorkerRes.articleHeadline;
break;
- case 'economist':
+ case 'economist.com':
const economistRes = await economist(url);
articleText = economistRes.articleText;
articleHeadline = economistRes.articleHeadline;
break;
- case 'washingtonpost':
+ case 'washingtonpost.com':
const washingtonPostRes = await washingtonPost(url);
articleText = washingtonPostRes.articleText;
articleHeadline = washingtonPostRes.articleHeadline;
break;
- case 'latimes':
+ case 'latimes.com':
const laTimesRes = await latimes(url);
articleText = laTimesRes.articleText;
articleHeadline = laTimesRes.articleHeadline;
break;
- case 'theathletic':
+ case 'theathletic.com':
const theAthleticRes = await theAthletic(url);
articleText = theAthleticRes.articleText;
articleHeadline = theAthleticRes.articleHeadline;
break;
- case 'businessinsider':
+ case 'businessinsider.com':
const businessInsiderRes = await businessInsider(url);
articleText = businessInsiderRes.articleText;
articleHeadline = businessInsiderRes.articleHeadline;
break;
- case 'vogue':
+ case 'bloomberg.com':
+ const bloombergRes = await bloomberg(url);
+ articleText = bloombergRes.articleText;
+ articleHeadline = bloombergRes.articleHeadline;
+ break;
+ case 'vogue.com':
const vogueRes = await vogue(url);
articleText = vogueRes.articleText;
articleHeadline = vogueRes.articleHeadline;
break;
-
+ default:
+ articleText = 'No article found';
+ articleHeadline = '404';
}
return {articleText, articleHeadline};
}
diff --git a/views/article.handlebars b/views/article.handlebars
index 67eb4de..990de01 100644
--- a/views/article.handlebars
+++ b/views/article.handlebars
@@ -2,6 +2,6 @@
A web app that lets you read articles on popular news sites that get hidden behind paywalls.
This is another card with title and supporting text below. This card has some - additional content to make it slightly taller overall.
-Last updated 3 mins ago
-