From 558efc08a720ca1744c1e9b46d3e08bf9db2c087 Mon Sep 17 00:00:00 2001
From: Josh Terrill <joshterrill.dev@gmail.com>
Date: Sun, 7 Aug 2022 22:59:09 -0700
Subject: [PATCH] added support for bloomberg via search API, added autofocus
 to article page, added additional article text sanitization, reformatted
 mapping file

---
 .env.example             |  4 ++-
 README.md                | 16 +++++++--
 index.js                 |  4 +--
 news-source-map.json     | 19 ++++++-----
 package.json             |  2 +-
 parser.js                | 72 ++++++++++++++++++++++++++++++----------
 views/article.handlebars |  2 +-
 views/home.handlebars    | 42 ++++++++++++-----------
 views/read.handlebars    |  2 +-
 9 files changed, 108 insertions(+), 55 deletions(-)

diff --git a/.env.example b/.env.example
index 5350574..c1b3260 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,3 @@
-﻿PORT=3030
\ No newline at end of file
+﻿PORT=3030
+GOOGLE_SEARCH_ID=xxxxxxxxxx
+GOOGLE_SEARCH_KEY=yyyyyyyyyy
\ No newline at end of file
diff --git a/README.md b/README.md
index 5c67081..284c4d9 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,19 @@ URL: https://reader.dangerous.dev
 * LA Times (latimes.com)
 * The Athletic (theathletic.com)
 * Business Insider (businessinsider.com)
+* Bloomberg (bloomberg.com)
 * Vogue (vogue.com)
 
-### Unsupported Sites
-* Wall Street Journal (wsj.com)
+### Prerequisites
+
+1. Register a custom Google search engine by going to https://developers.google.com/custom-search/v1/introduction and pressing "Get A Key"
+2. As per their site:
+
+>Once it is created, you can find the engine's ID in the Setup > Basics > Search Engine ID section of the Control Panel
+
+This is where you configure which sites your custom search engine will search. For the purposes of this app in its current state, we only need bloomberg.com
+
+Take note of your API key and Search Engine ID.
 
 ### Installation
 
@@ -24,6 +33,7 @@ URL: https://reader.dangerous.dev
 git clone https://github.com/joshterrill/paywall-reader
 cd paywall-reader/
 cp .env.example .env
+# replace GOOGLE_SEARCH_KEY and GOOGLE_SEARCH_ID in .env with the values from the Prerequisites section
 npm i
 npm start
 ```
@@ -37,4 +47,4 @@ Pull requests would gladly be accepted for adding support for more sites (as lon
 ### Todo
 * Fix relative and absolute links in embedded html to point to domain they should be coming from
 * Add dom sanitization for incoming HTML
-* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file
\ No newline at end of file
+* Add checks for source and URL fields on requests to ensure the URL matches the URL in the `news-source-map.json` file
diff --git a/index.js b/index.js
index a8c648f..078f450 100644
--- a/index.js
+++ b/index.js
@@ -34,8 +34,8 @@ app.get('/read', async (req, res) => {
         if (!source || !url) {
             throw new Error('Source or URL not provided');
         }
-        const direct = newsSourceMapping[source].direct;
-        const { articleText, articleHeadline } = await parse.getContent(source, url, direct);
+        const sourceMapping = newsSourceMapping[source];
+        const { articleText, articleHeadline } = await parse.getContent(source, url, sourceMapping.method);
         res.render('read', {source, sourceText: newsSourceMapping[source].name, articleText, articleHeadline});
     } catch (error) {
         console.log(error);
diff --git a/news-source-map.json b/news-source-map.json
index 1c0bf02..3a9cb78 100644
--- a/news-source-map.json
+++ b/news-source-map.json
@@ -1,11 +1,12 @@
 ﻿{
-    "nyt": {"name": "New York Times", "url": "nytimes.com"},
-    "nytcooking": {"name": "New York Times Cooking", "url": "cooking.nytimes.com"},
-    "newyorker": {"name": "The New Yorker", "url": "newyorker.com"},
-    "economist": {"name": "The Economist", "url": "economist.com"},
-    "washingtonpost": {"name": "Washington Post", "url": "washingtonpost.com"},
-    "latimes": {"name": "LA Times", "url": "latimes.com", "direct": true},
-    "theathletic": {"name": "The Athletic", "url": "theathletic.com", "direct": true},
-    "businessinsider": {"name": "Business Insider", "url": "businessinsider.com"},
-    "vogue": {"name": "Vogue", "url": "vogue.com"}
+    "nytimes.com": {"name": "New York Times", "method": "ARCHIVE"},
+    "cooking.nytimes.com": {"name": "New York Times Cooking", "method": "ARCHIVE"},
+    "newyorker.com": {"name": "The New Yorker", "method": "ARCHIVE"},
+    "economist.com": {"name": "The Economist", "method": "ARCHIVE"},
+    "washingtonpost.com": {"name": "Washington Post", "method": "ARCHIVE"},
+    "latimes.com": {"name": "LA Times", "method": "DIRECT"},
+    "theathletic.com": {"name": "The Athletic", "method": "DIRECT"},
+    "businessinsider.com": {"name": "Business Insider", "method": "ARCHIVE"},
+    "bloomberg.com": {"name": "Bloomberg", "method": "GOOGLE"},
+    "vogue.com": {"name": "Vogue", "method": "ARCHIVE"}
 }
\ No newline at end of file
diff --git a/package.json b/package.json
index 2630f5f..4946d92 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "paywall-reader",
   "description": "A web app that lets you read articles on popular news sites that get hidden behind paywalls.",
-  "version": "1.0.0",
+  "version": "1.1.2",
   "main": "index.js",
   "scripts": {
     "start": "node index.js",
diff --git a/parser.js b/parser.js
index 4beed3c..61cc886 100644
--- a/parser.js
+++ b/parser.js
@@ -6,7 +6,7 @@ function sanitizeUrl(url) {
     return url.split('?')[0];
 }
 
-async function checkUrl(url) {
+async function checkUrlArchive(url) {
     url = sanitizeUrl(url);
     const data = await fetch(`https://archive.org/wayback/available?url=${url}`);
     if (!data) {
@@ -19,6 +19,19 @@ async function checkUrl(url) {
     return json.archived_snapshots.closest.url;
 }
 
+async function checkUrlGoogle(url, site) {
+    url = sanitizeUrl(url);
+    const searchTerm = url.split('/')[url.split('/').length - 1];
+    const res = await fetch(`https://content-customsearch.googleapis.com/customsearch/v1?cx=${process.env.GOOGLE_SEARCH_ID}&key=${process.env.GOOGLE_SEARCH_KEY}&q=${searchTerm}`)
+    const json = await res.json();
+    if (!json?.items?.length) {
+        throw new Error('Unable to get result from search engine');
+    }
+    const { cacheId } = json.items[0];
+    const webCacheUrl = `http://webcache.googleusercontent.com/search?q=cache:${cacheId}:${site}`;
+    return webCacheUrl;
+}
+
 async function nyt(url) {
     const rawHtml = await fetch(url);
     const html = await rawHtml.text();
@@ -41,10 +54,10 @@ async function newyorker(url) {
     const rawHtml = await fetch(url);
     const html = await rawHtml.text();
     const $ = cheerio.load(html);
-    const scriptTag = $('script[type="application/ld+json"]').text().split(',\'keywords\':');
-    const badJsonFixer = JSON.parse(`${scriptTag[0]}}`); // wtf
-    const articleText = marked.parse(badJsonFixer.articleBody);
-    const articleHeadline = badJsonFixer.headline;
+    const scriptTag = $('script[type="application/ld+json"]').first().text().split(',\'keywords\':');
+    const json = JSON.parse(scriptTag[0]);;
+    const articleText = marked.parse(json.articleBody);
+    const articleHeadline = json.headline;
     return { articleText, articleHeadline };
 }
 
@@ -118,11 +131,23 @@ async function businessInsider(url) {
         const rootUrl = Object.keys(imageJson)[0];
         image.parent().html(`<img src="${decodeURIComponent(rootUrl)}" />`);
     });
+    $('.inline-newsletter-signup').remove();
     const articleHtml = $('.content-lock-content').html();
     const articleText = articleHtml;
     return { articleText, articleHeadline };
 }
 
+async function bloomberg(url) {
+    const rawHtml = await fetch(url);
+    const html = await rawHtml.text();
+    const $ = cheerio.load(html);
+    const scriptTag = $('script[data-component-props="ArticleBody"]').text();
+    const json = JSON.parse(scriptTag);
+    const articleText = json.story.body.replace(/60x-1/g, '1200x-1'); // replace low res images with higher res
+    const articleHeadline = json.story.seoHeadline;
+    return { articleText, articleHeadline };
+}
+
 async function vogue(url) {
     const rawHtml = await fetch(url);
     const html = await rawHtml.text();
@@ -138,59 +163,70 @@ async function vogue(url) {
     return { articleText, articleHeadline };
 }
 
-async function getContent(source, url, direct) {
+async function getContent(source, url, method) {
+    console.log(source, url, method);
     let articleText = null;
     let articleHeadline = null;
-    if (!direct) {
-        url = await checkUrl(url);
+    if (method === 'ARCHIVE') {
+        url = await checkUrlArchive(url);
+    } else if (method === 'GOOGLE') {
+        url = await checkUrlGoogle(url, source);
     }
+    console.log(url);
     switch(source) {
-        case 'nyt':
+        case 'nytimes.com':
             const nytRes = await nyt(url);
             articleText = nytRes.articleText;
             articleHeadline = nytRes.articleHeadline;
             break;
-        case 'nytcooking':
+        case 'cooking.nytimes.com':
             const nytCookingRes = await nytCooking(url);
             articleText = nytCookingRes.articleText;
             articleHeadline = nytCookingRes.articleHeadline;
             break;
-        case 'newyorker':
+        case 'newyorker.com':
             const newyorkerRes = await newyorker(url);
             articleText = newyorkerRes.articleText;
             articleHeadline = newyorkerRes.articleHeadline;
             break;
-        case 'economist':
+        case 'economist.com':
             const economistRes = await economist(url);
             articleText = economistRes.articleText;
             articleHeadline = economistRes.articleHeadline;
             break;
-        case 'washingtonpost':
+        case 'washingtonpost.com':
             const washingtonPostRes = await washingtonPost(url);
             articleText = washingtonPostRes.articleText;
             articleHeadline = washingtonPostRes.articleHeadline;
             break;
-        case 'latimes':
+        case 'latimes.com':
             const laTimesRes = await latimes(url);
             articleText = laTimesRes.articleText;
             articleHeadline = laTimesRes.articleHeadline;
             break;
-        case 'theathletic':
+        case 'theathletic.com':
             const theAthleticRes = await theAthletic(url);
             articleText = theAthleticRes.articleText;
             articleHeadline = theAthleticRes.articleHeadline;
             break;
-        case 'businessinsider':
+        case 'businessinsider.com':
             const businessInsiderRes = await businessInsider(url);
             articleText = businessInsiderRes.articleText;
             articleHeadline = businessInsiderRes.articleHeadline;
             break;
-        case 'vogue':
+        case 'bloomberg.com':
+            const bloombergRes = await bloomberg(url);
+            articleText = bloombergRes.articleText;
+            articleHeadline = bloombergRes.articleHeadline;
+            break;
+        case 'vogue.com':
             const vogueRes = await vogue(url);
             articleText = vogueRes.articleText;
             articleHeadline = vogueRes.articleHeadline;
             break;
-        
+        default:
+            articleText = 'No article found';
+            articleHeadline = '404';
     }
     return {articleText, articleHeadline};
 }
diff --git a/views/article.handlebars b/views/article.handlebars
index 67eb4de..990de01 100644
--- a/views/article.handlebars
+++ b/views/article.handlebars
@@ -2,6 +2,6 @@
 
 <form action="/read" method="GET">
     <input type="hidden" name="source" value="{{source}}" />
-    <input class="form-control mb-2" type="text" name="url" placeholder="Type {{sourceText}} URL here" />
+    <input class="form-control mb-2" type="text" name="url" placeholder="Type {{sourceText}} URL here" autofocus />
     <button class="btn btn-success">Submit</button>
 </form>
\ No newline at end of file
diff --git a/views/home.handlebars b/views/home.handlebars
index 746d7b4..3e02d1a 100644
--- a/views/home.handlebars
+++ b/views/home.handlebars
@@ -1,18 +1,8 @@
 ﻿<p class="lead">A web app that lets you read articles on popular news sites that get hidden behind paywalls.</p>
 
 <div class="row">
-    {{!-- <div class="col-sm-6 col-lg-4 mb-4">
-        <div class="card">
-            <div class="card-body">
-                <h5 class="card-title">Card title</h5>
-                <p class="card-text">This is another card with title and supporting text below. This card has some
-                    additional content to make it slightly taller overall.</p>
-                <p class="card-text"><small class="text-muted">Last updated 3 mins ago</small></p>
-            </div>
-        </div>
-    </div> --}}
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=nyt">
+        <a href="/article?source=nytimes.com">
             <div class="card p-5 text-white bg-primary rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote">
@@ -26,7 +16,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=nytcooking">
+        <a href="/article?source=cooking.nytimes.com">
             <div class="card p-5 text-white bg-success rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote">
@@ -40,7 +30,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=newyorker">
+        <a href="/article?source=newyorker.com">
             <div class="card p-5 text-white bg-info rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote">
@@ -54,7 +44,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=economist">
+        <a href="/article?source=economist.com">
             <div class="card p-5 text-white bg-danger rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote">
@@ -68,7 +58,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=washingtonpost">
+        <a href="/article?source=washingtonpost.com">
             <div class="card p-5 bg-warning border rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote text-dark">
@@ -82,7 +72,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=latimes">
+        <a href="/article?source=latimes.com">
             <div class="card p-5 bg-secondary border rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote text-white">
@@ -96,7 +86,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=theathletic">
+        <a href="/article?source=theathletic.com">
             <div class="card p-5 bg-dark border rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote text-white">
@@ -110,7 +100,7 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=businessinsider">
+        <a href="/article?source=businessinsider.com">
             <div class="card p-5 bg-light border rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote text-dark">
@@ -124,8 +114,22 @@
         </a>
     </div>
     <div class="col-sm-6 col-lg-4 mb-4">
-        <a href="/article?source=vogue">
+        <a href="/article?source=bloomberg.com">
             <div class="card p-5 bg-primary border rounded-3">
+                <figure class="mb-0">
+                    <blockquote class="blockquote text-white">
+                        <p>Bloomberg</p>
+                    </blockquote>
+                    <figcaption class="mb-0 text-white">
+                        bloomberg.com
+                    </figcaption>
+                </figure>
+            </div>
+        </a>
+    </div>
+    <div class="col-sm-6 col-lg-4 mb-4">
+        <a href="/article?source=vogue.com">
+            <div class="card p-5 bg-success border rounded-3">
                 <figure class="mb-0">
                     <blockquote class="blockquote text-white">
                         <p>Vogue</p>
diff --git a/views/read.handlebars b/views/read.handlebars
index bf6bd89..657c988 100644
--- a/views/read.handlebars
+++ b/views/read.handlebars
@@ -1,4 +1,4 @@
-﻿<h3 class="text-secondary">{{articleHeadline}} <span class="badge bg-dark source-badge">{{sourceText}}</span></h3>
+﻿<h3 class="text-secondary">{{{articleHeadline}}} <span class="badge bg-dark source-badge">{{sourceText}}</span></h3>
 <hr />
 <div class="article-text-container">
     {{{articleText}}}