From 0c08a071207f70a49c608f5b724794742fb82d59 Mon Sep 17 00:00:00 2001 From: Kelly Miyashiro Date: Mon, 6 Apr 2020 15:14:58 -0700 Subject: [PATCH] Fix --- .prettierrc | 4 ++ package.json | 7 ++- scraper.js | 160 ++++++++++++++++++++++++++++++--------------------- yarn.lock | 93 +++++++++++++++++++++++++++++- 4 files changed, 196 insertions(+), 68 deletions(-) create mode 100644 .prettierrc diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..46be197 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,4 @@ +trailingComma: "es5" +tabWidth: 2 +semi: false +singleQuote: true diff --git a/package.json b/package.json index ca75c4a..d504a1a 100644 --- a/package.json +++ b/package.json @@ -2,9 +2,12 @@ "version": "0.0.1", "main": "scraper.js", "dependencies": { + "axios": "^0.19.2", + "axios-cookiejar-support": "^0.5.1", "cheerio": "latest", - "request": "latest", - "sqlite3": "latest" + "prettier": "^2.0.4", + "sqlite3": "latest", + "tough-cookie": "^4.0.0" }, "scripts": { "start": "node scraper.js" diff --git a/scraper.js b/scraper.js index 5a1fe6f..3b18fab 100644 --- a/scraper.js +++ b/scraper.js @@ -1,92 +1,124 @@ // This is a template for a Node.js scraper on morph.io (https://morph.io) -var cheerio = require("cheerio"); -var request = require("request"); -var sqlite3 = require("sqlite3").verbose(); +const cheerio = require('cheerio') +const querystring = require('querystring') +const axios = require('axios').default +const tough = require('tough-cookie') +const axiosCookieJarSupport = require('axios-cookiejar-support').default +const sqlite3 = require('sqlite3').verbose() + +const { Cookie } = tough + +axiosCookieJarSupport(axios) + +const cookieJar = new tough.CookieJar() +axios.defaults.jar = cookieJar +axios.defaults.withCredentials = true + +const LOGIN_URL = 'https://www.klwines.com/account/login' +const USERNAME = process.env.MORPH_KL_USER +const PASSWORD = process.env.MORPH_KL_PASSWORD + +async function getLoginRequestToken() { + const response = await axios(LOGIN_URL) + const $ = cheerio.load(response.data) + return $('[name="__RequestVerificationToken"]').val() +} + +async function getCookie() { + const loginRequestToken = await getLoginRequestToken() + const formData = { + __RequestVerificationToken: loginRequestToken, + Email: USERNAME, + Password: PASSWORD, + ReturnUrl: '', + 'Login.x': '15', + 'Login.y': '5', + } + + return axios(LOGIN_URL, { + method: 'POST', + headers: { + 'Content-Type': 'application/x-www-form-urlencoded', + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9', + }, + data: querystring.stringify(formData), + }) +} function initDatabase(callback) { // Set up sqlite database. - var db = new sqlite3.Database("data.sqlite"); - db.serialize(function() { + const db = new sqlite3.Database('data.sqlite') + db.serialize(function () { db.run( - "CREATE TABLE IF NOT EXISTS data (key PRIMARY KEY, id TEXT, date TEXT, name TEXT, price INT)" - ); - callback(db); - }); + 'CREATE TABLE IF NOT EXISTS data (key PRIMARY KEY, id TEXT, date TEXT, name TEXT, price INT)' + ) + callback(db) + }) } function updateRow(db, id, name, price) { - let date = new Date(); + console.log('UPDATE', id, name, price) + let date = new Date() let dateString = - date.getFullYear() + "-" + (date.getMonth() + 1) + "-" + date.getDate(); + date.getFullYear() + '-' + (date.getMonth() + 1) + '-' + date.getDate() // Insert some data. - var statement = db.prepare(`INSERT INTO data(key, id, date, name, price) + const statement = db.prepare(`INSERT INTO data(key, id, date, name, price) VALUES (?, ?, ?, ?, ?) ON CONFLICT(key) DO UPDATE SET price=excluded.price - `); - statement.run(`${dateString}-${id}`, id, dateString, name, price); - statement.finalize(); + `) + statement.run(`${dateString}-${id}`, id, dateString, name, price) + statement.finalize() + console.log('FINALIZE', id, name, price) } function readRows(db) { // Read some data. - db.each("SELECT name, date, price FROM data", function(err, row) { - console.log(`${row.date} ${row.name}: ${row.price}`); - }); + db.each('SELECT name, date, price FROM data', function (err, row) { + console.log(`${row.date} ${row.name}: ${row.price}`) + }) } function getSourceId(url) { - return url.match(/i=([\d]+?)\&/)[1]; + return url.match(/i=([\d]+?)\&/)[1] } -function fetchPage(url, callback) { +async function run(db) { + console.log('running') + console.log('getting cookie') + await getCookie() // Use request to read in pages. - request(url, function(error, response, body) { - if (error) { - console.log("Error requesting page: " + error); - return; - } - - callback(body); - }); -} + console.log('fetching page') + const response = await axios( + 'https://www.klwines.com/Products?&filters=sv2_206!20&limit=500&offset=0' + ) + // Use cheerio to find things in the page with css selectors. + const $ = cheerio.load(response.data) -function run(db) { - // Use request to read in pages. - fetchPage( - "https://www.klwines.com/Products?&filters=sv2_206!20&limit=500&offset=0", - function(body) { - // Use cheerio to find things in the page with css selectors. - var $ = cheerio.load(body); - - var elements = $(".result"); - // console.log("Elements", elements); - elements.each(function() { - var link = $(this) - .find(".result-desc > a[href^='/p/i']") - .first(); - - var id = getSourceId(link.attr("href")); - var name = link.text().trim(); - var price = parseInt( - $(this) - .find(".price strong") - .text() - .trim() - .replace(/[\$\.]/g, "") - ); - console.log("id", id); - console.log("name", name); - console.log("price", price); - - updateRow(db, id, name, price); - }); - - db.close(); - } - ); + const elements = $('.tf-product') + console.log(`Found ${elements.length} elements`) + // console.log("Elements", elements); + elements.each(function () { + const link = $(this).find(".tf-product-header > a[href^='/p/i']").first() + + const id = getSourceId(link.attr('href')) + const name = link.text().trim() + const priceNode = $(this).find('.tf-price span:nth-of-type(2)') + const price = parseInt( + priceNode + .text() + .trim() + .replace(/[\$\.\*]/g, '') + ) + console.log('id:', id, 'name:', name, 'price:', price) + + updateRow(db, id, name, price) + }) + + db.close() } -initDatabase(run); +initDatabase(run) diff --git a/yarn.lock b/yarn.lock index e7e7276..5d97d56 100644 --- a/yarn.lock +++ b/yarn.lock @@ -7,6 +7,11 @@ resolved "https://registry.yarnpkg.com/@types/node/-/node-12.12.11.tgz#bec2961975888d964196bf0016a2f984d793d3ce" integrity sha512-O+x6uIpa6oMNTkPuHDa9MhMMehlxLAd5QcOvKRjAFsBVpeFWTOPnXbDvILvFgFFZfQ1xh1EZi1FbXxUix+zpsQ== +"@types/tough-cookie@^2.3.3": + version "2.3.7" + resolved "https://registry.yarnpkg.com/@types/tough-cookie/-/tough-cookie-2.3.7.tgz#979434b5900f9d710f5d4e15c466cadb8e9fdc47" + integrity sha512-rMQbgMGxnLsdn8e9aPVyuN+zMQLrZ2QW8xlv7eWS1mydfGXN+tsTKffcIzd8rGCcLdmi3xvQw2MDaZI1bBNTaw== + abbrev@1: version "1.1.1" resolved "https://registry.yarnpkg.com/abbrev/-/abbrev-1.1.1.tgz#f8f2c887ad10bf67f634f005b6987fed3179aac8" @@ -72,6 +77,23 @@ aws4@^1.8.0: resolved "https://registry.yarnpkg.com/aws4/-/aws4-1.8.0.tgz#f0e003d9ca9e7f59c7a508945d7b2ef9a04a542f" integrity sha512-ReZxvNHIOv88FlT7rxcXIIC0fPt4KZqZbOlivyWtXLt8ESx84zd3kMC6iK5jVeS2qt+g7ftS7ye4fi06X5rtRQ== +axios-cookiejar-support@^0.5.1: + version "0.5.1" + resolved "https://registry.yarnpkg.com/axios-cookiejar-support/-/axios-cookiejar-support-0.5.1.tgz#0622c2849cefbaf8424a50b630283231b62fc277" + integrity sha512-mmMbNDjpkAKlyxVOYjkpvV6rDRoSjBXwHbfkWvnsplRTGYCergbHvZInRB1G3lqumllUQwo0A4uPoqEsYfzq3A== + dependencies: + "@types/tough-cookie" "^2.3.3" + is-redirect "^1.0.0" + pify "^4.0.0" + tough-cookie "^3.0.1" + +axios@^0.19.2: + version "0.19.2" + resolved "https://registry.yarnpkg.com/axios/-/axios-0.19.2.tgz#3ea36c5d8818d0d5f8a8a97a6d36b86cdc00cb27" + integrity sha512-fjgm5MvRHLhx+osE2xoekY70AhARk3a6hkN+3Io1jc00jtquGvxYlKlsFUhmUET0V5te6CcZI7lcv2Ym61mjHA== + dependencies: + follow-redirects "1.5.10" + balanced-match@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.0.tgz#89b4d199ab2bee49de164ea02b89ce462d71b767" @@ -168,6 +190,13 @@ dashdash@^1.12.0: dependencies: assert-plus "^1.0.0" +debug@=3.1.0: + version "3.1.0" + resolved "https://registry.yarnpkg.com/debug/-/debug-3.1.0.tgz#5bb5a0672628b64149566ba16819e61518c67261" + integrity sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g== + dependencies: + ms "2.0.0" + debug@^3.2.6: version "3.2.6" resolved "https://registry.yarnpkg.com/debug/-/debug-3.2.6.tgz#e83d17de16d8a7efb7717edbe5fb10135eee629b" @@ -287,6 +316,13 @@ fast-json-stable-stringify@^2.0.0: resolved "https://registry.yarnpkg.com/fast-json-stable-stringify/-/fast-json-stable-stringify-2.0.0.tgz#d5142c0caee6b1189f87d3a76111064f86c8bbf2" integrity sha1-1RQsDK7msRifh9OnYREGT4bIu/I= +follow-redirects@1.5.10: + version "1.5.10" + resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.5.10.tgz#7b7a9f9aea2fdff36786a94ff643ed07f4ff5e2a" + integrity sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ== + dependencies: + debug "=3.1.0" + forever-agent@~0.6.1: version "0.6.1" resolved "https://registry.yarnpkg.com/forever-agent/-/forever-agent-0.6.1.tgz#fbc71f0c41adeb37f96c577ad1ed42d8fdacca91" @@ -417,6 +453,11 @@ ini@~1.3.0: resolved "https://registry.yarnpkg.com/ini/-/ini-1.3.5.tgz#eee25f56db1c9ec6085e0c22778083f596abf927" integrity sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw== +ip-regex@^2.1.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/ip-regex/-/ip-regex-2.1.0.tgz#fa78bf5d2e6913c911ce9f819ee5146bb6d844e9" + integrity sha1-+ni/XS5pE8kRzp+BnuUUa7bYROk= + is-fullwidth-code-point@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz#ef9e31386f031a7f0d643af82fde50c457ef00cb" @@ -429,6 +470,11 @@ is-fullwidth-code-point@^2.0.0: resolved "https://registry.yarnpkg.com/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz#a3b30a5c4f199183167aaab93beefae3ddfb654f" integrity sha1-o7MKXE8ZkYMWeqq5O+764937ZU8= +is-redirect@^1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/is-redirect/-/is-redirect-1.0.0.tgz#1d03dded53bd8db0f30c26e4f95d36fc7c87dc24" + integrity sha1-HQPd7VO9jbDzDCbk+V02/HyH3CQ= + is-typedarray@~1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/is-typedarray/-/is-typedarray-1.0.0.tgz#e479c80858df0c1b11ddda6940f96011fcda4a9a" @@ -530,6 +576,11 @@ mkdirp@^0.5.0, mkdirp@^0.5.1: dependencies: minimist "0.0.8" +ms@2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/ms/-/ms-2.0.0.tgz#5608aeadfc00be6c2901df5f9861788de0d597c8" + integrity sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g= + ms@^2.1.1: version "2.1.2" resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" @@ -660,6 +711,16 @@ performance-now@^2.1.0: resolved "https://registry.yarnpkg.com/performance-now/-/performance-now-2.1.0.tgz#6309f4e0e5fa913ec1c69307ae364b4b377c9e7b" integrity sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns= +pify@^4.0.0: + version "4.0.1" + resolved "https://registry.yarnpkg.com/pify/-/pify-4.0.1.tgz#4b2cd25c50d598735c50292224fd8c6df41e3231" + integrity sha512-uB80kBFb/tfd68bVleG9T5GGsGPjJrLAUpR5PZIrhBnIaRTQRjqdJSsIKkOP6OAIFbj7GOrcudc5pNjZ+geV2g== + +prettier@^2.0.4: + version "2.0.4" + resolved "https://registry.yarnpkg.com/prettier/-/prettier-2.0.4.tgz#2d1bae173e355996ee355ec9830a7a1ee05457ef" + integrity sha512-SVJIQ51spzFDvh4fIbCLvciiDMCrRhlN3mbZvv/+ycjvmF5E73bKdGfU8QDLNmjYJf+lsGnDBC4UUnvTe5OO0w== + process-nextick-args@~2.0.0: version "2.0.1" resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-2.0.1.tgz#7820d9b16120cc55ca9ae7792680ae7dba6d7fe2" @@ -670,12 +731,17 @@ psl@^1.1.24: resolved "https://registry.yarnpkg.com/psl/-/psl-1.4.0.tgz#5dd26156cdb69fa1fdb8ab1991667d3f80ced7c2" integrity sha512-HZzqCGPecFLyoRj5HLfuDSKYTJkAfB5thKBIkRHtGjWwY7p1dAyveIbXIq4tO0KYfDF2tHqPUgY9SDnGm00uFw== +psl@^1.1.28, psl@^1.1.33: + version "1.8.0" + resolved "https://registry.yarnpkg.com/psl/-/psl-1.8.0.tgz#9326f8bcfb013adcc005fdff056acce020e51c24" + integrity sha512-RIdOzyoavK+hA18OGGWDqUTsCLhtA7IcZ/6NCs4fFJaHBDab+pDDmDIByWFRQJq2Cd7r1OoQxBGKOaztq+hjIQ== + punycode@^1.4.1: version "1.4.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e" integrity sha1-wNWmOycYgArY4esPpSachN1BhF4= -punycode@^2.1.0: +punycode@^2.1.0, punycode@^2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec" integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A== @@ -717,7 +783,7 @@ readable-stream@^3.1.1: string_decoder "^1.1.1" util-deprecate "^1.0.1" -request@^2.87.0, request@latest: +request@^2.87.0: version "2.88.0" resolved "https://registry.yarnpkg.com/request/-/request-2.88.0.tgz#9c2fca4f7d35b592efe57c7f0a55e81052124fef" integrity sha512-NAqBSrijGLZdM0WZNsInLJpkJokL72XYjUpnB0iwsRgxh7dB6COrHnTBNwN0E+lHDAJzu7kLAkDeY08z2/A0hg== @@ -872,6 +938,24 @@ tar@^4: safe-buffer "^5.1.2" yallist "^3.0.3" +tough-cookie@^3.0.1: + version "3.0.1" + resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-3.0.1.tgz#9df4f57e739c26930a018184887f4adb7dca73b2" + integrity sha512-yQyJ0u4pZsv9D4clxO69OEjLWYw+jbgspjTue4lTQZLfV0c5l1VmK2y1JK8E9ahdpltPOaAThPcp5nKPUgSnsg== + dependencies: + ip-regex "^2.1.0" + psl "^1.1.28" + punycode "^2.1.1" + +tough-cookie@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-4.0.0.tgz#d822234eeca882f991f0f908824ad2622ddbece4" + integrity sha512-tHdtEpQCMrc1YLrMaqXXcj6AxhYi/xgit6mZu1+EDWUn+qhUf8wMQoFIy9NXuq23zAwtcB0t/MjACGR18pcRbg== + dependencies: + psl "^1.1.33" + punycode "^2.1.1" + universalify "^0.1.2" + tough-cookie@~2.4.3: version "2.4.3" resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-2.4.3.tgz#53f36da3f47783b0925afa06ff9f3b165280f781" @@ -892,6 +976,11 @@ tweetnacl@^0.14.3, tweetnacl@~0.14.0: resolved "https://registry.yarnpkg.com/tweetnacl/-/tweetnacl-0.14.5.tgz#5ae68177f192d4456269d108afa93ff8743f4f64" integrity sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q= +universalify@^0.1.2: + version "0.1.2" + resolved "https://registry.yarnpkg.com/universalify/-/universalify-0.1.2.tgz#b646f69be3942dabcecc9d6639c80dc105efaa66" + integrity sha512-rBJeI5CXAlmy1pV+617WB9J63U6XcazHHF2f2dbJix4XzpUF0RS3Zbj0FGIOCAva5P/d/GBOYaACQ1w+0azUkg== + uri-js@^4.2.2: version "4.2.2" resolved "https://registry.yarnpkg.com/uri-js/-/uri-js-4.2.2.tgz#94c540e1ff772956e2299507c010aea6c8838eb0"