/
index.js
190 lines (155 loc) · 5.29 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
const puppeteer = require('puppeteer')
const credentials = require('./credentials')
const ora = require('ora')
const fs = require('fs')
const csvWriter = require('csv-write-stream')
// Shared CLI spinner used for all progress and status output in this script.
const spinner = ora()
// Output-format tags. Symbol.keyFor() on these yields 'json' / 'csv', which
// run() uses as the extension of the output file.
// NOTE: this binding shadows the built-in global `JSON` object for the whole
// module, so a bare `JSON.stringify(...)` in this file resolves to this symbol
// rather than to the built-in.
const JSON = Symbol.for('json')
const CSV = Symbol.for('csv')
/**
 * Scrapes every wishlist of the account in `./credentials` from Book
 * Depository and writes the collected books to `./wishlists.<json|csv>`.
 *
 * The output format is selected by the `outputType` constant below. The
 * browser is always closed, even when scraping fails part-way through.
 *
 * @returns {Promise<void>} Resolves once the output file has been completely
 *   written, or once a write failure has been reported on the spinner.
 * @throws {Error} If `outputType` is not a known format symbol, or if any
 *   Puppeteer step rejects.
 */
async function run () {
  // Setup ---------------------------------------------------------------------
  const outputType = CSV
  const outputPath = `./wishlists.${Symbol.keyFor(outputType)}`
  const browser = await puppeteer.launch({
    headless: true
  })

  let wishlists
  try {
    const page = await browser.newPage()

    // Login -------------------------------------------------------------------
    await logIn(page)

    // Wishlists ---------------------------------------------------------------
    wishlists = await listWishlists(page)
    const ids = Object.keys(wishlists)
    let comp = 1
    for (const id of ids) {
      await scrapeWishlist(page, wishlists[id], id, `${comp}/${ids.length}`)
      comp++
    }
  } finally {
    // Teardown ----------------------------------------------------------------
    // Runs on failure too, so a crashed scrape does not leave Chromium behind.
    await browser.close()
  }

  // Export --------------------------------------------------------------------
  let write
  switch (outputType) {
    case JSON:
      write = writeJson
      break
    case CSV:
      write = writeCsv
      break
    default:
      // String() is required: interpolating a Symbol directly throws TypeError.
      throw new Error(`Unknown output type ${String(outputType)}`)
  }

  spinner.start(`Writing results to ${outputPath}`)
  try {
    await write(outputPath, wishlists)
  } catch (err) {
    spinner.fail(`Failed to write results to ${outputPath}: ${err}`)
    return
  }
  spinner.succeed(`Wrote results to ${outputPath}`)
}

// Fills in and submits the login form, then waits for the resulting navigation.
async function logIn (page) {
  spinner.start(`Logging in as ${credentials.username}`)
  await page.goto('https://www.bookdepository.com/account/login')
  await page.click('#username')
  await page.keyboard.type(credentials.username)
  await page.click('#loginPassword')
  await page.keyboard.type(credentials.password)
  // Start waiting before clicking so a fast navigation cannot be missed.
  await Promise.all([
    page.waitForNavigation(),
    page.click('form.form-horizontal.login-form button.btn.btn-primary')
  ])
  spinner.succeed('Logged in')
}

// Reads the wishlist sidebar and returns a map of `{ [wishlistId]: { name } }`.
async function listWishlists (page) {
  spinner.start('Getting list of wishlists')
  await page.goto('https://www.bookdepository.com/account/wishlist')
  const wishlists = await page.$$eval(
    'ul.wishlist-links.sidebar-nav li a',
    links =>
      links.reduce((acc, link) => {
        const match = link.href.match(/wishlistId=(\d+)/)
        // Skip sidebar links that do not point at a specific wishlist.
        if (match !== null) {
          acc[match[1]] = { name: link.children[0].innerText }
        }
        return acc
      }, {})
  )
  const names = Object.keys(wishlists).map(id => wishlists[id].name)
  spinner.succeed(`Found ${names.length} wishlists: ${names.join(', ')}`)
  return wishlists
}

// Visits one wishlist, walks its pages and stores the books on `wishlist.books`.
async function scrapeWishlist (page, wishlist, id, progress) {
  const label = `${progress}: ${wishlist.name} (${id})`
  spinner.start(`Scraping ${label}`)
  await page.goto(
    `https://www.bookdepository.com/account/wishlist?wishlistId=${id}`
  )

  // Always defined, so the export step can iterate it even for empty lists.
  wishlist.books = []

  /* Wishlists with only one page have no pagination elements. When they are
   * present, the selector below also matches the "next" arrow, hence the -1.
   * The page we are already on is always scraped at least once.
   */
  const paginationItems = await page.$$('ul.pagination:nth-child(2) > li')
  const pageCount = Math.max(1, paginationItems.length - 1)

  for (let i = 0; i < pageCount; i++) {
    spinner.start(`Scraping ${label} - Page ${i + 1} of ${pageCount}`)
    await page.waitFor('.wishlist-items')
    const books = await scrapeBooksOnPage(page)
    wishlist.books = wishlist.books.concat(books)

    // No "next" link means this was the last page of the wishlist.
    const next = await page.$('#next-top > a')
    if (next === null) {
      break
    }
    // NOTE(review): if this link triggers a full navigation, the click should
    // be paired with page.waitForNavigation() (as in logIn) so the next
    // iteration cannot read the old page - confirm against the live site.
    await next.click()
  }

  postSingleChk(spinner, wishlist, id)
  spinner.succeed(`Scraped ${label}`)
}

// Extracts `{ title, author }` for every book on the currently loaded page.
function scrapeBooksOnPage (page) {
  return page.$$eval(
    '.wishlist-items .book-list-item .item-info-wrap',
    items =>
      items.map(item => {
        const itemInfo = Array.from(item.children).find(
          value => value.className === 'item-info'
        )
        const itemInfoChildren = itemInfo ? Array.from(itemInfo.children) : []
        const titleEl = itemInfoChildren.find(
          value => value.className === 'item-title'
        )
        const authorEl = itemInfoChildren.find(
          value => value.className === 'author'
        )
        // A missing title/author element yields '' instead of aborting the run.
        return {
          title: titleEl ? titleEl.innerText : '',
          author: authorEl ? authorEl.innerText.replace('By ', '') : ''
        }
      })
  )
}

// Writes the wishlists as pretty-printed JSON; resolves when the file is written.
function writeJson (outputPath, wishlists) {
  return new Promise((resolve, reject) => {
    // `JSON` is shadowed in this module by the output-type symbol, so the
    // built-in has to be reached through the global object.
    const text = global.JSON.stringify(wishlists, null, 2)
    // writeFile truncates an existing file, so no prior delete is needed.
    fs.writeFile(outputPath, text, err => (err ? reject(err) : resolve()))
  })
}

// Writes one CSV row per book (title, author, wishlist name as tag); resolves
// once the underlying file stream has flushed.
function writeCsv (outputPath, wishlists) {
  return new Promise((resolve, reject) => {
    const writer = csvWriter({ headers: ['title', 'author', 'tags'] })
    // createWriteStream truncates an existing file, so no prior delete is needed.
    const file = fs.createWriteStream(outputPath)
    writer.on('error', reject)
    file.on('error', reject)
    file.on('finish', resolve)
    writer.pipe(file)
    for (const id of Object.keys(wishlists)) {
      const tags = wishlists[id].name
      for (const book of wishlists[id].books || []) {
        writer.write([book.title, book.author, tags])
      }
    }
    writer.end()
  })
}
// Kick off the scrape. run() reports its own progress, so only failures need
// handling here: show them on the spinner and signal them via the exit code.
run().catch(err => {
  spinner.fail(`Failed to scrape wishlists: ${err}`)
  process.exitCode = 1
})
/**
 * Reports a failure on the given spinner when a wishlist has no books.
 *
 * @param {{ fail: function(string): * }} spinner - Spinner to report on.
 * @param {{ name: string, books: (Array|undefined) }} wishlist - Wishlist entry.
 * @param {string} id - Wishlist id, used only in the message.
 */
const postSingleChk = (spinner, wishlist, id) => {
  const hasBooks =
    wishlist.hasOwnProperty('books') && wishlist.books.length !== 0
  if (hasBooks) {
    return
  }
  spinner.fail(`Found no books for ${wishlist.name} (${id})`)
}