-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
222 lines (185 loc) · 5.59 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
const Nightmare = require('nightmare')
const vo = require('vo')
const fs = require('fs')
const stopWords = require('./stopWords')
// Single Nightmare instance shared by every scraping step in this file.
// While debugging, `openDevTools: { mode: 'detach' }` can be passed next to `show`.
const nightmare = Nightmare({ show: true })

// Prefix of a Rotten Tomatoes movie page; the movie slug is appended to it.
const rottenTomatoesURL = 'https://www.rottentomatoes.com/m/'
// Slug of the movie whose reviews get scraped.
const movieTitle = 'hustlers_2019'
// go to the page
// select view all reviews
// loop
// grab the review text
// sort into good and bad reviews
// if there is a next button click it
// need to deal with last page (there will be no next button)
// Entry point: hand the `run` generator to vo and print whatever it returns.
vo(run)((err, result) => {
  if (err) {
    throw err
  }
  console.dir(result)
})
/**
 * Scrapes the reviews of the configured movie and returns its most popular
 * words.
 *
 * Each page is scraped BEFORE the next-button check, so the final page (and a
 * movie with a single page of reviews) is included. At most `maxPages` pages
 * are visited.
 *
 * @returns {{fresh: Array, rotten: Array}} whatever `handleResult` produces
 *   for the accumulated word counts.
 */
function* run() {
  const movie = `${rottenTomatoesURL}${movieTitle}`
  // Upper bound on the number of review pages visited in one run.
  const maxPages = 4
  let reviewCompare = { fresh: {}, rotten: {} }

  // go to the page and select view all reviews
  yield getMovieReviews(movie)

  for (let currPage = 1; currPage <= maxPages; currPage++) {
    // grab review text and freshness, and merge the updated counts back in
    const pageResult = yield handleReviews(reviewCompare)
    reviewCompare = Object.assign(reviewCompare, pageResult.data)

    // no need to look for another page once the cap is reached
    if (currPage === maxPages) break

    // only advance when the current page links to a further page
    const nextExists = yield checkForValidNextButton()
    if (!nextExists) break
    yield clickNextButton()
  }

  yield nightmare.end()
  return handleResult(reviewCompare)
}
/**
 * Opens the movie page and follows its "view all critic reviews" link.
 *
 * Waits for the link itself instead of sleeping a fixed 500 ms, so a slow
 * page load no longer makes the click fail.
 *
 * @param {string} movie - URL of the movie page.
 * @returns the Nightmare chain, which can be yielded/awaited by the caller.
 */
function getMovieReviews(movie) {
  const viewAllLink = '.view_all_critic_reviews'
  return nightmare
    .goto(movie)
    .wait(viewAllLink)
    .click(viewAllLink)
    .wait('body')
}
/**
 * Reports whether the current page links to a further page of reviews.
 *
 * The anchor that directly follows the `pageInfo` span is inspected:
 *  - no such anchor (e.g. only one page of reviews) -> false
 *  - its href ends in '#'                           -> false
 *  - anything else                                  -> true
 *
 * A missing anchor used to throw inside the page, which rejected the whole
 * evaluation; it is now reported as "no next page" instead.
 *
 * @returns the pending Nightmare evaluation, resolving to a boolean.
 */
function checkForValidNextButton() {
  return nightmare.evaluate(() => {
    const nextLink = document.querySelector('span[class="pageInfo"] + a')
    if (nextLink === null) {
      return false
    }
    return !nextLink.href.endsWith('#')
  })
}
/**
 * Counts the words of every review on the current page, grouped by freshness.
 *
 * The callback is handed to `nightmare.evaluate`, i.e. it runs in the page, so
 * everything it needs is defined inline rather than taken from this file.
 *
 * For each `.review_container`:
 *  - freshness is the 4th class token of the container's first `div`
 *  - the text comes from the first `.the_review` element
 *  - punctuation is stripped, the text lower-cased and split on whitespace
 * Reviews with a missing element or a freshness that is not a key of `data`
 * are skipped, and empty tokens (from leading/trailing whitespace or empty
 * text) are not counted.
 *
 * @param {Object} reviewCompare - counts so far, keyed by freshness, each
 *   mapping word -> [word, occurrences].
 * @returns the pending evaluation, resolving to `{ data }` with updated counts.
 */
function handleReviews(reviewCompare) {
  return nightmare.evaluate(data => {
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key)

    Array.from(document.querySelectorAll('.review_container')).forEach(
      review => {
        const firstDiv = review.querySelector('div')
        const reviewText = review.getElementsByClassName('the_review')[0]
        if (!firstDiv || !reviewText) return

        const freshness = (firstDiv.getAttribute('class') || '').split(' ')[3]
        // skip reviews whose freshness has no bucket instead of crashing
        if (!hasOwn(data, freshness)) return

        const bucket = data[freshness]
        reviewText.innerText
          .replace(/[.,/#!$%^&*'";:{}=\-_`~()]/g, '')
          .toLowerCase()
          .split(/\s+/)
          .filter(word => word !== '')
          .forEach(word => {
            if (hasOwn(bucket, word)) {
              bucket[word][1]++
            } else {
              bucket[word] = [word, 1]
            }
          })
      }
    )
    return { data: data }
  }, reviewCompare)
}
/**
 * Reads the freshness marker of a review element.
 *
 * @param review - element whose first `div` carries the marker in its class.
 * @returns {string|undefined} the 4th space-separated class token of that div.
 */
function getFreshness(review) {
  const firstDiv = review.querySelector('div')
  const classTokens = firstDiv.getAttribute('class').split(' ')
  return classTokens[3]
}
/**
 * Extracts the text of a review.
 *
 * @param review - element containing at least one `.the_review` child.
 * @returns {string} the `innerText` of the first `.the_review` element.
 */
function getContent(review) {
  const matches = review.getElementsByClassName('the_review')
  const firstMatch = matches[0]
  return firstMatch.innerText
}
/**
 * Splits review text into lower-cased words.
 *
 * Punctuation listed in the pattern is removed, the text is lower-cased and
 * split on runs of whitespace. Empty tokens, which the split produces for
 * leading/trailing whitespace or empty input, are dropped so they are never
 * counted as words.
 *
 * @param {string} content - raw review text.
 * @returns {string[]} the words in order of appearance.
 */
function getWords(content) {
  return content
    .replace(/[.,/#!$%^&*'";:{}=\-_`~()]/g, '')
    .toLowerCase()
    .split(/\s+/)
    .filter(word => word !== '')
}
/**
 * Adds the given words to the counts stored under `data[freshness]`.
 *
 * Every entry has the shape `[word, occurrences]`; a word seen before gets its
 * counter bumped, a new word starts at 1. `data` is modified in place.
 *
 * @param {string[]} words - words to count.
 * @param {string} freshness - key of the bucket inside `data`.
 * @param {Object} data - buckets keyed by freshness.
 */
function allocateWords(words, freshness, data) {
  for (const term of words) {
    const bucket = data[freshness]
    if (!bucket.hasOwnProperty(term)) {
      bucket[term] = [term, 1]
      continue
    }
    bucket[term][1] += 1
  }
}
/**
 * Clicks the anchor that directly follows the `pageInfo` span, then waits for
 * `body`.
 *
 * @returns the Nightmare chain, which can be yielded/awaited by the caller.
 */
function clickNextButton() {
  const nextLinkSelector = 'span[class="pageInfo"] + a'
  const afterClick = nightmare.click(nextLinkSelector)
  return afterClick.wait('body')
}
/**
 * Turns raw word counts into the 16 most used non-stop-words per freshness.
 *
 * Steps: drop stop words, sort each list by occurrences, keep the top 16, and
 * dump the result to `reviews.txt` (kept for inspecting the output by hand).
 *
 * NOTE: an earlier draft sketched splitting the lists into `critics` and
 * `audience` groups; only a single fresh/rotten pair is built here.
 *
 * @param {Object} reviewCompare - counts keyed by freshness, each mapping
 *   word -> [word, occurrences].
 * @returns {{fresh: Array, rotten: Array}} the most popular words.
 */
function handleResult(reviewCompare) {
  const commonlyUsedWords = { fresh: [], rotten: [] }

  // keep only the words that are not stop words
  filterOutStopWords(reviewCompare, commonlyUsedWords)

  const popularWords = getMostPopularWords(
    sortReviews(commonlyUsedWords.fresh),
    sortReviews(commonlyUsedWords.rotten),
    16
  )

  writeResultToFile('reviews.txt', JSON.stringify(popularWords))
  return popularWords
}
/**
 * Copies every non-stop-word entry of `reviews` into the matching list.
 *
 * @param {Object} reviews - buckets keyed by freshness, each mapping
 *   word -> [word, occurrences].
 * @param {Object} list - arrays keyed by the same freshness values; entries
 *   are appended to them in place.
 */
function filterOutStopWords(reviews, list) {
  for (const freshness of Object.keys(reviews)) {
    for (const entry of Object.values(reviews[freshness])) {
      // entry[0] is the word itself
      if (checkIfStopWord(entry[0])) {
        continue
      }
      addWordToList(list[freshness], entry)
    }
  }
}
/**
 * Tells whether a word appears in the imported `stopWords` collection.
 *
 * @param {string} word - word to look up.
 * @returns {boolean} result of `stopWords.includes(word)`.
 */
function checkIfStopWord(word) {
  const isListed = stopWords.includes(word)
  return isListed
}
/**
 * Appends `word` to the end of `list` (in place).
 *
 * @param {Array} list - array to extend.
 * @param {*} word - entry to append.
 */
function addWordToList(list, word) {
  list[list.length] = word
}
// Comparator for [word, occurrences] entries: higher counts come first.
const sortOccurances = (left, right) => {
  return right[1] - left[1]
}

/**
 * Sorts entries by occurrences, most used first.
 *
 * @param {Array} reviews - [word, occurrences] entries; sorted in place.
 * @returns {Array} the same array instance, now sorted.
 */
function sortReviews(reviews) {
  reviews.sort(sortOccurances)
  return reviews
}
/**
 * Keeps the first `quantity` entries of each already-sorted list.
 *
 * @param {Array} sortedFresh - fresh entries, most used first.
 * @param {Array} sortedRotten - rotten entries, most used first.
 * @param {number} quantity - how many entries to keep per list.
 * @returns {{fresh: Array, rotten: Array}} new arrays; the inputs are untouched.
 */
function getMostPopularWords(sortedFresh, sortedRotten, quantity) {
  const topOf = entries => entries.slice(0, quantity)
  return {
    fresh: topOf(sortedFresh),
    rotten: topOf(sortedRotten)
  }
}
/**
 * Writes `result` to `fileName` asynchronously.
 *
 * A write error is rethrown from the callback; on success a confirmation is
 * logged.
 *
 * @param {string} fileName - path of the file to write.
 * @param {string} result - content to write.
 */
function writeResultToFile(fileName, result) {
  const onWritten = err => {
    if (err) {
      throw err
    }
    console.log('The file has been saved!')
  }
  fs.writeFile(fileName, result, onWritten)
}
// things that need to be done:
// tweak the stop words: need to conditionally remove the names of directors and actors
// currently not getting the last page of reviews, which also means no results on movies that have only one page of reviews
// add CLI functionality
// pass movie name into function