-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from batflyer/reviews
ReviewScraper and Major Reorganization
- Loading branch information
Showing
4 changed files
with
291 additions
and
131 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
|
||
# Copyright (c) 2018 Alexander L. Hayes (@batflyer) | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from __future__ import print_function | ||
from __future__ import division | ||
|
||
from bs4 import BeautifulSoup as bs | ||
|
||
import requests | ||
import time | ||
|
||
__author__ = 'Alexander L. Hayes (@batflyer)' | ||
__copyright__ = 'Copyright (c) 2018 Alexander L. Hayes' | ||
__license__ = 'Apache' | ||
__version__ = '0.0.1' | ||
__maintainer__ = __author__ | ||
__email__ = 'alexander@batflyer.net' | ||
__status__ = 'Prototype' | ||
|
||
def ReviewScraper(storyid, reviews_num, rate_limit=3):
    """
    Scrapes the reviews for a certain story and prints them to stdout.

    @method ReviewScraper
    @param  {str}   storyid      The id for a particular story.
    @param  {int}   reviews_num  The number of reviews in metadata.
    @param  {int}   rate_limit   Rate limit (in seconds) between page fetches.
    @return {}

    Discussion:
        * Reviews on FanFiction.Net may either be anonymous or tied to the
          user who left the review.
        * Reviews for a story are located at address:
              https://www.fanfiction.net/r/[sid]/0/1/
        * /0/ represents all reviews for a story, /1/ is a page, and there
          are up to 15 reviews per page.
        * Incrementing the 0 gives the reviews for a particular chapter.

    Page Layout:
        * Reviews are stored in an html table of up to 15 elements.
        * A review may be thought of as a 4-tuple:
              (userid, chapter_reviewed, date, review_text)
    """

    # Up to 15 reviews fit on one page, so the page count is the ceiling of
    # reviews_num / 15. The previous formula `(reviews_num // 15) + 1`
    # requested one extra, empty page whenever reviews_num was an exact
    # multiple of 15 (and one page even when reviews_num was 0).
    number_of_pages = -(-reviews_num // 15)

    for p in range(number_of_pages):

        # Rate limit so we do not hammer FanFiction.Net.
        time.sleep(rate_limit)

        r = requests.get(
            'https://www.fanfiction.net/r/' + storyid + '/0/' + str(p + 1) + '/')
        soup = bs(r.text, 'html.parser')

        # The review table is stored inside the page's <tbody>.
        t = soup.find('tbody')
        if t is None:
            # No review table on this page (e.g. story removed or page
            # layout changed); skip rather than raise AttributeError.
            continue

        # Loop over the table entries (td).
        for review in t.find_all('td'):

            # Reviews link to the profile of the user who reviewed it;
            # anonymous reviews carry no /u/ profile link.
            for link in review.find_all('a', href=True):
                if '/u/' in str(link):
                    # href has the form /u/<userid>/<username>; print the id.
                    print(str(link).split('"')[1].split('/')[2])
                    print()

            print(review.text)
            time.sleep(0.5)

        print(p + 1)
|
||
if __name__ == '__main__':
    # This module is a library, not an entry point: fail loudly if executed
    # directly. The previous version had an unreachable `exit(1)` after the
    # raise; it is removed here.
    raise Exception('No main class in ReviewScraper.py')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
|
||
# Copyright (c) 2018 Alexander L. Hayes (@batflyer) | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from __future__ import print_function | ||
from __future__ import division | ||
|
||
from bs4 import BeautifulSoup as bs | ||
|
||
# progress.py is used under the terms of the MIT license | ||
from progress import progress | ||
|
||
import argparse | ||
import re | ||
import requests | ||
import time | ||
|
||
from FanfictionScraper import FanfictionScraper | ||
from ReviewScraper import ReviewScraper | ||
from Utils import * | ||
|
||
__author__ = 'Alexander L. Hayes (@batflyer)' | ||
__copyright__ = 'Copyright (c) 2018 Alexander L. Hayes' | ||
__license__ = 'Apache' | ||
__version__ = '0.0.1' | ||
__maintainer__ = __author__ | ||
__email__ = 'alexander@batflyer.net' | ||
__status__ = 'Prototype' | ||
|
||
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='''Scraper for FanFiction.Net.''',
        epilog='''Copyright (c) 2018 Alexander L. Hayes. Distributed under the
        terms of the Apache 2.0 License. A full copy of the license is
        available at the base of this repository.'''
    )

    # The three scraping modes are mutually exclusive: one story, a file of
    # story ids, or the reviews for one story.
    mode = parser.add_mutually_exclusive_group()

    mode.add_argument('-s', '--sid', type=str,
        help='Scrape a single story.')
    mode.add_argument('-f', '--file', type=str,
        help='Scrape all sids contained in a file.')
    mode.add_argument('-r', '--review', type=str,
        help='Scrape the reviews for a particular story.')

    args = parser.parse_args()

    if args.sid:
        # Scrape the contents of a single story from FanFiction.Net and
        # print its predicate-logic facts to stdout.
        story = FanfictionScraper(args.sid)

        predicates = []
        predicates.append(PredicateLogicBuilder('author', story['aid'], story['sid']))
        predicates.append(PredicateLogicBuilder('rating', story['sid'], story['rating']))
        predicates.append(PredicateLogicBuilder('genre', story['sid'], story['genre']))

        for p in predicates:
            print(p)

    elif args.review:
        # !!! In progress: the review count is hard-coded to 16 while the
        # feature is being prototyped.
        ReviewScraper(args.review, 16)
        exit()

    elif args.file:
        # Import the sids from the file and scrape each of them, appending
        # the generated facts to facts.txt.
        sids = ImportStoryIDs(args.file)

        # Values for the progress bar.
        number_of_sids = len(sids)
        counter = 0

        for sid in sids:

            # Helpful progress bar
            progress(counter, number_of_sids, status='Currently on: {0}'.format(sid))
            counter += 1

            story = FanfictionScraper(sid)

            predicates = []
            predicates.append(PredicateLogicBuilder('author', story['aid'], story['sid']))
            predicates.append(PredicateLogicBuilder('rating', story['sid'], story['rating']))
            predicates.append(PredicateLogicBuilder('genre', story['sid'], story['genre']))

            with open('facts.txt', 'a') as f:
                for p in predicates:
                    f.write(p + '\n')

    else:
        # No mode was selected. Previously the script exited silently,
        # which looked like a no-op; show usage instead.
        parser.print_help()
Oops, something went wrong.