Skip to content

Commit

Permalink
Merge pull request #2 from batflyer/reviews
Browse files Browse the repository at this point in the history
ReviewScraper and Major Reorganization
  • Loading branch information
hayesall committed Apr 24, 2018
2 parents bcb2804 + bc5c118 commit 2ca4e23
Show file tree
Hide file tree
Showing 4 changed files with 291 additions and 131 deletions.
136 changes: 5 additions & 131 deletions scraper/FanfictionScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

from __future__ import print_function
from __future__ import division

from bs4 import BeautifulSoup as bs

Expand All @@ -25,6 +26,8 @@
import requests
import time

from ReviewScraper import ReviewScraper

__author__ = 'Alexander L. Hayes (@batflyer)'
__copyright__ = 'Copyright (c) 2018 Alexander L. Hayes'
__license__ = 'Apache'
Expand All @@ -33,18 +36,6 @@
__email__ = 'alexander@batflyer.net'
__status__ = 'Prototype'

def ReviewScraper(storyid, rate_limit=3):
    """
    Placeholder for the review-scraping routine.

    @method ReviewScraper
    @param {str} storyid identifier of the story whose reviews to fetch
    @param {int} rate_limit delay between requests, in seconds
    @return {} not implemented yet; always returns None
    """
    return None

def FanfictionScraper(storyid, rate_limit=3):
"""
Scrapes data from a story on FanFiction.Net
Expand Down Expand Up @@ -118,124 +109,7 @@ def FanfictionScraper(storyid, rate_limit=3):
#print(soup.prettify())
return story

def ReviewScraper(storyid, rate_limit=3):
    """
    Stub review scraper; currently only honors the rate limit.

    @method ReviewScraper
    @param {str} storyid identifier of the target story
    @param {int} rate_limit seconds to sleep before (eventually) requesting
    @return {} nothing yet; the scraping logic is unimplemented
    """
    # Throttle so FanFiction.Net is not hit by rapid repeated calls.
    time.sleep(rate_limit)
    return None

def PredicateLogicBuilder(type, id, value):
    """
    Converts inputs into (id, value) pairs, creating positive examples
    and facts in predicate-logic format.

    @method PredicateLogicBuilder
    @param {str} type name of the predicate
    @param {str} id identifier attribute
    @param {str} value value of the identifier; a falsy value ('' or
                       None) emits a unary predicate instead of a binary one
    @return {str} ret string of the form 'A("B","C").' or 'A("B").'

    Example:
    >>> f = PredicateLogicBuilder('author', '123', '456')
    >>> print(f)
    author("123","456").
    """
    # NOTE: `type` and `id` shadow Python builtins, but the parameter names
    # are part of the public interface, so they are kept for compatibility.

    # Whitespace is stripped so identifiers remain single logical tokens.
    id = id.replace(' ', '')

    if value:
        return '{0}("{1}","{2}").'.format(type, id, value.replace(' ', ''))
    return '{0}("{1}").'.format(type, id)

def ImportStoryIDs(path_to_file):
    """
    Reads FanFiction.Net story-ids from a file, one story-id per line,
    and returns them as a list of strings.

    @method ImportStoryIDs
    @param {str} path_to_file path to the sid file
    @return {list} list of story-id strings

    Example:
    $ cat sids.txt
    123
    344
    $ python
    >>> import FanfictionScraper as fs
    >>> sids = fs.ImportStoryIDs('sids.txt')
    >>> sids
    ['123', '344']
    """
    with open(path_to_file) as sid_file:
        contents = sid_file.read()
    # splitlines() drops the trailing newline, so no empty final entry.
    return contents.splitlines()

# CLI entry point: scrape a single story (-s) or every sid in a file (-f),
# then emit predicate-logic facts for author/rating/genre.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='''Scraper for FanFiction.Net.''',
        epilog='''Copyright (c) 2018 Alexander L. Hayes. Distributed under the
        terms of the Apache 2.0 License. A full copy of the license is
        available at the base of this repository.'''
    )

    # -s and -f are mutually exclusive: scrape one story or a batch, not both.
    mode = parser.add_mutually_exclusive_group()

    mode.add_argument('-s', '--sid', type=str,
        help='Scrape a single story.')
    mode.add_argument('-f', '--file', type=str,
        help='Scrape all sids contained in a file.')

    args = parser.parse_args()

    if args.sid:
        # Scrape the contents of a single file from FanFiction.Net
        story = FanfictionScraper(args.sid)

        # Convert the scraped metadata into predicate-logic facts.
        predicates = []
        predicates.append(PredicateLogicBuilder('author', story['aid'], story['sid']))
        predicates.append(PredicateLogicBuilder('rating', story['sid'], story['rating']))
        predicates.append(PredicateLogicBuilder('genre', story['sid'], story['genre']))

        # Single-story mode prints the facts to stdout.
        for p in predicates:
            print(p)

    elif args.file:
        # Import the sids from the file and scrape each of them.

        sids = ImportStoryIDs(args.file)

        # Values for the progress bar.
        number_of_sids = len(sids)
        counter = 0

        for sid in sids:

            # Helpful progress bar
            progress(counter, number_of_sids, status='Currently on: {0}'.format(sid))
            counter += 1

            story = FanfictionScraper(sid)

            predicates = []
            predicates.append(PredicateLogicBuilder('author', story['aid'], story['sid']))
            predicates.append(PredicateLogicBuilder('rating', story['sid'], story['rating']))
            predicates.append(PredicateLogicBuilder('genre', story['sid'], story['genre']))

            # Batch mode appends the facts to facts.txt instead of printing.
            with open('facts.txt', 'a') as f:
                for p in predicates:
                    f.write(p + '\n')
    # NOTE(review): this raise executes unconditionally after the if/elif
    # chain, so the script always ends with an exception and the exit(1)
    # below is unreachable. This looks like an artifact of viewing deleted
    # and added diff lines together -- confirm the intended final state.
    raise(Exception('No main class in FanfictionScraper.py'))
    exit(1)
95 changes: 95 additions & 0 deletions scraper/ReviewScraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@

# Copyright (c) 2018 Alexander L. Hayes (@batflyer)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
from __future__ import division

from bs4 import BeautifulSoup as bs

import requests
import time

__author__ = 'Alexander L. Hayes (@batflyer)'
__copyright__ = 'Copyright (c) 2018 Alexander L. Hayes'
__license__ = 'Apache'
__version__ = '0.0.1'
__maintainer__ = __author__
__email__ = 'alexander@batflyer.net'
__status__ = 'Prototype'

def ReviewScraper(storyid, reviews_num, rate_limit=3):
    """
    Scrapes the reviews for a certain story.

    @method ReviewScraper
    @param {str} storyid The id for a particular story.
    @param {int} reviews_num The number of reviews in metadata
    @param {int} rate_limit rate limit (in seconds)
    @return {} currently prints reviewer ids and review text; returns None

    Discussion:
        * Reviews on FanFiction.Net may either be anonymous or tied to the
          user who left the review.
        * Reviews for a story are located at address:
              https://www.fanfiction.net/r/[sid]/0/1/
        * /0/ represents all reviews for a story, /1/ is a page, and there
          are up to 15 reviews per page.
        * Incrementing the 0 gives the reviews for a particular chapter.

    Page Layout:
        * Reviews are stored in an html table of up to 15 elements.
        * A review may be thought of as a 4-tuple:
              (userid, chapter_reviewed, date, review_text)
    """

    # With up to 15 reviews per page, the page count is the ceiling of
    # reviews_num / 15. The previous formula `(reviews_num // 15) + 1`
    # requested one page too many whenever reviews_num was an exact
    # multiple of 15. At least one page is always fetched.
    number_of_pages = max(1, -(-reviews_num // 15))

    for page in range(1, number_of_pages + 1):

        # Rate limit so FanFiction.Net is not flooded with requests.
        time.sleep(rate_limit)

        r = requests.get('https://www.fanfiction.net/r/' + storyid + '/0/' + str(page) + '/')
        soup = bs(r.text, 'html.parser')

        # Get the tbody, which is where the review table is stored.
        table = soup.find('tbody')
        if table is None:
            # No review table on this page (e.g. the story has no reviews);
            # skip rather than crash calling .find_all on None.
            continue

        # Loop over the table entries (td)
        for review in table.find_all('td'):

            # Reviews link to the profile of the user who reviewed it,
            # via an anchor whose href contains /u/[userid]/.
            for link in review.find_all('a', href=True):

                if '/u/' in str(link):
                    print(str(link).split('"')[1].split('/')[2])
                    print()

            print(review.text)
            time.sleep(0.5)

        print(page)

# This module is a library; running it directly is an error.
if __name__ == '__main__':

    # The exit(1) that previously followed this raise was unreachable
    # (the exception already terminates the process), so it was removed.
    raise Exception('No main class in ReviewScraper.py')
103 changes: 103 additions & 0 deletions scraper/Scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@

# Copyright (c) 2018 Alexander L. Hayes (@batflyer)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
from __future__ import division

from bs4 import BeautifulSoup as bs

# progress.py is used under the terms of the MIT license
from progress import progress

import argparse
import re
import requests
import time

from FanfictionScraper import FanfictionScraper
from ReviewScraper import ReviewScraper
from Utils import *

__author__ = 'Alexander L. Hayes (@batflyer)'
__copyright__ = 'Copyright (c) 2018 Alexander L. Hayes'
__license__ = 'Apache'
__version__ = '0.0.1'
__maintainer__ = __author__
__email__ = 'alexander@batflyer.net'
__status__ = 'Prototype'

# CLI entry point for the scraper package: -s scrapes one story, -f scrapes
# every sid listed in a file, -r scrapes the reviews of one story.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='''Scraper for FanFiction.Net.''',
        epilog='''Copyright (c) 2018 Alexander L. Hayes. Distributed under the
        terms of the Apache 2.0 License. A full copy of the license is
        available at the base of this repository.'''
    )

    # The three modes are mutually exclusive; at most one may be given.
    mode = parser.add_mutually_exclusive_group()

    mode.add_argument('-s', '--sid', type=str,
        help='Scrape a single story.')
    mode.add_argument('-f', '--file', type=str,
        help='Scrape all sids contained in a file.')
    mode.add_argument('-r', '--review', type=str,
        help='Scrape the reviews for a particular story.')

    args = parser.parse_args()

    if args.sid:
        # Scrape the contents of a single file from FanFiction.Net
        story = FanfictionScraper(args.sid)

        # Convert the scraped metadata into predicate-logic facts.
        predicates = []
        predicates.append(PredicateLogicBuilder('author', story['aid'], story['sid']))
        predicates.append(PredicateLogicBuilder('rating', story['sid'], story['rating']))
        predicates.append(PredicateLogicBuilder('genre', story['sid'], story['genre']))

        # Single-story mode prints the facts to stdout.
        for p in predicates:
            print(p)

    elif args.review:
        # !!! In progress
        # NOTE(review): the review count is hard-coded to 16 here; it should
        # presumably come from the story's metadata -- confirm before release.
        ReviewScraper(args.review, 16)
        exit()

    elif args.file:
        # Import the sids from the file and scrape each of them.

        sids = ImportStoryIDs(args.file)

        # Values for the progress bar.
        number_of_sids = len(sids)
        counter = 0

        for sid in sids:

            # Helpful progress bar
            progress(counter, number_of_sids, status='Currently on: {0}'.format(sid))
            counter += 1

            story = FanfictionScraper(sid)

            predicates = []
            predicates.append(PredicateLogicBuilder('author', story['aid'], story['sid']))
            predicates.append(PredicateLogicBuilder('rating', story['sid'], story['rating']))
            predicates.append(PredicateLogicBuilder('genre', story['sid'], story['genre']))

            # Batch mode appends the facts to facts.txt instead of printing.
            with open('facts.txt', 'a') as f:
                for p in predicates:
                    f.write(p + '\n')
Loading

0 comments on commit 2ca4e23

Please sign in to comment.