In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Show Python version
import platform
platform.python_version()

'3.8.8'

In [2]:
import scrapy
from scrapy.crawler import CrawlerProcess
import logging

In [3]:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

In [4]:
from scrapy import Item, Field

In [5]:
class comment(Item):
    comment_author = Field()
    comment_id = Field()
    timestamp = Field() 
    comment_text = Field()
    num_likes = Field() # nullable -> default value is 0
    parent_id = Field() # nullable -> default value should be "N/A"+
    post_num = Field() # the post idea of the parent -> kind of important

In [6]:
class book_chapter(Item):
    book = Field()
    chapt_title = Field()
    published = Field()
    updated = Field() # this one can be empty -> makes sense to me
    chapt_author = Field()
    chapt_text = Field()
    num_comments = Field()
    num_likes = Field()
    post_num = Field()
    liked_by = Field()

In [7]:
# pipelines!
from scrapy.utils.serialize import ScrapyJSONEncoder # this imports a utility that enables us to serialize the item

from unidecode import unidecode

from itemloaders.processors import Compose, Join, MapCompose

import shortuuid

class chapter_proc_pipeline(object):
    
    def process_item(self, item, spider):
        
        # first to check the type of item we're working with here
        if not isinstance(item, book_chapter):
            return item # breaking if it's not a book_chapter
        
        standard_proc = Compose(str.strip, lambda v: unidecode(v), stop_on_none=False) # standard proc
        
        text_proc = Compose(Join(), str.strip, lambda v: unidecode(v), stop_on_none=False) # text proc
        
        likes_proc = Compose(lambda v: int(v) if v else 0, stop_on_none=False) # processes the number of likes
        
        num_comment_proc = Compose(lambda v: v.split()[0], int, stop_on_none=False) # proc for num comments
        
        liked_by_proc = MapCompose(str.strip, lambda v: unidecode(v), stop_on_none=False) # stand proc applied to each item
              

        # applying my functions
        item["book"] = standard_proc(item["book"]) # book (number/title)
        
        item["chapt_title"] = standard_proc(item["chapt_title"]) # chapter title
        
        item["chapt_author"] = standard_proc(item["chapt_author"]) # chapter author
        
        item["chapt_text"] = text_proc(item["chapt_text"]) # chapter text joined and cleaned
        
        item["num_comments"] = num_comment_proc(item["num_comments"]) # number of comments
        
        item["num_likes"] = likes_proc(item["num_likes"]) # number of likes
        
        item["liked_by"] = liked_by_proc(item["liked_by"]) # liked by
        
        item["post_num"] = standard_proc(item["post_num"]) # minimal proc on post num
               
        # no proc for published & updated b/c datetime format;
        
        return item 
        
# the comment proc pipeline
class comment_proc_pipeline(object):
       
    def process_item(self, item, spider):
        
        # checking if it's not a comment
        if not isinstance(item, comment):
            return item # just breaking this function & not processing the item if it doesn't belong
        
        # functions to process everything
        author_proc = Compose(str.strip, lambda v: unidecode(v), stop_on_none=False) # author proc

        # example of my text cleaning step
        text_proc = Compose(Join(), str.strip, lambda v: unidecode(v), stop_on_none=False) # comment text proc

        # example of dealing with the comment id
        comment_proc = Compose(lambda v: v.split("-")[1], str.strip, stop_on_none=False) # comment id proc

        likes_proc = Compose(lambda v: v[0].split()[0] if v else "0", int, stop_on_none=False) # likes proc; if none, 0

        parent_proc = Compose(lambda v: comment_proc(v) if v else None, stop_on_none=False) # parent id; else None
        
        # applying the functions we just made
        item["comment_author"] = author_proc(item["comment_author"]) # cleaning up the author (to be safe)
        
        item["comment_id"] =comment_proc(item["comment_id"]) # cleanning up comment_id
       
        item["comment_text"] =  text_proc(item["comment_text"]) # cleaning up the comment text
    
        item["timestamp"] = item["timestamp"] # timestamp doesn't need cleaning (it's totally fine :))
                
        item["num_likes"] = likes_proc(item["num_likes"]) # cleaning up the number of likes

        item["parent_id"] = parent_proc(item["parent_id"]) # cleaning up the parent id
        
        return item 
    
class user_privacy_proc(object):
    
    # this is meant to provide anonymization for the authors of comments as well as users who liked a post
    def process_item(self, item, spider):
            
        # starting by making a function to create a UUID
        def make_uuid(uname):
            
            uuid = shortuuid.uuid() # setting a uuid first
        
            # checking if the uuid is in the mapping alread -> unlikely but a good check
            while uuid in spider.user_mapping.values():
                uuid = shortuuid.uuid() # making a new uuid until it isn't in there -> hopefully won't take too long :(
            
            spider.user_mapping.setdefault(uname, uuid) # setting the uuid mapping
            return uuid # returning the uuid
        
        # function to check a uuid
        def check_uuid(uname):
            
            # first, check if a uname is in the mapping dict
            if uname in spider.user_mapping:
                return spider.user_mapping[uname] # returning that uuid
            
            else: # if it's not, we're going to make a mapping for it & return that new mapping
                return make_uuid(uname)
            
        # if it's a comment
        if isinstance(item, comment):
            
            # first check is if it's the author of the book
            if item["comment_author"] == "erraticerrata": # comment uname is lowercased
                    return item # no proc needed if book author
        
            else: # if not, anonymize it
                # first, we're getting the uuid
                user_id = check_uuid(item["comment_author"])

                # then we're replacing the uname with the uuid
                item["comment_author"] = user_id
                
                return item
        
        # else it's a book_chapter
        # here, we go through liked by
        else:
            unames = item["liked_by"]
            
            # looping through each uname and replacing it with a uuid
            for idx, uname in enumerate(unames):
                unames[idx] = check_uuid(uname)
            
            # setting the liked_by to the unames list
            item["liked_by"] = unames
            
            # returning the item
            return item

In [8]:
class book_spider(scrapy.Spider):
    name = "book scaper"
    start_urls = ["https://practicalguidetoevil.wordpress.com/table-of-contents/", 
                  "https://practicalguidetoevil.wordpress.com/extra-chapters/"] # the starting URLs we're working from
    
    # setting custom settings here
    custom_settings = {
        # first setting my feeds
        'FEEDS' : { # setting the feed settings here 
            'book_chapters.jl': {
                'format': 'jsonlines', 
                'overwrite': True,
                'item_classes': [book_chapter], 
                'encoding': 'utf8', 
                'indent' : 4},

                'comments.jl': {
                'format': 'jsonlines', 
                'overwrite': True,
                'item_classes': [comment],
                'encoding': 'utf8' ,
                'indent' : 4}
                },
        'LOG_LEVEL': logging.WARNING, # sets a min level for info to be written
        'ITEM_PIPELINES': {"__main__.chapter_proc_pipeline": 1,
                           "__main__.comment_proc_pipeline": 2,
                          "__main__.user_privacy_proc" : 3}, # setting pipeline order
        #  "DOWNLOAD_DELAY": .25, # the download delay -> here it's the default value for clarity
        "AUTOTHROTTLE_ENABLED " : True, # the autothrottle being enables
        "AUTOTHROTTLE_START_DELAY" : 1, # the starting delay for the autothrottle
        "AUTOTHROTTLE_MAX_DELAY": 60, # the maximum value for the autothrottle -> up to 1min seems fair
        'RETRY_HTTP_CODES' : [500, 502, 503, 504, 522, 524, 408, 429 , 404], # making a list of codes we'll retry on
        'RETRY_TIMES': 5, # max retry times for a page; don't know that we'll need this here
    }
     
    
    user_mapping = dict() # creating an attribute to hold the user mapping for anonymization
        
    # creating the parsing method -> outer page
    def parse(self, response):
        
        
        # checking if it's the extra Chapters
        if response.request.url == "https://practicalguidetoevil.wordpress.com/extra-chapters/":
            book = response.css("h1.entry-title::text").get()
            chapters = response.css("div.entry-content > ul > li > a::attr('href')").getall() # making a list of href chapters

            for chapter in chapters:
                chapt_info = book_chapter() # instantiating the book_chapter object
                chapt_info["book"] = book # the book is the book

                # calling the secondary parse method here
                yield scrapy.Request(chapter, callback = self.parse_chapt,
                                        cb_kwargs={"book_title": book}) # passing through the data
    
        else:
            books = response.css("div.entry-content > h2::text").getall() # making a list of the book titles
            chapters = response.css("div.entry-content > ul") # making a list of uls

            bc_zipped = tuple(zip(books, chapters)) # zipping those together

            # looping through each of those in sync -> best way I'd say

            for pair in bc_zipped:

                book_title = pair[0] # setting the book title
                book_chapters = pair[1].css("li > a::attr('href')").getall() # pulling the hrefs -> list of them

                for chapter_url in book_chapters: # for each url in each ul          

                    # calling the secondary parse method here
                    yield scrapy.Request(chapter_url, callback = self.parse_chapt,
                                        cb_kwargs={"book_title": book_title}) # passing through the booktitle
            
    # creating the chapter parsing methods -> inner pages
    def parse_chapt(self, response, book_title):
        
        chapt_info = book_chapter() # instantiating a book chapter
        
        chapt_info["book"] = book_title # putting in the book title we pased through
        
        # pulling chapter title
        chapt_info["chapt_title"] = response.css("header.entry-header > h1::text").get() 
        
        # posted on datetime
        chapt_info["published"] = response.css("header > div > span.posted-on > a> time.entry-date.published::attr('datetime')").get() 
        
        # updated on datetime
        chapt_info["updated"] = response.css("header > div > span.posted-on > a > time.updated::attr('datetime')").get()
        
        # byline = author
        chapt_info["chapt_author"] = response.css("header > div > span.byline > span.author.vcard > a::text").get()
        
        # pulls the list of paragraphs that'll need to be joined
        chapt_info["chapt_text"] = response.css("div.entry-content > p ::text").getall()
                
            
        # pulls the number of comments
        chapt_info["num_comments"] = response.css("h2.comments-title::text").get()
        
        # finding the post number
        chapt_info["post_num"] = response.css("input[name='comment_post_ID']::attr('value')").get()
        
        # hmmm...it looks like my approach *should* be working here though
        
        # finding the url
        likes_info = "https://public-api.wordpress.com/rest/v1/batch?http_envelope=1&urls[]=/me&urls[]=/sites/87445915/posts/{}/likes&urls[]=/sites/87445915/posts/{}/reblogs/mine".format(chapt_info["post_num"],chapt_info["post_num"])
        
        # yielding a new request to pull the likes info
        yield scrapy.Request(likes_info, callback = self.parse_json_likes,
                            cb_kwargs={"chapt_info":chapt_info}) # passing through the chapter info
        
        
        # next we process the comment informaiton -> it's all on the chapter page
        
        # looping through each comment & parsing them
        for article in response.css("article.comment"):
            
            comment_item = comment() # instantiating a comment item
            
            # pulling comment author
            comment_item["comment_author"] = article.css("article > footer > div > cite ::text").get() 

            # pulling the id
            comment_item["comment_id"] = article.css("article::attr('id')").get()
                        
            # pulling the timestamp
            comment_item["timestamp"] = article.css("article > footer > div > a > time::attr('datetime')").get()
            
            # pulling the text
            comment_item["comment_text"] = article.css("article > div > p::text").getall()
            
            # pulling the number of likes
            comment_item["num_likes"] = article.css("article > div > p > span> a::text").get()
            
            
            # pulling the parent id -> may or may not exist
            comment_item["parent_id"] = article.xpath("../parent::ul/preceding-sibling::article/@id").get()
            
            comment_item["post_num"] = chapt_info["post_num"]
            
            yield (comment_item) # returning the comment
        
    # parsing the json which has the like information we'll be using
    def parse_json_likes(self, response, chapt_info):
        
        chapt_info["num_likes"] = response.json()["body"]["/sites/87445915/posts/{}/likes".format(chapt_info["post_num"])]["found"]
    
        # the set of users is found here
        lu = response.json()["body"]["/sites/87445915/posts/{}/likes".format(chapt_info["post_num"])]["likes"]        
        
        chapt_info["liked_by"] = {item["nice_name"] for item in lu} # set comp to pull unames
        # remember to mention uncertainty around mapping comment unames to this set!!!
            
        yield chapt_info

In [9]:
%%time
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(book_spider)
process.start()

2022-10-03 16:20:37 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2022-10-03 16:20:37 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.4.0, Python 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.4.7, Platform Windows-10-10.0.19041-SP0
2022-10-03 16:20:37 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_START_DELAY': 1,
 'LOG_LEVEL': 30,
 'RETRY_HTTP_CODES': [500, 502, 503, 504, 522, 524, 408, 429, 404],
 'RETRY_TIMES': 5,
 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


CPU times: total: 1min 21s
Wall time: 1min 23s
