<h3>Data Scraping</h3>

In [1]:
#import basic libraries
import pandas as pd
import re
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ImageItem(scrapy.Item):
    '''
    Scrapy item holding everything collected for a single image post.

    Fields are grouped into: links, image attributes, image stats,
    artist details, and the local path filled in after download.
    Every field is a plain scrapy.Field() with no extra metadata.
    '''

    #direct link to image file for downloading via ImagePipeline
    #(the spider stores this as a list of URLs)
    image_urls = scrapy.Field()
    
    #link to specific image page for scraping more stats
    image_links = scrapy.Field()
    
    #image attributes
    titles = scrapy.Field() #image title
    date_posted = scrapy.Field() #date posted
    hashtags = scrapy.Field() #hashtags
    
    #image stats
    views = scrapy.Field() #number of views of the current image
    faves = scrapy.Field() #number of faves of the current image
    comments = scrapy.Field() #number of comments of the current image
    downloads = scrapy.Field() #number of downloads of the current image
    
    #artist details
    artists = scrapy.Field() #artist's name
    artist_urls = scrapy.Field() #link to the artist's account
    artist_deviations = scrapy.Field() #number of deviations (images) posted
    artist_comments = scrapy.Field() #number of total comments received
    artist_page_views = scrapy.Field() #number of total page views received
    artist_scraps = scrapy.Field() #number of scraps (WIPs or archived art)
    artist_watchers = scrapy.Field() #number of watchers (followers)
    artist_critiques = scrapy.Field() #number of critiques given
    artist_forum_posts = scrapy.Field() #number of forum posts made
    artist_faves = scrapy.Field() #number of total faves received
    artist_asl = scrapy.Field() #age, sex, location
    artist_dob = scrapy.Field() #date of birth
    account_age = scrapy.Field() #how old the account is
    
    # to be filled in by ImagePipeline
    image_paths = scrapy.Field() #location of image in local storage
    
    
class MyImagesPipeline(ImagesPipeline):
    '''
    Image pipeline for downloading images.

    Requests every usable URL listed in an item's 'image_urls' field and
    stores the resulting file paths in the item's 'image_paths' field.
    '''
    def get_media_requests(self, item, info):
        '''
        Yield one download request per usable URL in item['image_urls'].

        An unset field, or entries that are None/empty (e.g. when the image
        selector matched nothing), are skipped instead of being passed to
        scrapy.Request, which would raise on a non-string URL. An item left
        with no requests is then rejected by item_completed.
        '''
        for image_url in item.get('image_urls') or []:
            if image_url:
                yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        '''
        Record the paths of the successfully downloaded images on the item.

        results is iterated as (ok, data) pairs; data['path'] is read only
        for pairs whose ok flag is truthy.

        Raises:
            DropItem: if no image was downloaded successfully.
        '''
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

In [3]:
class ImageSpider(scrapy.Spider):
    '''
    Spider that walks the DeviantArt "popular-all-time" results for the
    query "sherlock", follows each image post, and then the artist's page.

    Callback chain:
        parse (results page) -> parse_image (image post)
                             -> parse_artist (artist page) -> item
    '''

    name = 'images'

    start_urls = ['https://www.deviantart.com/popular-all-time/?q=sherlock&offset=0']

    #initialize offset at 0
    offset = 0
    #set offset limit to control the amount of images downloaded
    offset_limit = 16000

    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {'__main__.MyImagesPipeline': 1}, #enable image download
        'IMAGES_STORE': 'DA-images-2', #store images in DA-images-2 folder
        'FEED_FORMAT':'json',
        'FEED_URI': 'image-data-2.json', #store image data in image-data-2.json
        'DOWNLOAD_FAIL_ON_DATALOSS': False, #if image download fails (due to various issues), don't send error message, just flag it.
        'DOWNLOAD_DELAY': 0.25 #250 ms download delay, with inbuilt scrapy randomization
    }

    def parse(self, response):
        '''
        Handle one results page: request every image post on it and queue
        the remaining results pages.
        '''
        #each thumbnail span on the results page stands for one post
        for img in response.css('div.page-results span.thumb'):

            #thumbnail link
            thumbnail = img.css('::attr(data-super-img)').get()

            #full link that leads to the individual image post
            img_link = img.css('::attr(href)').get()

            #if there is a thumbnail, aka the post is an image, follow url to scrape image details
            #(a post without an href cannot be followed, so it is skipped too)
            if thumbnail and img_link:
                yield scrapy.Request(response.urljoin(img_link), callback=self.parse_image)

        #go to next page
        #self.offset is shared spider state, so this loop queues every remaining
        #results page the first time parse runs; on later calls the condition is
        #already false and nothing more is queued.
        while self.offset < self.offset_limit:
            self.offset += 24 #DA's natural offset scroll is set at increments of 24
            next_page = f'https://www.deviantart.com/popular-all-time/?q=sherlock&offset={self.offset}'
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_image(self, response):
        '''
        Scrape one image post into an ImageItem.

        Yields a request for the artist's page (carrying the item in
        request.meta) when an artist link is present, otherwise yields the
        item directly. Posts without a downloadable image are skipped.
        '''
        #get image url (for downloading via ImagePipeline)
        img_src = response.css('div.dev-view-deviation img ::attr(src)').get()
        if not img_src:
            #without a file to download the item could only fail in the image
            #pipeline, so skip it before spending a request on the artist page
            self.logger.warning("No image source found on %s; skipping", response.url)
            return

        #initialize image item
        image = ImageItem()
        image["image_urls"] = [img_src]

        #get other image info; .get() stores None instead of raising when a node is missing
        image["image_links"] = response.url
        image['titles'] = response.xpath("//a[@class='title']/text()").get()
        image['date_posted'] = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-details']/dl/dd/span/text()").get()

        #check whether image has hashtags (some don't)
        hashtag = response.xpath("//div[@class='dev-about-tags-cc dev-about-breadcrumb']/a/text()").extract()
        if hashtag:
            image['hashtags'] = hashtag

        #get image stats
        stats = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-stats']/dl/dd/text()").extract()

        #keep only the digits of each entry (sometimes irregular data falls in)
        stats = [re.sub(r"\D", "", s) for s in stats]

        #remove entries that ended up empty
        stats = list(filter(None, stats))

        #the responses are ordered in: views, faves, comments, downloads
        #sometimes comments are disabled, sometimes downloads are disabled
        headers = ['views','faves','comments','downloads']

        #zip stops at the shorter sequence, so surplus values can never index past headers
        #NOTE(review): the mapping is positional - if comments are disabled but downloads
        #are not, the downloads value is stored under 'comments'. Confirm against the page
        #labels before relying on these two columns.
        for header, value in zip(headers, stats):
            image[header] = value

        #get artist info
        artist_names = response.xpath("//small[@class='author']/span/a/text()").extract()
        artist_links = response.xpath("//small[@class='author']/span/a/@href").extract()

        if not artist_names:
            #if no artist name (sometimes artists are banned), just yield the image
            image['artists'] = 'Banned'
            yield image
            return

        image['artists'] = artist_names[-1]

        if not artist_links:
            #name without a profile link: nothing further to scrape
            yield image
            return

        image['artist_urls'] = artist_links[-1]
        #dont_filter=True: several images share one artist page, and Scrapy's duplicate
        #filter would otherwise drop the repeated request - and with it the item in meta.
        yield scrapy.Request(
            image['artist_urls'],
            callback=self.parse_artist,
            meta={'image': image},
            dont_filter=True,
        )

    def parse_artist(self, response):
        '''
        Add the artist's account stats and personal details to the item
        passed in response.meta['image'], then return the finished item.
        '''
        #get image item for the higher-level parser
        image = response.meta['image']

        #get artist account stats
        artist_stats = response.xpath("//div[@id='super-secret-stats']/div/div/div/strong/text()").extract()

        headers = ['artist_deviations','artist_comments','artist_page_views','artist_scraps','artist_watchers','artist_critiques','artist_forum_posts','artist_faves']

        #positional mapping; zip ignores any values beyond the known headers
        for header, value in zip(headers, artist_stats):
            image[header] = value

        #get account age and membership details
        account_age = response.xpath("//a[@href='#super-secret-activity']/div/text()").get()

        #sometimes the age is wrapped up in a span
        if account_age is None:
            account_age = response.xpath("//a[@href='#super-secret-activity']/span/div/text()").get()

        #leave the field unset rather than failing the whole item when neither layout matches
        if account_age is not None:
            image['account_age'] = account_age

        #get artist personal details
        artist_details = response.xpath("//div[@id='super-secret-why']/div/div/div/dl/dd/text()").extract()
        details = ['artist_asl','artist_dob'] #some artists do not share their dobs
        for field, value in zip(details, artist_details):
            image[field] = value

        return image
            
#run the crawl from inside the notebook:
#create a crawler process, register ImageSpider with it, then start it
#NOTE(review): process.start() presumably blocks until the crawl ends and may not be
#re-runnable in the same kernel (Twisted reactor) - confirm before re-executing this cell
process = CrawlerProcess()
process.crawl(ImageSpider)
process.start()

2019-04-21 01:03:58 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-04-21 01:03:58 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-04-21 01:03:58 [scrapy.crawler] INFO: Overridden settings: {'DOWNLOAD_DELAY': 0.25, 'DOWNLOAD_FAIL_ON_DATALOSS': False, 'FEED_FORMAT': 'json', 'FEED_URI': 'image-data-2.json', 'LOG_LEVEL': 20}
2019-04-21 01:03:58 [scrapy.extensions.telnet] INFO: Telnet Password: 0de246c7e41b8d30
2019-04-21 01:03:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.ext

2019-04-21 01:40:58 [scrapy.extensions.logstats] INFO: Crawled 9622 pages (at 252 pages/min), scraped 2406 items (at 59 items/min)
2019-04-21 01:41:58 [scrapy.extensions.logstats] INFO: Crawled 9867 pages (at 245 pages/min), scraped 2453 items (at 47 items/min)
2019-04-21 01:42:58 [scrapy.extensions.logstats] INFO: Crawled 10117 pages (at 250 pages/min), scraped 2514 items (at 61 items/min)
2019-04-21 01:43:58 [scrapy.extensions.logstats] INFO: Crawled 10362 pages (at 245 pages/min), scraped 2567 items (at 53 items/min)
2019-04-21 01:44:58 [scrapy.extensions.logstats] INFO: Crawled 10626 pages (at 264 pages/min), scraped 2635 items (at 68 items/min)
2019-04-21 01:45:58 [scrapy.extensions.logstats] INFO: Crawled 10883 pages (at 257 pages/min), scraped 2696 items (at 61 items/min)
2019-04-21 01:46:58 [scrapy.extensions.logstats] INFO: Crawled 11130 pages (at 247 pages/min), scraped 2750 items (at 54 items/min)
2019-04-21 01:47:58 [scrapy.extensions.logstats] INFO: Crawled 11382 pages (at

2019-04-21 02:38:58 [scrapy.extensions.logstats] INFO: Crawled 23959 pages (at 223 pages/min), scraped 5440 items (at 32 items/min)
2019-04-21 02:39:51 [scrapy.pipelines.files] ERROR: File (unknown-error): Error processing file from <GET https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/0dc70551-b190-44e8-9b59-5e9378a30049/d2y7tid-cc06bf13-115d-41d9-b9ec-00c33c8cab98.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzBkYzcwNTUxLWIxOTAtNDRlOC05YjU5LTVlOTM3OGEzMDA0OVwvZDJ5N3RpZC1jYzA2YmYxMy0xMTVkLTQxZDktYjllYy0wMGMzM2M4Y2FiOTguanBnIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.3RRjMd6_q6cBQZh-YJCavU7n72FfQbHmg1KuoeJpdk8> referred in <None>
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/twisted/internet/defer.py", line 1386, in _inlineCallbacks
    result = g.send(result)
  File "/a

In [4]:
#load the scraped records from the JSON feed file named in the spider's FEED_URI setting
df = pd.read_json('image-data-2.json')
#preview the first rows only; rendering the whole frame bloats the notebook output
df.head(10)

Unnamed: 0,account_age,artist_asl,artist_comments,artist_critiques,artist_deviations,artist_dob,artist_faves,artist_forum_posts,artist_page_views,artist_scraps,...,comments,date_posted,downloads,faves,hashtags,image_links,image_paths,image_urls,titles,views
0,Deviant for 5 Years,Female/France,468,0.0,183,March 5,1830,0,10634,0,...,22.0,"January 17, 2014",92.0,142.0,,https://www.deviantart.com/get-sherlock/art/Sh...,[full/c11741fda71c47368b3ecc42de439de62c48cf30...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,1606.0
1,Deviant for 8 Years,Female/Russia,169,0.0,36,March 5,216,0,11377,2,...,45.0,"December 30, 2011",186.0,1135.0,,https://www.deviantart.com/masterhalfling/art/...,[full/09fbbe05751a1b08cf03445e0821ca44dd5b547b...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,14153.0
2,Deviant for 6 Years,Female/Japan,23,0.0,28,September 9,53,0,9907,0,...,102.0,"August 11, 2012",225.0,1959.0,,https://www.deviantart.com/donperico/art/SHERL...,[full/0f13dd8004eb7f0ce2b45c826f4e76df5978ac13...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,SHERLOCK,21163.0
3,Deviant for 13 Years,Female/France,322,0.0,55,March 29,396,0,36431,3,...,35.0,"January 21, 2012",48.0,551.0,,https://www.deviantart.com/cheeky-bee/art/Sher...,[full/2d3083ae52b6ea7b69fdd83c0f74c5fd8525a489...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock Of the Dead,6988.0
4,Deviant for 7 Years,Female/South Korea,436,0.0,73,September 17,37,0,45119,0,...,8.0,"August 19, 2011",34.0,91.0,,https://www.deviantart.com/hahaaaaaaaaaaaa/art...,[full/be06d405f58195ef9d4c5c42c1c6cb2edb673754...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,2742.0
5,Deviant for 10 Years,United Kingdom,17,0.0,77,,6,0,18474,0,...,21.0,"August 1, 2010",59.0,696.0,,https://www.deviantart.com/1stclassstamps/art/...,[full/11dc8450503f97976fba168972410295b76c72ed...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock Holmes,14566.0
6,Deviant for 7 Years,28/Female/Russia,8264,0.0,150,"April 21, 1990",3760,0,68208,0,...,46.0,"February 20, 2012",46.0,218.0,,https://www.deviantart.com/feyjane/art/Sherloc...,[full/587559fae93b85cbfa8802d39e7a2e1d3cbe415c...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,3139.0
7,Deviant for 12 Years,France,376,0.0,13,,114,0,6566,0,...,5.0,"January 27, 2012",14.0,96.0,,https://www.deviantart.com/elsias/art/Sherlock...,[full/9507a844991ab37afec9bc0a0f619bfa0d033b04...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,2208.0
8,Deviant for 6 Years,Female/United Kingdom,128,0.0,97,,0,0,29442,0,...,22.0,"July 13, 2013",8.0,66.0,,https://www.deviantart.com/sherlockthegame/art...,[full/45901b4a9092329595f2e6739d213b2d169e45f9...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock: The Game Is On (Character Contest),2983.0
9,Deviant for 7 Years,Female/United States,32,0.0,8,March 27,30,0,2958,2,...,38.0,"December 26, 2011",39.0,877.0,,https://www.deviantart.com/fractionofadot/art/...,[full/c752fa217f32f5510a49c4f17f3980db9a442c68...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,10189.0
