<h3>Data Scraping</h3>

In [1]:
#import basic libraries
import pandas as pd
import re
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ImageItem(scrapy.Item):
    """Scrapy item holding one scraped image post.

    Groups the post metadata, the engagement statistics and the fields
    related to downloading the image file.
    """

    # --- post metadata scraped from the image page ---
    titles = scrapy.Field()
    artists = scrapy.Field()
    dates = scrapy.Field()

    # --- engagement statistics scraped from the image page ---
    views = scrapy.Field()
    faves = scrapy.Field()
    comments = scrapy.Field()
    downloads = scrapy.Field()

    # --- download input: list of image URLs to fetch ---
    image_urls = scrapy.Field()

    # --- download output ---
    # image_paths is filled with the stored file paths once downloads finish.
    image_paths = scrapy.Field()
    # images is declared but not assigned by any code visible in this notebook.
    images = scrapy.Field()
    
class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that downloads an item's images and records their paths.

    Items for which no image could be downloaded are dropped.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per usable URL in ``item['image_urls']``.

        A missing ``image_urls`` field and empty/``None`` entries are skipped
        instead of being handed to ``scrapy.Request`` (which rejects a
        non-string URL). Such items then reach ``item_completed`` with no
        successful results and are dropped there.
        """
        for image_url in item.get('image_urls') or []:
            if image_url:
                yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads on the item.

        Args:
            results: iterable of ``(ok, detail)`` pairs; for successful
                downloads ``detail['path']`` is read.
            item: the item whose downloads just finished.
            info: pipeline bookkeeping object (unused here).

        Returns:
            The item, with ``image_paths`` set to the list of stored paths.

        Raises:
            DropItem: if no download succeeded.
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

In [3]:
class ImageSpider(scrapy.Spider):
    """Scrape image posts from DeviantArt's all-time-popular "sherlock" search.

    ``parse`` walks a search-results page, follows every thumbnail that is an
    image post, and schedules the remaining result pages up to
    ``offset_limit``. ``parse_image`` turns one image post into an
    ``ImageItem`` carrying the image URL, title, artist, date and statistics.
    """

    name = 'images'

    start_urls = ['https://www.deviantart.com/popular-all-time/?q=sherlock&offset=0']
    #initialize offset at 0
    offset = 0
    #set offset limit to control the amount of images downloaded;
    #no page with an offset greater than this value is requested
    offset_limit = 2500
    #DA's natural offset scroll is set at increments of 24
    page_step = 24
    #the scraped stats are assigned to these fields, in this order
    stat_headers = ('views', 'faves', 'comments', 'downloads')

    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {'__main__.MyImagesPipeline': 1},
        'IMAGES_STORE': 'DA-images',
        'FEED_FORMAT':'json',
        'FEED_URI': 'image-data.json'
    }

    def parse(self, response):
        """Follow image posts on a results page and schedule further pages.

        Yields:
            A request per image post (handled by ``parse_image``), followed by
            requests for the result pages not yet scheduled.
        """
        for thumb in response.css('div.page-results span.thumb'):

            #img_url contains the image thumbnail. If there isn't a thumbnail,
            #then the post is not an image and should be skipped
            img_url = thumb.css('::attr(data-super-img)').get()

            #img_link contains the link that leads to the individual image post
            img_link = thumb.css('::attr(href)').get()

            #only follow posts that are images AND actually expose a link;
            #a missing href would otherwise make the request constructor raise
            if img_url and img_link:
                yield response.follow(img_link, callback=self.parse_image)

        #next page procedure: the first call schedules every remaining page,
        #later calls find the offset already at the limit and schedule nothing.
        #Checking the *next* offset keeps requests from overshooting the limit.
        while self.offset + self.page_step <= self.offset_limit:
            self.offset += self.page_step
            next_page = f'https://www.deviantart.com/popular-all-time/?q=sherlock&offset={self.offset}'
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_image(self, response):
        """Build an ``ImageItem`` from a single image post page.

        Pages without a downloadable image are skipped with a warning.
        Title, artist and date are ``None`` when their node is absent, rather
        than aborting the whole item.

        Yields:
            One ``ImageItem`` for the page, if an image URL was found.
        """
        #get image url (for downloading via ImagePipeline)
        image_src = response.css('div.dev-view-deviation img ::attr(src)').get()
        if not image_src:
            self.logger.warning("No image found on %s; skipping", response.url)
            return

        image = ImageItem()
        image['image_urls'] = [image_src]
        image['artists'] = response.xpath("//small[@class='author']/span/a/text()").get()
        image['titles'] = response.xpath("//a[@class='title']/text()").get()
        image['dates'] = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-details']/dl/dd/span/text()").get()

        #get image stats
        raw_stats = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-stats']/dl/dd/text()").extract()

        #keep only the digits of each entry (sometimes irregular data falls in)
        #and discard entries that end up empty
        digits_only = (re.sub(r"\D", "", entry) for entry in raw_stats)
        stats = [digits for digits in digits_only if digits]

        #the responses are ordered in: views, faves, comments, downloads
        #sometimes comments are disabled, sometimes downloads are disabled.
        #zip stops at the shorter sequence, so extra numeric entries can no
        #longer index past the end of the header list.
        #NOTE(review): the mapping is purely positional, so when comments are
        #disabled but downloads are present the download count lands in
        #'comments'. Fixing that needs the stat labels from the page markup.
        for header, value in zip(self.stat_headers, stats):
            image[header] = value

        yield image
            
# Create the crawler process. No settings are passed here, so the pipeline,
# image store and feed output come from ImageSpider.custom_settings.
process = CrawlerProcess()
# Register the spider class with the process.
process.crawl(ImageSpider)
# Launch the crawl.
# NOTE(review): re-running this cell in the same kernel presumably fails because
# the underlying reactor cannot be restarted -- confirm; restart the kernel first.
# NOTE(review): check whether a second run appends to an existing
# image-data.json (which would corrupt the JSON) -- confirm for this Scrapy version.
process.start()

2019-04-17 01:28:39 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-04-17 01:28:39 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-04-17 01:28:39 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'image-data.json', 'LOG_LEVEL': 20}
2019-04-17 01:28:39 [scrapy.extensions.telnet] INFO: Telnet Password: eee44e5ff899a9f2
2019-04-17 01:28:39 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-04-17 01:28:39 [scrapy.middle

In [4]:
# Load the JSON feed written by the crawl into a DataFrame.
df = pd.read_json('image-data.json')
# Preview only the first rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,artists,comments,dates,downloads,faves,image_paths,image_urls,titles,views
0,get-sherlock,22.0,"January 17, 2014",91.0,142,[full/c11741fda71c47368b3ecc42de439de62c48cf30...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,1593
1,Nadia-Ch,70.0,"August 26, 2013",,900,[full/7dd4af05bd4fca98ce1688c899c444e3e6e12c0d...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,9543
2,Unisha,42.0,"January 30, 2012",283.0,1560,[full/1e8b904f0bb82d4f5dfca301704d95e6cab4273d...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,20886
3,Umino-aka-Morskaya,88.0,"September 7, 2010",193.0,1702,[full/240c5af33c8a67a4e9f69bcad604ab421e85c86c...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,25432
4,DonPerico,102.0,"August 11, 2012",225.0,1961,[full/0f13dd8004eb7f0ce2b45c826f4e76df5978ac13...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,SHERLOCK,21156
5,masterHalfling,45.0,"December 30, 2011",186.0,1135,[full/09fbbe05751a1b08cf03445e0821ca44dd5b547b...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,14147
6,jaslerb,20.0,"August 17, 2012",20.0,258,[full/39524abdbf4128e28696adc9c450d6d59887d012...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,sherlock,3386
7,AmandaTolleson,427.0,"September 24, 2011",,4371,[full/728c18b1d1e689c2bdb58937e1238ecf04f63acc...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,60710
8,CROMOU,52.0,"February 27, 2012",70.0,345,[full/06a8199ab4ff7a6688cce17d41999e813a965e87...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,7351
9,MirroredSilhouettes,107.0,"January 1, 2013",29.0,317,[full/a2a7ea2e083dfa06539e6ac05df76c8ec4c880eb...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,Sherlock,7936
