<h3>Data Scraping</h3>

In [None]:
#import basic libraries
import pandas as pd
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ImageItem(scrapy.Item):
    """Container for one scraped search-result thumbnail.

    The first group of fields is filled in by ``ImageSpider.parse``; the
    second group is reserved for values produced while the item passes
    through the images pipeline.
    """

    # scrape from site
    image_urls = scrapy.Field()  # list of image URLs to download (the spider stores a single URL)
    artists = scrapy.Field()     # title attribute of the artist's avatar image
    faves = scrapy.Field()       # text of the "faves" counter, kept as scraped
    comments = scrapy.Field()    # text of the "comments" counter, kept as scraped
    
    # to be returned
    image_paths = scrapy.Field()  # set by MyImagesPipeline.item_completed
    images = scrapy.Field()       # NOTE(review): never assigned in the visible code — confirm it is still needed
    
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records where each item's images were stored.

    Items for which no image could be downloaded are dropped.
    """

    def get_media_requests(self, item, info):
        """Yield one download request for every URL in ``item['image_urls']``."""
        yield from map(scrapy.Request, item['image_urls'])

    def item_completed(self, results, item, info):
        """Attach the stored paths of the successful downloads to ``item``.

        ``results`` is iterated as ``(ok, detail)`` pairs; the ``'path'`` entry
        of every pair whose flag is truthy is collected.

        Returns the item with ``item['image_paths']`` set.
        Raises ``DropItem`` when no download succeeded.
        """
        stored_paths = []
        for succeeded, detail in results:
            if succeeded:
                stored_paths.append(detail['path'])

        if len(stored_paths) == 0:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item

In [3]:
class ImageSpider(scrapy.Spider):
    """Scrape image thumbnails from DeviantArt's all-time popular "sherlock" search.

    Every thumbnail that exposes a full-size image URL is emitted as an
    ``ImageItem``. Result pages are addressed by an ``offset`` query
    parameter; pages are requested for offsets ``0, page_size, 2*page_size, ...``
    strictly below ``offset_limit``.
    """

    name = 'images'

    # search-results URL; ``offset`` selects the results page
    search_url = 'https://www.deviantart.com/popular-all-time/?q=sherlock&offset={offset}'
    # DA's natural offset scroll is set at increments of 24
    page_size = 24

    start_urls = [search_url.format(offset=0)]
    # offset of the most recently scheduled results page (starts at 0)
    offset = 0
    # set offset limit to control the amount of images downloaded
    offset_limit = 400

    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {'__main__.MyImagesPipeline': 1},
        'IMAGES_STORE': 'DA-images',
        'FEED_FORMAT': 'json',
        'FEED_URI': 'image-data.json'
    }

    def parse(self, response):
        """Yield an ``ImageItem`` per usable thumbnail, then schedule further pages.

        Thumbnails without a ``data-super-img`` attribute are skipped.
        """
        for thumb in response.css('div.page-results span.thumb'):
            image_url = thumb.css('::attr(data-super-img)').get()
            if image_url is None:
                # nothing to download for this thumbnail
                continue

            image = ImageItem()
            image["image_urls"] = [image_url]
            image["faves"] = thumb.css('span.info span.extra-info span.stats span.faves::text').get()
            image["comments"] = thumb.css('span.info span.extra-info span.stats span.comments::text').get()
            image["artists"] = thumb.css('span.info span.extra-info span.artist a img.avatar::attr(title)').get()
            yield image

        # next page procedure
        # ``self.offset`` is shared spider state, so the first response handled
        # schedules every remaining page; later calls find the offset already
        # at its final value and schedule nothing. The bound is checked on the
        # *next* offset so no page at or beyond ``offset_limit`` is requested.
        while self.offset + self.page_size < self.offset_limit:
            self.offset += self.page_size
            next_page = self.search_url.format(offset=self.offset)
            yield scrapy.Request(next_page, callback=self.parse)
            
# Run the crawl: create a CrawlerProcess, register ImageSpider with it, then start it.
# NOTE(review): process.start() presumably blocks until the crawl has finished, and
# re-running this cell in the same kernel may fail because the underlying reactor
# cannot be restarted — confirm against the Scrapy documentation.
process = CrawlerProcess()
process.crawl(ImageSpider)
process.start()

2019-04-13 15:25:35 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-04-13 15:25:35 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-04-13 15:25:35 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'image-data.json', 'LOG_LEVEL': 20}
2019-04-13 15:25:35 [scrapy.extensions.telnet] INFO: Telnet Password: bf295acc92938351
2019-04-13 15:25:35 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-04-13 15:25:35 [scrapy.middle

<h3>Data Processing</h3>

In [17]:
# Load the scraped item feed (the file named by the spider's FEED_URI setting)
# into a DataFrame and preview its first rows.
df = pd.read_json(path_or_buf='image-data.json')
df.head(n=5)

Unnamed: 0,artists,comments,faves,image_paths,image_urls
0,get-sherlock,22,142,[full/c11741fda71c47368b3ecc42de439de62c48cf30...,[https://images-wixmp-ed30a86b8c4ca887773594c2...
1,hoo0,393,4999,[full/fe76954198d74dd3a3fac890216d7f256172132b...,[https://images-wixmp-ed30a86b8c4ca887773594c2...
2,WuLiao-Yuzi,69,2110,[full/ba47403c06820d230acd79405fb8f1cf1e58c6fe...,[https://images-wixmp-ed30a86b8c4ca887773594c2...
3,403shiomi,48,490,[full/6c05caa5b126250613307f70e506f07f1e2513c3...,[https://images-wixmp-ed30a86b8c4ca887773594c2...
4,Joe-Roberts,72,949,[full/071c38ac67037aca590719c02d623a41b100c17e...,[https://images-wixmp-ed30a86b8c4ca887773594c2...


In [18]:
#convert faves and comments to int
def _count_to_int(counts):
    """Convert a column of scraped counters (e.g. ``'4,999'``) to integers.

    The column is first cast to ``str`` so the conversion also works when
    the values are already numeric — either because ``read_json`` inferred a
    numeric dtype or because this cell is being re-run.

    Raises ``ValueError`` if a value is missing or is not a number.
    """
    without_separators = counts.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(without_separators).astype(int)


df['faves'] = _count_to_int(df['faves'])
df['comments'] = _count_to_int(df['comments'])


In [27]:
# Per-artist totals: summed faves and comments plus the number of scraped
# images, ranked by faves, then comments, then image count (all descending).
# The string alias 'sum' is used for both totals; passing the builtin ``sum``
# to ``agg`` is deprecated in recent pandas releases.
(
    df.groupby('artists')
      .agg({'faves': 'sum',
            'comments': 'sum',
            'image_urls': 'count',
           })
      .sort_values(by=['faves', 'comments', 'image_urls'], ascending=False)
)


Unnamed: 0_level_0,faves,comments,image_urls
artists,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AmandaTolleson,37283,3800,10
hoo0,17313,885,3
maXKennedy,13473,1552,15
alicexz,13069,797,1
sakimichan,10924,451,1
xanseviera,8892,455,3
sandara,8745,497,1
teralilac,7089,682,6
mlcamaro,6991,202,3
leightonton,6144,1056,1
