<h3>Data Scraping</h3>

In [1]:
#import basic libraries
import pandas as pd
import re
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ImageItem(scrapy.Item):

    # for downloading images in ImagePipeline
    image_urls = scrapy.Field()
    
    #link to specific image
    image_links = scrapy.Field()
    
    #image attributes
    titles = scrapy.Field()
    date_posted = scrapy.Field()
    hashtags = scrapy.Field()
    
    #stats
    views = scrapy.Field()
    faves = scrapy.Field()
    comments = scrapy.Field()
    downloads = scrapy.Field()
    
    #artist details
    artists = scrapy.Field()
    artist_urls = scrapy.Field()
    artist_deviations = scrapy.Field()
    artist_comments = scrapy.Field()
    artist_page_views = scrapy.Field()
    artist_scraps = scrapy.Field()
    artist_watchers = scrapy.Field()
    artist_critiques = scrapy.Field()
    artist_forum_posts = scrapy.Field()
    artist_faves = scrapy.Field()
    artist_asl = scrapy.Field() #age, sex, location
    artist_dob = scrapy.Field() #date of birth
    account_age = scrapy.Field() #how old the account is
    membership = scrapy.Field() #artist current membership status
    
    # to be filled in by ImagePipeline
    image_paths = scrapy.Field()
    images = scrapy.Field()
    
class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

In [3]:
class ImageSpider(scrapy.Spider):
    
    name = 'images'
    
    start_urls = ['https://www.deviantart.com/popular-all-time/?q=sherlock&offset=0']
    
    
    #initialize offset at 0
    offset = 0
    #set offset limit to control the amount of images downloaded
    offset_limit = 8000
    

    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {'__main__.MyImagesPipeline': 1}, #enable image download
        'IMAGES_STORE': 'DA-images', #store images in DA-images folder
        'FEED_FORMAT':'json',                                
        'FEED_URI': 'image-data.json', #store image data in image-data.json
        'DOWNLOAD_FAIL_ON_DATALOSS': False, #if image download fails (due to various issues), don't send error message, just flag it.
        'DOWNLOAD_DELAY': 0.25 #250 ms download delay, with inbuilt scrapy randomization
    }

    def parse(self, response):
        
        #get page body
        page = response.css('div.page-results span.thumb')
        
        for img in page:
            
            #thumbnail link. If there isn't a thumbnail, then post is not an image and should be skipped
            thumbnail = img.css('::attr(data-super-img)').get()
            
            #img_link contains the full link that leads to the individual image post
            img_link = img.css('::attr(href)').get()
            
            #if there is a thumbnail, aka the post is an image, follow url to parse image for details
            if thumbnail: 
                yield scrapy.Request(img_link, callback = self.parse_image)
                
        #go to next page
        while self.offset < self.offset_limit:
            self.offset += 24 #DA's natural offset scroll is set at increments of 24
            next_page = f'https://www.deviantart.com/popular-all-time/?q=sherlock&offset={self.offset}'
            yield scrapy.Request(next_page, callback=self.parse)
            
            
    def parse_image(self, response):
        
        #initialize image item
        image = ImageItem()
        
        #get image url (for downloading via ImagePipeline)
        image["image_urls"] = [response.css('div.dev-view-deviation img ::attr(src)').get()]
        #get other image info
        image["image_links"] = response.url
        image['titles'] = response.xpath("//a[@class='title']/text()").extract()[0]
        image['date_posted'] = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-details']/dl/dd/span/text()").extract()[0]
        
        #check whether image has hashtags (some don't)
        hashtag = response.xpath("//div[@class='dev-about-tags-cc dev-about-breadcrumb']/a/text()").extract()
        if hashtag: image['hashtags'] = hashtag
        
        #get image stats
        stats =  response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-stats']/dl/dd/text()").extract()
        
        #check that stats list only contains numbers (sometimes irregular data falls in)
        stats = [re.sub("\D", "", s) for s in stats]
        
        #remove any None types from list
        stats = list(filter(None, stats))
        
        #the responses are ordered in: views, faves, comments, downloads
        #sometimes comments are disabled, sometimes downloads are disabled
        headers = ['views','faves','comments','downloads']
        
        #if comments/downloads are disabled, they will not be looped over for a given image
        for i in range(len(stats)):
            image[headers[i]] = stats[i]
            
        #get artist info
        artist_name = response.xpath("//small[@class='author']/span/a/text()").extract()
        
        if artist_name: 
            image['artists'] = response.xpath("//small[@class='author']/span/a/text()").extract()[-1]
            image['artist_urls'] = response.xpath("//small[@class='author']/span/a/@href").extract()[-1]     
            request = scrapy.Request(image['artist_urls'], callback=self.parse_artist, meta={'image':image})
            yield request
        else: #if no artist name (sometimes artists are banned), just yield the image
            image['artists'] = 'Banned'
            yield image
        
    def parse_artist(self, response):
        
        #get image item for the higher-level parser
        image = response.meta['image']
        
        #get artist account stats
        artist_stats = response.xpath("//div[@id='super-secret-stats']/div/div/div/strong/text()").extract()
        
        headers = ['artist_deviations','artist_comments','artist_page_views','artist_scraps','artist_watchers','artist_critiques','artist_forum_posts','artist_faves']
        
        for i in range(len(artist_stats)):
            image[headers[i]] = artist_stats[i]
        
        #get account age and membership details
        age_membership = response.xpath("//a[@href='#super-secret-activity']/div/text()").extract()
        
        #sometimes the age is wrapped up in a span
        if len(age_membership) == 0: 
            age = response.xpath("//a[@href='#super-secret-activity']/span/div/text()").extract()[0]
            age_membership.append(age)        
        image['account_age'] = age_membership[0]

        
        #get artist personal details
        artist_details = response.xpath("//div[@id='super-secret-why']/div/div/div/dl/dd/text()").extract()
        details = ['artist_asl','artist_dob'] #some artists do not share their dobs
        for i in range(len(artist_details)):
            image[details[i]] = artist_details[i]
        
        return image
            
process = CrawlerProcess()
process.crawl(ImageSpider)
process.start()

2019-04-17 19:51:28 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-04-17 19:51:28 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Darwin-18.2.0-x86_64-i386-64bit
2019-04-17 19:51:28 [scrapy.crawler] INFO: Overridden settings: {'DOWNLOAD_DELAY': 0.25, 'DOWNLOAD_FAIL_ON_DATALOSS': False, 'FEED_FORMAT': 'json', 'FEED_URI': 'image-data.json', 'LOG_LEVEL': 20}
2019-04-17 19:51:28 [scrapy.extensions.telnet] INFO: Telnet Password: b6ed4b5034087165
2019-04-17 19:51:28 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.exten

Downloaded 100 images...
Downloaded 200 images...


2019-04-17 19:52:28 [scrapy.extensions.logstats] INFO: Crawled 269 pages (at 269 pages/min), scraped 77 items (at 77 items/min)
2019-04-17 19:53:28 [scrapy.extensions.logstats] INFO: Crawled 562 pages (at 293 pages/min), scraped 173 items (at 96 items/min)


Downloaded 300 images...
Downloaded 400 images...


2019-04-17 19:54:28 [scrapy.extensions.logstats] INFO: Crawled 844 pages (at 282 pages/min), scraped 259 items (at 86 items/min)
2019-04-17 19:55:28 [scrapy.extensions.logstats] INFO: Crawled 1127 pages (at 283 pages/min), scraped 352 items (at 93 items/min)


Downloaded 500 images...


2019-04-17 19:56:28 [scrapy.extensions.logstats] INFO: Crawled 1407 pages (at 280 pages/min), scraped 435 items (at 83 items/min)


Downloaded 600 images...
Downloaded 700 images...


2019-04-17 19:57:28 [scrapy.extensions.logstats] INFO: Crawled 1675 pages (at 268 pages/min), scraped 513 items (at 78 items/min)
2019-04-17 19:58:28 [scrapy.extensions.logstats] INFO: Crawled 1956 pages (at 281 pages/min), scraped 600 items (at 87 items/min)


Downloaded 800 images...


2019-04-17 19:59:28 [scrapy.extensions.logstats] INFO: Crawled 2233 pages (at 277 pages/min), scraped 680 items (at 80 items/min)


Downloaded 900 images...
Downloaded 1000 images...
Downloaded 1100 images...


2019-04-17 20:00:28 [scrapy.extensions.logstats] INFO: Crawled 2512 pages (at 279 pages/min), scraped 764 items (at 84 items/min)
2019-04-17 20:01:28 [scrapy.extensions.logstats] INFO: Crawled 2791 pages (at 279 pages/min), scraped 849 items (at 85 items/min)


Downloaded 1200 images...
Downloaded 1300 images...


2019-04-17 20:02:28 [scrapy.extensions.logstats] INFO: Crawled 3063 pages (at 272 pages/min), scraped 926 items (at 77 items/min)
2019-04-17 20:03:28 [scrapy.extensions.logstats] INFO: Crawled 3325 pages (at 262 pages/min), scraped 997 items (at 71 items/min)


Downloaded 1400 images...
Downloaded 1500 images...


2019-04-17 20:04:28 [scrapy.extensions.logstats] INFO: Crawled 3597 pages (at 272 pages/min), scraped 1070 items (at 73 items/min)
2019-04-17 20:05:28 [scrapy.extensions.logstats] INFO: Crawled 3861 pages (at 264 pages/min), scraped 1143 items (at 73 items/min)


Downloaded 1600 images...
Downloaded 1700 images...
Downloaded 1800 images...


2019-04-17 20:06:28 [scrapy.extensions.logstats] INFO: Crawled 4127 pages (at 266 pages/min), scraped 1215 items (at 72 items/min)
2019-04-17 20:07:28 [scrapy.extensions.logstats] INFO: Crawled 4360 pages (at 233 pages/min), scraped 1278 items (at 63 items/min)


Downloaded 1900 images...
Downloaded 2000 images...


2019-04-17 20:08:28 [scrapy.extensions.logstats] INFO: Crawled 4624 pages (at 264 pages/min), scraped 1344 items (at 66 items/min)
2019-04-17 20:09:28 [scrapy.extensions.logstats] INFO: Crawled 4897 pages (at 273 pages/min), scraped 1420 items (at 76 items/min)


Downloaded 2100 images...


2019-04-17 20:10:28 [scrapy.extensions.logstats] INFO: Crawled 5166 pages (at 269 pages/min), scraped 1493 items (at 73 items/min)


Downloaded 2200 images...
Downloaded 2300 images...


2019-04-17 20:11:28 [scrapy.extensions.logstats] INFO: Crawled 5426 pages (at 260 pages/min), scraped 1555 items (at 62 items/min)
2019-04-17 20:12:28 [scrapy.extensions.logstats] INFO: Crawled 5689 pages (at 263 pages/min), scraped 1628 items (at 73 items/min)


Downloaded 2400 images...
Downloaded 2500 images...
Downloaded 2600 images...


2019-04-17 20:13:04 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.deviantart.com/sherlock-holmes> (referer: https://www.deviantart.com/sherlock-holmes/art/Sherlock-Holmes-Club-ID-8867259)
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "<ipython-input-3-1d96902a9d4a>", line 118, in parse_artist
    age = response.xpath("//a[@href='#super-secret-activity']/span/div/text()").extract()[0]
IndexError: list index out of range
2019-04-17 20:13:28 [scrapy.extensions.logstats] INFO: Crawled 5942 pages (at 253 pages/min), scraped 1685 items (at 57 items/min)
2019-04-17 20:14:28 [scrapy.extensions.logstats] INFO: Crawled 6209 pages (at 267 pages/min), scraped 1756 items (at 71 items/min)


Downloaded 2700 images...
Downloaded 2800 images...


2019-04-17 20:15:28 [scrapy.extensions.logstats] INFO: Crawled 6470 pages (at 261 pages/min), scraped 1823 items (at 67 items/min)
2019-04-17 20:16:28 [scrapy.extensions.logstats] INFO: Crawled 6729 pages (at 259 pages/min), scraped 1887 items (at 64 items/min)


Downloaded 2900 images...
Downloaded 3000 images...


2019-04-17 20:17:28 [scrapy.extensions.logstats] INFO: Crawled 6993 pages (at 264 pages/min), scraped 1952 items (at 65 items/min)


Downloaded 3100 images...
Downloaded 3200 images...


2019-04-17 20:18:28 [scrapy.extensions.logstats] INFO: Crawled 7255 pages (at 262 pages/min), scraped 2019 items (at 67 items/min)


Downloaded 3300 images...


2019-04-17 20:19:28 [scrapy.extensions.logstats] INFO: Crawled 7505 pages (at 250 pages/min), scraped 2079 items (at 60 items/min)
2019-04-17 20:20:28 [scrapy.extensions.logstats] INFO: Crawled 7765 pages (at 260 pages/min), scraped 2144 items (at 65 items/min)


Downloaded 3400 images...
Downloaded 3500 images...
Downloaded 3600 images...


2019-04-17 20:21:28 [scrapy.extensions.logstats] INFO: Crawled 8015 pages (at 250 pages/min), scraped 2200 items (at 56 items/min)
2019-04-17 20:22:28 [scrapy.extensions.logstats] INFO: Crawled 8266 pages (at 251 pages/min), scraped 2258 items (at 58 items/min)


Downloaded 3700 images...
Downloaded 3800 images...


2019-04-17 20:23:28 [scrapy.extensions.logstats] INFO: Crawled 8522 pages (at 256 pages/min), scraped 2315 items (at 57 items/min)
2019-04-17 20:24:28 [scrapy.extensions.logstats] INFO: Crawled 8777 pages (at 255 pages/min), scraped 2378 items (at 63 items/min)


Downloaded 3900 images...
Downloaded 4000 images...


2019-04-17 20:25:28 [scrapy.extensions.logstats] INFO: Crawled 9032 pages (at 255 pages/min), scraped 2438 items (at 60 items/min)


Downloaded 4100 images...
Downloaded 4200 images...
Downloaded 4300 images...


2019-04-17 20:26:28 [scrapy.extensions.logstats] INFO: Crawled 9279 pages (at 247 pages/min), scraped 2489 items (at 51 items/min)
2019-04-17 20:27:28 [scrapy.extensions.logstats] INFO: Crawled 9528 pages (at 249 pages/min), scraped 2544 items (at 55 items/min)
2019-04-17 20:28:28 [scrapy.extensions.logstats] INFO: Crawled 9777 pages (at 249 pages/min), scraped 2601 items (at 57 items/min)


Downloaded 4400 images...
Downloaded 4500 images...


2019-04-17 20:29:28 [scrapy.extensions.logstats] INFO: Crawled 10031 pages (at 254 pages/min), scraped 2658 items (at 57 items/min)


Downloaded 4600 images...
Downloaded 4700 images...
Downloaded 4800 images...


2019-04-17 20:30:28 [scrapy.extensions.logstats] INFO: Crawled 10282 pages (at 251 pages/min), scraped 2709 items (at 51 items/min)
2019-04-17 20:31:28 [scrapy.extensions.logstats] INFO: Crawled 10526 pages (at 244 pages/min), scraped 2758 items (at 49 items/min)


Downloaded 4900 images...
Downloaded 5000 images...


2019-04-17 20:32:28 [scrapy.extensions.logstats] INFO: Crawled 10773 pages (at 247 pages/min), scraped 2809 items (at 51 items/min)


Downloaded 5100 images...
Downloaded 5200 images...


2019-04-17 20:33:28 [scrapy.extensions.logstats] INFO: Crawled 11018 pages (at 245 pages/min), scraped 2860 items (at 51 items/min)
2019-04-17 20:34:28 [scrapy.extensions.logstats] INFO: Crawled 11267 pages (at 249 pages/min), scraped 2913 items (at 53 items/min)


Downloaded 5300 images...
Downloaded 5400 images...
Downloaded 5500 images...


2019-04-17 20:35:28 [scrapy.extensions.logstats] INFO: Crawled 11518 pages (at 251 pages/min), scraped 2963 items (at 50 items/min)
2019-04-17 20:36:28 [scrapy.extensions.logstats] INFO: Crawled 11761 pages (at 243 pages/min), scraped 3013 items (at 50 items/min)


Downloaded 5600 images...
Downloaded 5700 images...


2019-04-17 20:37:28 [scrapy.extensions.logstats] INFO: Crawled 12007 pages (at 246 pages/min), scraped 3060 items (at 47 items/min)
2019-04-17 20:38:28 [scrapy.extensions.logstats] INFO: Crawled 12243 pages (at 236 pages/min), scraped 3102 items (at 42 items/min)


Downloaded 5800 images...
Downloaded 5900 images...
Downloaded 6000 images...


2019-04-17 20:39:28 [scrapy.extensions.logstats] INFO: Crawled 12490 pages (at 247 pages/min), scraped 3154 items (at 52 items/min)
2019-04-17 20:39:39 [scrapy.pipelines.files] ERROR: File (unknown-error): Error processing file from <GET https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/0dc70551-b190-44e8-9b59-5e9378a30049/d2y7tid-cc06bf13-115d-41d9-b9ec-00c33c8cab98.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzBkYzcwNTUxLWIxOTAtNDRlOC05YjU5LTVlOTM3OGEzMDA0OVwvZDJ5N3RpZC1jYzA2YmYxMy0xMTVkLTQxZDktYjllYy0wMGMzM2M4Y2FiOTguanBnIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.3RRjMd6_q6cBQZh-YJCavU7n72FfQbHmg1KuoeJpdk8> referred in <None>
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/twisted/internet/defer.py", line 1386, in _inlineCallbacks
    result = g.send(result)
  File "/a

Downloaded 6100 images...
Downloaded 6200 images...


2019-04-17 20:41:28 [scrapy.extensions.logstats] INFO: Crawled 12989 pages (at 253 pages/min), scraped 3262 items (at 59 items/min)


Downloaded 6300 images...
Downloaded 6400 images...
Downloaded 6500 images...


2019-04-17 20:42:28 [scrapy.extensions.logstats] INFO: Crawled 13225 pages (at 236 pages/min), scraped 3304 items (at 42 items/min)
2019-04-17 20:43:28 [scrapy.extensions.logstats] INFO: Crawled 13471 pages (at 246 pages/min), scraped 3357 items (at 53 items/min)
2019-04-17 20:44:27 [scrapy.core.engine] INFO: Closing spider (finished)
2019-04-17 20:44:27 [scrapy.extensions.feedexport] INFO: Stored json feed (3406 items) in: image-data.json
2019-04-17 20:44:27 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9323610,
 'downloader/request_count': 13706,
 'downloader/request_method_count/GET': 13706,
 'downloader/response_bytes': 1136183374,
 'downloader/response_count': 13706,
 'downloader/response_status_count/200': 13706,
 'dupefilter/filtered': 3158,
 'file_count': 3407,
 'file_status_count/downloaded': 3407,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2019, 4, 17, 23, 44, 27, 187626),
 'item_dropped_count': 1,
 'item_dropped_reason

df = pd.read_json('image-data.json')
df