<h3>Data Scraping</h3>

In [None]:
#import basic libraries
import pandas as pd
import re
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

In [None]:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ImageItem(scrapy.Item):
    """Container for everything scraped about a single image post.

    One item holds the image's download URL(s), its own attributes and
    stats, details about the artist who posted it, and the fields populated
    by the images pipeline after download. Scrapy items only accept the
    fields declared here; assigning any other key raises ``KeyError``.
    """

    # source URL(s) of the image; read by the images pipeline to download it
    image_urls = scrapy.Field()
    
    # image attributes
    titles = scrapy.Field()
    date_posted = scrapy.Field()
    hashtags = scrapy.Field()
    
    # image stats
    views = scrapy.Field()
    faves = scrapy.Field()
    comments = scrapy.Field()
    downloads = scrapy.Field()
    
    # artist details
    artists = scrapy.Field()
    artist_urls = scrapy.Field()
    artist_deviations = scrapy.Field()
    artist_comments = scrapy.Field()
    artist_page_views = scrapy.Field()
    artist_scraps = scrapy.Field()
    artist_watchers = scrapy.Field()
    artist_critiques = scrapy.Field()
    artist_forum_posts = scrapy.Field()
    artist_faves = scrapy.Field()
    artist_asl = scrapy.Field() # age, sex, location
    artist_dob = scrapy.Field() # date of birth
    account_age = scrapy.Field() # how old the account is
    membership = scrapy.Field() # artist's current membership status
    
    # to be filled in by the images pipeline once the download has finished
    image_paths = scrapy.Field()
    images = scrapy.Field()
    
class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that stores download paths and discards image-less items."""

    def get_media_requests(self, item, info):
        """Yield one download request for each URL in ``item['image_urls']``."""
        yield from map(scrapy.Request, item['image_urls'])

    def item_completed(self, results, item, info):
        """Record the paths of the successful downloads on the item.

        ``results`` is a sequence of ``(success, detail)`` pairs; for each
        successful pair the ``'path'`` entry of ``detail`` is collected.

        Raises:
            DropItem: if none of the downloads succeeded.
        """
        stored_paths = []
        for succeeded, detail in results:
            if succeeded:
                stored_paths.append(detail['path'])

        if len(stored_paths) == 0:
            raise DropItem("Item contains no images")

        item['image_paths'] = stored_paths
        return item

In [None]:
class ImageSpider(scrapy.Spider):
    """Scrape image posts, their stats and their artists from a results page.

    Flow: ``parse`` reads the search-results page and follows every image
    post; ``parse_image`` collects the image's own data and follows the link
    to the artist's profile; ``parse_artist`` adds the artist's data and
    emits the finished ``ImageItem``.
    """

    name = 'images'

    start_urls = ['https://www.deviantart.com/popular-all-time/?q=sherlock&offset=0']

    #initialize offset at 0
    offset = 0
    #set offset limit to control the amount of images downloaded
    offset_limit = 8000
    #count items
    old_items = 0
    new_items = 0

    # NOTE(review): newer Scrapy releases replace FEED_FORMAT/FEED_URI with the
    # FEEDS setting -- confirm against the installed Scrapy version.
    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {'__main__.MyImagesPipeline': 1}, #enable image download
        'IMAGES_STORE': 'DA-images', #store images in DA-images folder
        'FEED_FORMAT':'json',
        'FEED_URI': 'image-data.json', #store image data in image-data.json
        'DOWNLOAD_FAIL_ON_DATALOSS': False, #if image download fails (due to various issues), don't send error message, just flag it.
        'DOWNLOAD_DELAY': 0.25 #250 ms download delay, with inbuilt scrapy randomization
    }

    #item fields for the image stats, in the order they appear on the page
    _STAT_FIELDS = ('views', 'faves', 'comments', 'downloads')
    #item fields for the artist account stats, in page order
    _ARTIST_STAT_FIELDS = (
        'artist_deviations', 'artist_comments', 'artist_page_views',
        'artist_scraps', 'artist_watchers', 'artist_critiques',
        'artist_forum_posts', 'artist_faves',
    )
    #item fields for the artist's personal details (some artists do not share their dobs)
    _ARTIST_DETAIL_FIELDS = ('artist_asl', 'artist_dob')

    def parse(self, response):
        """Follow every image post listed on a search-results page.

        Yields one request per thumbnail, handled by ``parse_image``, and
        prints a progress line after every 100 queued posts.
        """
        for img in response.css('div.page-results span.thumb'):

            #thumbnail link. If there isn't a thumbnail, then post is not an image and should be skipped
            thumbnail = img.css('::attr(data-super-img)').get()

            #img_link contains the full link that leads to the individual image post
            img_link = img.css('::attr(href)').get()

            #without a link there is nothing to follow (Request(None) would raise)
            if not thumbnail or not img_link:
                continue

            #urljoin leaves absolute links untouched and resolves relative ones
            yield scrapy.Request(response.urljoin(img_link), callback=self.parse_image)
            self.new_items += 1
            if (self.new_items - self.old_items) == 100:
                self.old_items = self.new_items
                print(f"Downloaded {self.new_items} images...")

        #pagination is currently disabled; original sketch kept for reference:
        #while self.offset < self.offset_limit:
            #self.offset += 24 #DA's natural offset scroll is set at increments of 24
            #next_page = f'https://www.deviantart.com/popular-all-time/?q=sherlock&offset={self.offset}'
            #yield scrapy.Request(next_page, callback=self.parse)

    def parse_image(self, response):
        """Collect the data of one image post, then follow its artist.

        Yields a request for the artist's profile (carrying the item in
        ``meta``), or the item itself when no artist link is available.
        Posts without a downloadable image are skipped.
        """
        #get image url (for downloading via ImagePipeline)
        image_url = response.css('div.dev-view-deviation img ::attr(src)').get()
        if not image_url:
            #an item without an image url cannot be downloaded by the pipeline
            self.logger.warning("No image found on %s, skipping", response.url)
            return

        #initialize image item
        image = ImageItem()
        image['image_urls'] = [response.urljoin(image_url)]

        #get other image info (.get() gives None instead of raising when missing)
        image['titles'] = response.xpath("//a[@class='title']/text()").get()
        image['date_posted'] = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-details']/dl/dd/span/text()").get()

        #check whether image has hashtags (some don't)
        hashtags = response.xpath("//div[@class='dev-about-tags-cc dev-about-breadcrumb']/a/text()").extract()
        if hashtags:
            #the item field is 'hashtags'; an undeclared key raises KeyError
            image['hashtags'] = hashtags

        #get image stats
        raw_stats = response.xpath("//div[@class='dev-right-bar-content dev-metainfo-content dev-metainfo-stats']/dl/dd/text()").extract()

        #keep only the digits of each entry (sometimes irregular data falls in)
        #and discard entries that end up empty
        digits_only = (re.sub(r"\D", "", s) for s in raw_stats)
        stats = [s for s in digits_only if s]

        #the responses are ordered in: views, faves, comments, downloads
        #sometimes comments are disabled, sometimes downloads are disabled;
        #zip stops at the shorter sequence, so extra values cannot overflow
        #NOTE(review): values are matched by position only -- if comments are
        #disabled but downloads are not, downloads presumably lands in
        #'comments'; verify against the page markup
        for field, value in zip(self._STAT_FIELDS, stats):
            image[field] = value

        #get artist info
        artist_names = response.xpath("//small[@class='author']/span/a/text()").extract()
        artist_links = response.xpath("//small[@class='author']/span/a/@href").extract()

        #if no artist name (sometimes artists are banned), mark it as such
        image['artists'] = artist_names[-1] if artist_names else 'Banned'

        if artist_names and artist_links:
            image['artist_urls'] = artist_links[-1]
            #dont_filter: the same artist page is needed once per image; the
            #duplicate filter would otherwise drop every later image by an
            #artist that was already requested
            yield scrapy.Request(
                response.urljoin(artist_links[-1]),
                callback=self.parse_artist,
                errback=self._artist_failed,
                meta={'image': image},
                dont_filter=True,
            )
        else:
            #nothing to follow, just yield the image
            yield image

    def _artist_failed(self, failure):
        """Errback: emit the image without artist data if the profile fails."""
        image = failure.request.meta['image']
        self.logger.warning(
            "Artist page %s could not be fetched: %s",
            image.get('artist_urls'), failure.value,
        )
        yield image

    def parse_artist(self, response):
        """Add the artist's account data to the image item and return it."""
        #get image item for the higher-level parser
        image = response.meta['image']

        #get artist account stats
        artist_stats = response.xpath("//div[@id='super-secret-stats']/div/div/div/strong/text()").extract()
        for field, value in zip(self._ARTIST_STAT_FIELDS, artist_stats):
            image[field] = value

        #get account age and membership details
        age_membership = response.xpath("//a[@href='#super-secret-activity']/div/text()").extract()

        #sometimes the age is wrapped up in a span
        if not age_membership:
            age_membership = response.xpath("//a[@href='#super-secret-activity']/span/div/text()").extract()

        #leave the field unset when neither layout matched
        if age_membership:
            image['account_age'] = age_membership[0]

        #get artist personal details
        artist_details = response.xpath("//div[@id='super-secret-why']/div/div/div/dl/dd/text()").extract()
        for field, value in zip(self._ARTIST_DETAIL_FIELDS, artist_details):
            image[field] = value

        return image
            
# Run the spider from within this process: create the crawler process,
# schedule ImageSpider on it, then start crawling. Per Scrapy's documentation
# start() blocks until the crawl has finished.
process = CrawlerProcess()
process.crawl(ImageSpider)
process.start()

In [None]:
# Load the scraped feed ('image-data.json', the file named by FEED_URI in the
# spider settings) into a DataFrame; the bare `df` shows it when this runs as
# a notebook cell.
df = pd.read_json('image-data.json')
df