<h3>Data Scraping</h3>

In [1]:
#import basic libraries
import pandas as pd
import re
import logging
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ImageItem(scrapy.Item):

    #direct link to image file for downloading via ImagePipeline
    image_urls = scrapy.Field()
    
    #link to specific image page for scraping more stats
    page_links = scrapy.Field()
    
    #image attributes
    titles = scrapy.Field() #image title
    date_posted = scrapy.Field() #date posted
    hashtags = scrapy.Field() #hashtags
    
    #image stats
    faves = scrapy.Field() #number of faves of the current image
    comments = scrapy.Field() #number of comments of the current image
    views = scrapy.Field() #number of views of the current image
    
    #artist details
    artists = scrapy.Field() #artist's name
    artist_urls = scrapy.Field() #link to the artist's account
    artist_page_views = scrapy.Field() #number of total page views
    artist_deviations = scrapy.Field() #number of images posted by the artist
    artist_watchers = scrapy.Field() #number of accounts following the artist
    artist_watching = scrapy.Field() #number of accounts the artist is following
    artist_favourites = scrapy.Field() #number of total faves received
    artist_comments_made = scrapy.Field() #number of comments made
    artist_comments_received = scrapy.Field() #number of total comments received

    artist_account_age = scrapy.Field() #account age
    
    # to be filled in by ImagePipeline
    image_paths = scrapy.Field() #location of image in local storage
    
    
class MyImagesPipeline(ImagesPipeline):
    '''
    Image pipeline for downloading images.
    '''
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

In [3]:
class ImageSpider(scrapy.Spider):
    
    name = 'images'
    
    start_urls = ['https://www.deviantart.com/search/deviations/visual-art/original-work?order=popular-all-time&page=0&q=cyberpunk']
    
    #initialize offset at 0
    page = 0
    #set offset limit to control the amount of images downloaded
    page_limit = 10
    

    custom_settings = {
        'LOG_LEVEL': logging.INFO,
        'ITEM_PIPELINES': {'__main__.MyImagesPipeline': 1}, #enable image download
        'IMAGES_STORE': 'cyberpunk/images', #store images 
        'FEED_FORMAT':'json',                                
        'FEED_URI': 'cyberpunk/image-data.json', #store image data
        'DOWNLOAD_FAIL_ON_DATALOSS': False, #if image download fails (due to various issues), don't send error message, just flag it.
        #'DOWNLOAD_DELAY': 0.25 #250 ms download delay, with inbuilt scrapy randomization
    }

    def parse(self, response):
        
        #get list of image links from the page
        img_links = response.css('div[class=_2tv7Y] a[data-hook="deviation_link"]::attr(href)').getall()
        
        for link in img_links:
            
            yield scrapy.Request(link, callback = self.parse_image)
                
        #go to next page
        while self.page < self.page_limit:
            self.page += 1 #increment page by 1
            next_page = f'https://www.deviantart.com/search/deviations/visual-art/original-work?order=popular-all-time&page={self.page}&q=cyberpunk'
            yield scrapy.Request(next_page, callback=self.parse)
        
    def parse_image(self, response):
        
        #initialize image item
        image = ImageItem()
        
        #get image url (for downloading via ImagePipeline)
        image["image_urls"] = [response.css('div[data-hook="art_stage"] img::attr(src)').get()]
        #get other image info
        image["page_links"] = response.url
        image['titles'] = response.css('div[class="_3qGVQ"]::text').get()
        image['date_posted'] = response.css('div[class="_3XxFW"]::text').getall()[-1]
        
        #check whether image has hashtags (some don't)
        hashtag = response.css('div[class="_2ogLQ"] span::text').getall()
        if hashtag: image['hashtags'] = hashtag
        
        #get image stats
        stats =  response.css('div[class="hYJJ_"] span::text').getall()
        
        #remove blanks from list and limit to first 3 items (4th item onwards is irrelevant)
        stats = ''.join(stats).split()[:3]
        
        #the responses are ordered in: faves, comments, views
        headers = ['faves','comments','views']
            
        #get artist info
        artist_name = response.css('a[data-hook="user_link"]::attr(title)').get()
        
        if artist_name: 
            image['artists'] = response.css('a[data-hook="user_link"]::attr(title)').get()
            artist_gallery = response.css('a[data-hook="user_link"]::attr(href)').get() 
            image['artist_urls'] = artist_gallery.replace('gallery','about') #replace the /gallery pointer to /about   
            request = scrapy.Request(image['artist_urls'], callback=self.parse_artist, meta={'image':image})
            yield request
        else: #if no artist name (sometimes artists are banned), just yield the image
            image['artists'] = 'Banned'
            yield image
            
    def parse_artist(self, response):
        
        #get image item for the higher-level parser
        image = response.meta['image']
        
        #get stat data
        artist_stats = response.css('div[class="_1loOw"]::text').getall()
        
        #get stat headers
        headers = response.css('div[class="_1loOw"] span::text').getall()
        #prefix 'artist', lowercase, and replace blank space with underscore to make headers neat
        headers = ['artist_' + s.lower().replace(' ','_') for s in headers]
        
        #assign stats to headers
        for i in range(len(artist_stats)):
            image[headers[i]] = artist_stats[i]
        
        #get artist personal info
        personal_info = response.css('div[class="_2B4Yo _3N4ed"] span::text').getall()     
        image['artist_account_age'] = personal_info[-1]
        
        return image
            
process = CrawlerProcess()
process.crawl(ImageSpider)
process.start()

2019-09-16 11:43:20 [scrapy.utils.log] INFO: Scrapy 1.5.2 started (bot: scrapybot)
2019-09-16 11:43:20 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 17.5.0, Python 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 18.0.0 (OpenSSL 1.1.1b  26 Feb 2019), cryptography 2.6.1, Platform Darwin-18.7.0-x86_64-i386-64bit
2019-09-16 11:43:20 [scrapy.crawler] INFO: Overridden settings: {'DOWNLOAD_FAIL_ON_DATALOSS': False, 'FEED_FORMAT': 'json', 'FEED_URI': 'cyberpunk/image-data.json', 'LOG_LEVEL': 20}
2019-09-16 11:43:20 [scrapy.extensions.telnet] INFO: Telnet Password: db8a9db792a67dfb
2019-09-16 11:43:20 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats

In [4]:
df = pd.read_json('cyberpunk/image-data.json')
df

Unnamed: 0,artist_account_age,artist_comments_made,artist_comments_received,artist_deviations,artist_favourites,artist_page_views,artist_urls,artist_watchers,artist_watching,artists,date_posted,hashtags,image_paths,image_urls,page_links,titles
0,Deviant for 7 years,15K,9.8K,147,6.9K,152.3K,https://www.deviantart.com/seerlight/about,13.6K,943,SeerLight,"December 10, 2017","[architecture, background, city, cityscape, fa...",[full/6d4780ab2bedba3e091dc77381b04f622202a889...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/seerlight/art/Cyber...,Cyberpunk Streets
1,Deviant for 10 years,3.1K,717,138,167,222.7K,https://www.deviantart.com/ptitvinc/about,9.9K,117,ptitvinc,"April 7, 2015","[boat, cyber, cyberpunk, digital, fiction, har...",[full/91c65c648f93e44db016b7d0282290fd7143890d...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/ptitvinc/art/cyberp...,cyberpunk port town
2,Deviant for 14 years,878,1K,253,141,676.4K,https://www.deviantart.com/clintcearley/about,25.2K,66,ClintCearley,"March 5, 2018","[android, animated, artificial, brain, cel, ch...",[full/6728f90a91ead10f6dc4612430f9c98b94a208a2...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/clintcearley/art/Cy...,Cyberpunk
3,Deviant for 11 years,4.9K,2.8K,691,1.7K,1.4M,https://www.deviantart.com/anndr/about,40.6K,846,anndr,"June 30, 2016","[blue, china, chinatown, cyberpunk, dark, futu...",[full/7f1456f317b85ec3366fd6f1169ca6516caf774f...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/anndr/art/cyberpunk...,cyberpunk
4,Deviant for 11 years,1.4K,113,293,68,114.3K,https://www.deviantart.com/ianllanas/about,3.4K,47,ianllanas,"August 23, 2016","[armor, cyberpunk, sciencefiction]",[full/ddc36941022fb78d45f2de12492db54094b9d12e...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/ianllanas/art/Cyber...,Cyberpunk Character Design
5,Deviant for 5 years,96,92,138,5,76.1K,https://www.deviantart.com/zudartslee/about,5.1K,24,Zudartslee,"September 21, 2018","[concept, cyberpunk, art]",[full/b055a0a5e39578d202244be1142c8f0491bf1db7...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/zudartslee/art/Cybe...,Cyberpunk Study 3hrs
6,Deviant for 3 years,,,,,,https://www.deviantart.com/coal-sekitan,,,Coal-Sekitan,"January 19, 2012",,[full/99b301ed0cd870f75698b329767fd5ca2b1e66e9...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/mjbauer/art/Cyberpu...,Cyberpunk
7,Deviant for 15 years,1.2K,43.6K,605,72,11.5M,https://www.deviantart.com/twokinds/about,59.8K,43,Twokinds,"April 29, 2019","[cyberpunk, raine]",[full/51e4b5116785a77c358da5fb09c0c48d8048659e...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/twokinds/art/Cyberp...,Cyberpunk Raine
8,Deviant for 8 years,1.8K,225,277,293,107.3K,https://www.deviantart.com/artificialdesign/about,3.8K,203,artificialdesign,"January 18, 2012",,[full/92dbc4bd407d521da76873505d1184e6bdfc1e8c...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/artificialdesign/ar...,Cyberpunk Courier
9,Deviant for 10 years,1.4K,407,513,1.1K,240K,https://www.deviantart.com/anastasia-berry/about,9.6K,1.1K,Anastasia-berry,"December 27, 2017","[synth, art, cyberpunk, cyberpunkgirl, retrowa...",[full/d192ebf02c635e7f834bd9d36b546e0f96e5b08f...,[https://images-wixmp-ed30a86b8c4ca887773594c2...,https://www.deviantart.com/anastasia-berry/art...,Cyberpunk Girl
