In [32]:
import requests
import json
import os
import time

def get_data(query, page_start, page_end, short_pause=10, long_pause=60, timeout=30):
    """Download ArtStation project-search result pages and store each one on disk.

    Every page in ``range(page_start, page_end)`` (``page_end`` is exclusive)
    is requested with 75 results per page and written to
    ``data/{query}/data_{page_number}.json``. Pages whose file already exists
    are skipped without any network request or pause.

    Args:
        query: Search term; used both in the request URL and in the output path.
        page_start: First page to fetch. Must be at least 1.
        page_end: Page number to stop before (not fetched itself).
        short_pause: Seconds to sleep after every page that was downloaded.
        long_pause: Extra seconds to sleep after every 10th downloaded page.
        timeout: Seconds to wait for the server before ``requests`` gives up
            on a single request.

    Returns:
        1 when the whole page range was processed. 0 when the run stopped
        early: invalid ``page_start``, an HTTP 429 response, a response body
        whose ``data`` list cannot be read, or an empty ``data`` list (which
        most likely means the last page of results was passed).
    """
    if page_start < 1:
        print("Starting page number has to be greater than 0.")
        print("Process exiting with error.")
        return 0

    saved_files = []
    urls_scraped = 0
    # One session is shared by all requests of this run. It is created lazily
    # so a run that only skips already-saved pages never opens a connection.
    client = None

    try:
        for page_number in range(page_start, page_end):

            url = f"https://www.artstation.com/api/v2/search/projects.json?page={page_number}&per_page=75&query={query}&sorting=relevance"

            filename = f'data/{query}/data_{page_number}.json'

            # only scrape url if json file of the url doesn't already exist
            if os.path.exists(filename):
                print(f'{filename} already exists. Skipping the current url.')
                continue

            if client is None:
                client = requests.Session()
            r = client.get(url, timeout=timeout)
            print(f"Scraping {url} {r.status_code}: {r.reason}")

            # Check the rate limit BEFORE touching the body: a throttled
            # response is not guaranteed to be JSON or to carry a "data" key.
            if r.status_code == 429:
                print("Rate-limit exceeded. Wait for a while and try running scraper again.")
                print("Process exiting with error.")
                return 0

            try:
                items = json.loads(r.text)['data']
            except (ValueError, KeyError, TypeError):
                print(f"Unexpected response body for {url}; could not read its 'data' list.")
                print("Process exiting with error.")
                return 0

            # if "data" in response text is empty, the max page has probably been reached.
            if not items:
                print(f"Data list is empty. Query {query} has no data on page {page_number}.")
                print(f"Max page for this query has probably been reached. Ending scraping process.")
                return 0

            # keep track of number of urls scraped
            urls_scraped += 1

            # save file (create data/{query}/ on first use)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            print(f'Saving file as {filename}')
            with open(filename, 'w', encoding='utf-8') as f:
                # NOTE(review): r.text is a str, so this writes the payload as one
                # JSON-encoded string, not as a JSON object. Left unchanged because
                # files already saved by earlier runs use this format; readers need
                # json.loads(json.load(f)). Confirm with downstream code before changing.
                json.dump(r.text, f, ensure_ascii=False, indent=4)
            saved_files.append(filename)

            # take a short pause per url scraped
            print(f"Pausing scraper for {short_pause} seconds.")
            time.sleep(short_pause)

            # Take a long pause after every 10th download. This lives inside the
            # download path so skipped pages cannot re-trigger the same pause.
            if urls_scraped % 10 == 0:
                print(f"Scraped {urls_scraped} urls.")
                print(f"Pausing scraper for {long_pause} seconds.")
                time.sleep(long_pause)
    finally:
        if client is not None:
            client.close()

    # process end summary
    print(f"Files saved: {saved_files}")
    print(f"Process finished running.")
    return 1
        


In [35]:
# Search terms to collect; each one is saved under its own data/<term>/ folder.
genres = [
    'cyberpunk',
    'noir',
    'horror',
    'western',
    'cartoon',
]

# Fetch pages 125-149 for the first term (the end page is exclusive).
get_data(genres[0], page_start=125, page_end=150)

Scraping https://www.artstation.com/api/v2/search/projects.json?page=125&per_page=75&query=cyberpunk&sorting=relevance 200: OK
Saving file as data/cyberpunk/data_125.json
Pausing scraper for 10 seconds.
Scraping https://www.artstation.com/api/v2/search/projects.json?page=126&per_page=75&query=cyberpunk&sorting=relevance 200: OK
Saving file as data/cyberpunk/data_126.json
Pausing scraper for 10 seconds.
Scraping https://www.artstation.com/api/v2/search/projects.json?page=127&per_page=75&query=cyberpunk&sorting=relevance 200: OK
Saving file as data/cyberpunk/data_127.json
Pausing scraper for 10 seconds.
Scraping https://www.artstation.com/api/v2/search/projects.json?page=128&per_page=75&query=cyberpunk&sorting=relevance 200: OK
Saving file as data/cyberpunk/data_128.json
Pausing scraper for 10 seconds.
Scraping https://www.artstation.com/api/v2/search/projects.json?page=129&per_page=75&query=cyberpunk&sorting=relevance 200: OK
Saving file as data/cyberpunk/data_129.json
Pausing scraper f

0