# Download New Yorker cartoons

We'll be scraping cartoon metadata (image URLs and captions) from The New Yorker's [Random cartoon page](https://www.newyorker.com/cartoons/random/).

In [48]:
import requests
from requests.exceptions import Timeout
import html
import time
import json

## Write a class to handle the GET requests to NewYorker.com

In [90]:
class Scraper:
    """Scrape cartoons from The New Yorker random cartoon page.

    All instances share one ``requests.Session`` (the class attribute
    ``_session``), so connections and headers are reused across requests.
    A pause is enforced after every request to rate-limit the scraper.
    """

    # Create a persistent requests connection
    _session = requests.Session()
    # Merge into the session's default headers rather than replacing them, so
    # requests' defaults (e.g. Accept-Encoding) and the case-insensitive
    # header mapping are preserved.
    _session.headers.update({
        'application': 'PyToons',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    })
    _SLEEP_MIN = 0.2  # Enforce minimum wait time between API calls (seconds)
    _MAX_ERRORS = 3   # Failed requests tolerated by download_cartoons before it gives up

    def __init__(self, timeout=5, sleep_time=0.5):
        """ Scraper Constructor

        :param timeout: time before quitting on response (seconds)
        :param sleep_time: time to wait between requests (seconds); values
            below ``_SLEEP_MIN`` are raised to ``_SLEEP_MIN`` at request time
        """

        self.api_root = "https://www.newyorker.com"
        self.timeout = timeout
        self.sleep_time = sleep_time

    def _make_request(self, path, method='GET'):
        """Make a request to the API

        :param path: path appended to ``self.api_root`` (e.g. ``"/cartoons/random/randomAPI"``)
        :param method: HTTP method name passed to ``Session.request``
        :return: the ``requests.Response``, or ``None`` if the request timed
            out or failed with any other ``requests`` exception
        """
        uri = self.api_root + path

        # Make the request
        response = None
        try:
            response = self._session.request(method, uri, timeout=self.timeout)
        except Timeout as e:
            print("Timeout raised and caught:\n{e}".format(e=e))
        except requests.exceptions.RequestException as e:
            # Connection resets, DNS failures, etc. are reported the same way
            # as timeouts (None) so one network hiccup does not abort a long
            # download; Timeout is handled above because it is a subclass.
            print("Request failed:\n{e}".format(e=e))

        # Enforce rate limiting (also after a failed request)
        time.sleep(max(self._SLEEP_MIN, self.sleep_time))
        return response

    def get_random_cartoon(self):
        """Requests a random New Yorker cartoon

        :return: the raw response from the random-cartoon endpoint, or
            ``None`` if the request failed
        """
        endpoint = "/cartoons/random/randomAPI"
        return self._make_request(endpoint)

    def format_cartoon_response(self, response):
        """Cleans up the cartoon response data

        For every cartoon record the original caption is kept under
        ``'caption_raw'`` and ``'caption'`` is replaced by its HTML-unescaped
        form. A missing or non-string caption yields an empty ``'caption'``.

        :param response: response whose JSON body is a list of cartoon dicts
        :return: the list of cartoon dicts, modified in place
        :raises ValueError: if the body is not valid JSON or is not a list
        """
        data = response.json()
        if not isinstance(data, list):
            raise ValueError(
                "Expected a JSON list of cartoons, got {t}".format(t=type(data).__name__))
        for cartoon in data:
            raw_caption = cartoon.get('caption')
            cartoon['caption_raw'] = raw_caption
            # html.unescape only accepts strings; tolerate captionless records
            cartoon['caption'] = html.unescape(raw_caption) if isinstance(raw_caption, str) else ''
        return data

    def download_cartoons(self, num_cartoons):
        """Downloads random cartoons from The New Yorker

        Keeps requesting the random-cartoon endpoint until ``num_cartoons``
        records have been collected or ``_MAX_ERRORS`` requests have failed.
        A request counts as failed when it raises, returns a non-200 status,
        or returns a body that is unparsable or contains no cartoons.

        :param num_cartoons: number of cartoons wanted
        :return: list of at most ``num_cartoons`` cartoon dicts (fewer if the
            error limit was reached first)
        """
        cartoons = []
        error_count = 0
        t0 = time.time()
        while (len(cartoons) < num_cartoons) and (error_count < self._MAX_ERRORS):
            r = self.get_random_cartoon()
            batch = []
            if r is not None and r.status_code == 200:
                try:
                    batch = self.format_cartoon_response(r)
                except ValueError as e:
                    print("Could not parse cartoon response:\n{e}".format(e=e))
            if batch:
                # One response may carry several cartoons; keep only as many
                # as are still needed so the result never exceeds the request.
                cartoons.extend(batch[:num_cartoons - len(cartoons)])
                print(f"({len(cartoons)}): {cartoons[-1]['caption']}\n")
            else:
                # Empty payloads count as errors; otherwise the loop could
                # spin forever without ever collecting a cartoon.
                error_count += 1
        elapsed = time.time() - t0
        print(f"Time elapsed: {elapsed / 60:.2f} minutes.")
        return cartoons

## Test out the scraper

In [87]:
# Create an instance of the cartoon scraper. sleep_time=1 makes it pause
# one second after every request (the constructor default is 0.5 seconds).
scraper = Scraper(sleep_time=1)

In [83]:
# Make a single request for a random cartoon. `r` holds the raw response
# (None if the request timed out); the bare `r` on the last line displays it
# as the cell output. The next cell reads `r`.
r = scraper.get_random_cartoon()
r

<Response [200]>

In [85]:
# Parse the response fetched in the previous cell, HTML-unescape its captions,
# and display the first cartoon record.
scraper.format_cartoon_response(r)[0]

{'id': '2812527',
 'pn': 'a18413',
 'date': '08/29/2014 18:16',
 'src': 'https://www.newyorker.com/wp-content/uploads/2014/09/140908_a18413-600.jpg',
 'caption': '“As you go through life, take time to monetize the roses.”',
 'cartoon_bank_url': 'http://www.condenaststore.com/-sp/_i12246451_.htm',
 'caption_raw': '&ldquo;As you go through life, take time to monetize the roses.&rdquo;'}

## Download many cartoons

In [88]:
# Request 10 random cartoons (a progress line is printed for each successful
# response; fewer may come back if 3 requests fail), then display the last
# cartoon record collected.
cartoons = scraper.download_cartoons(num_cartoons=10)
cartoons[-1]

(2): “Same thing every September. He begins to doubt the existence of man, then football season begins, and he snaps out of it.”

(4): “Wow—she is stunning.”

(6): “I do my part.”

(8): “Do you mean good, or good for a pumpkin?”

(10): “Don't you want to have parents who can brag about their children?”

Time elapsed: 0.06 minutes.


{'id': '2659342',
 'pn': 'a18393',
 'date': '07/25/2014 15:36',
 'src': 'https://www.newyorker.com/wp-content/uploads/2014/07/a18393.png',
 'caption': "“Don't you want to have parents who can brag about their children?”",
 'cartoon_bank_url': '',
 'caption_raw': "&ldquo;Don't you want to have parents who can brag about their children?&rdquo;"}