# Download New Yorker cartoons

We'll be scraping cartoon images and captions from The New Yorker's [random cartoon page](https://www.newyorker.com/cartoons/random/).

In [48]:
import requests
from requests.exceptions import Timeout
import html
import time
import json

## Write a class to handle the GET requests to NewYorker.com

In [75]:
class Scraper:
    """Scrape cartoons from The New Yorker random cartoon page.

    Every instance shares one class-level ``requests.Session``. Each request
    is followed by a pause of at least ``_SLEEP_MIN`` seconds so that
    consecutive calls are rate limited.
    """

    # One session shared by all instances so requests go through the same
    # requests.Session object.
    _session = requests.Session()
    # NOTE: assigning a dict here replaces the session's header mapping
    # (rather than adding to it); only these two headers are configured.
    _session.headers = {'application': 'PyToons',
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    _SLEEP_MIN = 0.2  # Enforce minimum wait time between API calls (seconds)

    def __init__(self, timeout=5, sleep_time=0.5):
        """Scraper constructor.

        :param timeout: time before quitting on response (seconds)
        :param sleep_time: time to wait after each request (seconds); values
            below ``_SLEEP_MIN`` are raised to ``_SLEEP_MIN`` at request time
        """
        self.api_root = "https://www.newyorker.com"
        self.timeout = timeout
        self.sleep_time = sleep_time

    def _make_request(self, path, method='GET'):
        """Make a request to the API.

        :param path: path appended to ``self.api_root`` to build the URL
        :param method: HTTP method name passed to the session
        :return: the response object, or ``None`` if the request timed out
            (the timeout is printed, not re-raised); other request errors
            are not caught here and propagate to the caller
        """
        uri = self.api_root + path

        # Stays None when the request times out.
        response = None
        try:
            response = self._session.request(method, uri, timeout=self.timeout)
        except Timeout as e:
            print("Timeout raised and caught:\n{e}".format(e=e))

        # Enforce rate limiting, whether or not the request succeeded.
        time.sleep(max(self._SLEEP_MIN, self.sleep_time))
        return response

    def get_random_cartoon(self):
        """Request random New Yorker cartoons.

        :return: whatever ``_make_request`` returns for the random-cartoon
            endpoint (a response object, or ``None`` on timeout)
        """
        endpoint = "/cartoons/random/randomAPI"
        return self._make_request(endpoint)

    def format_cartoon_response(self, response):
        """Clean up the cartoon response data.

        Each cartoon's original caption is preserved under ``'caption_raw'``
        and ``'caption'`` is replaced by the same text with HTML entities
        decoded.

        :param response: response returned by ``get_random_cartoon``; may be
            ``None`` when that request timed out
        :return: the parsed JSON payload with captions cleaned, or an empty
            list when ``response`` is ``None``
        """
        # _make_request returns None on timeout; treat that as "no cartoons"
        # instead of failing on None.json().
        if response is None:
            return []

        data = response.json()
        for cartoon in data:
            cartoon['caption_raw'] = cartoon['caption']
            cartoon['caption'] = html.unescape(cartoon['caption_raw'])
        return data

## Test out the scraper

In [76]:
# Create an instance of the cartoon scraper with its default settings
# (5 s request timeout, 0.5 s pause after each request)
scraper = Scraper()

In [77]:
# Query the random-cartoon endpoint; `r` is the raw response object
# (it would be None if the request had timed out)
r = scraper.get_random_cartoon()
r

<Response [200]>

In [78]:
# Parse the JSON payload and decode HTML entities in each caption;
# the undecoded text is kept under 'caption_raw'
scraper.format_cartoon_response(r)

[{'id': '1556841',
  'pn': 'diffee-2004-08-30',
  'date': '07/14/2014 00:24',
  'src': 'https://www.newyorker.com/wp-content/uploads/2014/04/140421_diffee-2004-08-30-600.jpg',
  'caption': "“I don't think you're getting enough stress.”",
  'cartoon_bank_url': '',
  'caption_raw': "&ldquo;I don't think you're getting enough stress.&rdquo;"},
 {'id': '2805829',
  'pn': 'a10763',
  'date': '08/25/2014 14:34',
  'src': 'https://www.newyorker.com/wp-content/uploads/2005/09/050912_a10763-600.jpg',
  'caption': '“I had my own blog for a while, but I decided to go back to just pointless, incessant barking.”',
  'cartoon_bank_url': '',
  'caption_raw': '&ldquo;I had my own blog for a while, but I decided to go back to just pointless, incessant barking.&rdquo;'}]