# Download New Yorker cartoons

We'll be scraping images and captions from The New Yorker's [Random cartoon page](https://www.newyorker.com/cartoons/random/).

In [48]:
import html
import json
import os
import time
import urllib.request

import requests
from requests.exceptions import Timeout

## Write a class to handle the GET requests to NewYorker.com

In [272]:
class Scraper(object):
    """Scrape cartoons from The New Yorker random cartoon page.

    The ``requests.Session`` is a class attribute, so it (and any header
    changes made to it) is shared by every ``Scraper`` instance.
    """

    # Create a persistent requests connection
    _session = requests.Session()
    # NOTE(review): advertising 'br' in Accept-Encoding presumably requires a
    # brotli package to be installed for the body to be decodable -- confirm.
    _session.headers = {'application': 'PyYorker',
                        'If-None-Match': None,
                        'Accept': '*/*',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Connection': 'keep-alive',
                        'DNT': '1',
                        'Host': 'www.newyorker.com',
                        'Referer': 'https://www.newyorker.com/cartoons/random/',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    _SLEEP_MIN = 0.2  # Enforce minimum wait time between API calls (seconds)

    def __init__(self, timeout=5, sleep_time=0.5):
        """ Scraper Constructor

        :param timeout: time before quitting on response (seconds)
        :param sleep_time: time to wait between requests (seconds); values
            below ``_SLEEP_MIN`` are raised to ``_SLEEP_MIN`` at request time
        """

        self.api_root = "https://www.newyorker.com"
        self.timeout = timeout
        self.sleep_time = sleep_time

    def _make_request(self, path):
        """Make a GET request to the API.

        :param path: path appended to ``self.api_root``
        :return: the ``requests.Response``, or ``None`` if the request timed out
        """
        uri = self.api_root + path

        # Make the request
        response = None
        try:
            response = self._session.request("GET", uri, timeout=self.timeout)
        except Timeout as e:
            print("Timeout raised and caught:\n{e}".format(e=e))
        else:
            # Not every response carries an ETag header; only remember it when
            # present instead of raising KeyError on a successful request.
            etag = response.headers.get('ETag')
            if etag is not None:
                self._session.headers['If-None-Match'] = etag

        # Enforce rate limiting
        time.sleep(max(self._SLEEP_MIN, self.sleep_time))
        return response

    def get_random_cartoon(self):
        """Requests a random New Yorker cartoon.

        :return: the ``requests.Response``, or ``None`` if the request timed out
        """
        endpoint = "/cartoons/random/randomAPI"
        return self._make_request(endpoint)

    def download_image(self, response):
        """Saves a cartoon image to the local ``./images`` directory.

        The file is named ``<id>.<extension>``, where the extension is taken
        from the text after the last ``.`` in the image URL.

        :param response: cartoon dict providing the ``'id'`` and ``'src'`` keys
        :return: None
        """
        image_dir = "./images"
        # urlretrieve does not create missing directories
        os.makedirs(image_dir, exist_ok=True)
        extension = response['src'].split('.')[-1]
        fn = f"{response['id']}.{extension}"
        urllib.request.urlretrieve(response['src'], f"{image_dir}/{fn}")
        time.sleep(1)  # pause between image downloads
        return None

    def format_cartoon_response(self, response):
        """Cleans up the cartoon response data.

        Keeps the original caption under ``'caption_raw'`` and stores the
        HTML-unescaped text under ``'caption'``.

        :param response: ``requests.Response`` whose JSON body is iterated as
            a collection of cartoon dicts
        :return: the decoded (and modified) JSON data
        """
        data = response.json()
        for cartoon in data:
            cartoon['caption_raw'] = cartoon['caption']
            cartoon['caption'] = html.unescape(cartoon['caption_raw'])
        return data

    def download_cartoons(self, num_cartoons, save_images=True, max_errors=3):
        """Downloads random cartoons from The New Yorker.

        Requests cartoons until at least ``num_cartoons`` have been collected
        or ``max_errors`` failed requests have occurred, then writes the
        collected cartoon info to ``cartoons.json``.

        :param num_cartoons: number of cartoons to collect before stopping
        :param save_images: if True, also download each cartoon's image
        :param max_errors: number of failed requests tolerated before stopping
        :return: tuple ``(cartoons, errors)``; ``errors`` holds the failed
            responses, with ``None`` standing for a timed-out request
        """
        t0 = time.time()
        cartoons, errors = [], []
        while (len(cartoons) < num_cartoons) and (len(errors) < max_errors):
            # Make a request to the random cartoon page
            r = self.get_random_cartoon()
            # r is None after a timeout; any status other than 200 (including
            # a body-less 304) must not reach the JSON parsing below.
            if r is None or r.status_code != 200:
                errors.append(r)
                status = "no response (timeout)" if r is None else r.status_code
                print(f"ERROR: {status}")
                time.sleep(10)
                continue

            # Add cartoon info (captions, URLs, etc.) to list
            formatted = self.format_cartoon_response(r)
            cartoons.extend(formatted)
            if formatted:
                # Skip the progress line for an empty payload; there would be
                # no new caption to show (and no cartoons[-1] on the first pass)
                print(f"({len(cartoons)}): {cartoons[-1]['caption']}\n")

            # Download the cartoon images and save locally
            if save_images:
                for cartoon in formatted:
                    self.download_image(cartoon)

        # Save the cartoon info locally
        with open("cartoons.json", "w") as outfile:
            json.dump(cartoons, outfile)
        elapsed = time.time() - t0
        print(f"Time elapsed: {elapsed / 60:.2f} minutes.")
        return cartoons, errors

## Test out the scraper

In [110]:
# Create an instance of the cartoon scraper that waits 2 seconds between requests
scraper = Scraper(sleep_time=2)

In [83]:
# Make a single request for a random cartoon
# (r is None if the request timed out)
r = scraper.get_random_cartoon()
r

<Response [200]>

In [85]:
# Show the first cartoon from the parsed response, with its caption HTML-unescaped
scraper.format_cartoon_response(r)[0]

{'id': '2812527',
 'pn': 'a18413',
 'date': '08/29/2014 18:16',
 'src': 'https://www.newyorker.com/wp-content/uploads/2014/09/140908_a18413-600.jpg',
 'caption': '“As you go through life, take time to monetize the roses.”',
 'cartoon_bank_url': 'http://www.condenaststore.com/-sp/_i12246451_.htm',
 'caption_raw': '&ldquo;As you go through life, take time to monetize the roses.&rdquo;'}

## Download many cartoons

In [None]:
# Long-running: keeps requesting until 10,000 cartoons are collected (or too many
# requests fail), saves each image under ./images and writes cartoons.json at the end
cartoons, errors = scraper.download_cartoons(num_cartoons=10_000, save_images=True)
cartoons[-1]

## Read in cartoons as a DataFrame

In [113]:
import pandas as pd

In [127]:
# Load the saved cartoon info, drop exact duplicate rows and index by cartoon id
df = (
    pd.read_json("./cartoons.json")
    .drop_duplicates()
    .set_index("id")
)
print(df.shape)
df.head(3)

(733, 6)


Unnamed: 0_level_0,caption,caption_raw,cartoon_bank_url,date,pn,src
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1557301,“Keep in mind that it was put there by the sha...,&ldquo;Keep in mind that it was put there by t...,,2014-07-14 00:26:00,a18170,https://www.newyorker.com/wp-content/uploads/2...
2877815,“I'm sure you're all familiar with the concept...,&ldquo;I'm sure you're all familiar with the c...,,2014-10-27 17:06:00,aa20,https://www.newyorker.com/wp-content/uploads/1...
2710693,“Can we please go back to playing on our phones?”,&ldquo;Can we please go back to playing on our...,http://www.condenaststore.com/-sp/_i12228701_.htm,2014-08-15 17:22:00,a18471,https://www.newyorker.com/wp-content/uploads/2...
