# Download New Yorker cartoons

We'll be scraping cartoon images and captions from The New Yorker's [Random cartoon page](https://www.newyorker.com/cartoons/random/).

In [48]:
import html
import json
import os
import time

import requests
from requests.exceptions import Timeout

## Write a class to handle the GET requests to NewYorker.com

In [None]:
Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: en-US,en;q=0.9,fr;q=0.8

Cookie: CN_xid=00354e9d-c0cb-46d1-b808-4068d5bc79bf; ev_sid=5c27a887e4b040a95cdff547; ev_did=5c27a887e4b040a95cdff546; ev_abGroup=4MeterMax; ev_abValue=10; CN_segments=; AMCVS_F7093025512D2B690A490D44%40AdobeOrg=1; pay_ent_smp=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCIsInZlciI6MX0.eyJ1cmxzIjpbIi9hIiwiL2IiLCIvYyIsIi9kIl0sImNudCI6NCwibWF4Ijo0LCJleHAiOjIwMTgxMn0.e9GsYtcfR5lxj1kQ9HPqLGC-Z9s2Eio8xc819O6vifk; _dr=https%3A%2F%2Fwww.google.com%2F; v30=google.com; v39=google.com; s_cc=true; cneplayermuted=0; cneplayercount=1; cneplayervolume=1; s_vnum_m=1546329600950%26vn%3D3; sinvisit_m=true; s_ppn=no%20value; s_pct=null; AMCV_F7093025512D2B690A490D44%40AdobeOrg=102365995%7CMCIDTS%7C17895%7CMCMID%7C59406177472518989811151060165708450564%7CMCAID%7CNONE%7CMCOPTOUT-1546134630s%7CNONE%7CvVersion%7C2.2.0; s_depth=1; s_sq=%5B%5BB%5D%5D; s_nr=1546127944601-Repeat
DNT: 1
Host: www.newyorker.com
If-None-Match: W/"214-1756038895"
Referer: https://www.newyorker.com/cartoons/random/

In [244]:
class Scraper(object):
    """Scrape cartoons from The New Yorker random cartoon page.

    Every instance shares the single class-level ``requests.Session`` below,
    so header changes made by one instance (e.g. ``If-None-Match``) are seen
    by all of them.
    """

    # Create a persistent requests connection
    # NOTE(review): advertising 'br' assumes a Brotli decoder is available to
    # requests/urllib3 in this environment -- confirm.
    _session = requests.Session()
    _session.headers = {'application': 'PyYorker',
                        'If-None-Match': None,
                        'Accept': '*/*',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Connection': 'keep-alive',
                        'DNT': '1',
                        'Host': 'www.newyorker.com',
                        'Referer': 'https://www.newyorker.com/cartoons/random/',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    _SLEEP_MIN = 0.2  # Enforce minimum wait time between API calls (seconds)

    def __init__(self, timeout=5, sleep_time=0.5):
        """ Scraper Constructor

        :param timeout: time before quitting on response (seconds)
        :param sleep_time: time to wait between requests (seconds); values
            below ``_SLEEP_MIN`` are raised to ``_SLEEP_MIN`` at request time
        """

        self.api_root = "https://www.newyorker.com"
        self.timeout = timeout
        self.sleep_time = sleep_time
        self._last_response = None

    def _make_request(self, path):
        """Make a GET request to the API and pause afterwards.

        :param path: path appended to ``self.api_root`` to build the URL
        :return: the ``requests`` response, or ``None`` if the request timed out
        """
        uri = self.api_root + path

        # Make the request
        response = None
        try:
            response = self._session.request("GET", uri, timeout=self.timeout)
            self._last_response = response
            # Not every response carries an ETag; .get() avoids a KeyError and
            # stores None (the same value the header starts with) in that case.
            self._session.headers['If-None-Match'] = response.headers.get('ETag')
        except Timeout as e:
            print("Timeout raised and caught:\n{e}".format(e=e))

        # Enforce rate limiting (also applied after a timeout)
        time.sleep(max(self._SLEEP_MIN, self.sleep_time))
        return response

    def get_random_cartoon(self):
        """Requests a random New Yorker cartoon

        :return: the response from ``_make_request`` (``None`` on timeout)
        """
        endpoint = "/cartoons/random/randomAPI"
        return self._make_request(endpoint)

    def format_cartoon_response(self, response):
        """Cleans up the cartoon response data

        Keeps the original caption under ``caption_raw`` and replaces
        ``caption`` with its HTML-unescaped form.

        :param response: response whose JSON body is iterated as a sequence
            of cartoon dicts, each with a ``caption`` key
        :return: the parsed (and updated) JSON data
        """
        data = response.json()
        for cartoon in data:
            cartoon['caption_raw'] = cartoon['caption']
            cartoon['caption'] = html.unescape(cartoon['caption_raw'])
        return data

    def download_cartoons(self, num_cartoons, save_cartoons=True,
                          max_errors=3, filename="cartoons.json"):
        """Downloads random cartoons from The New Yorker

        Keeps requesting until at least ``num_cartoons`` cartoons have been
        collected or ``max_errors`` requests have failed.

        :param num_cartoons: minimum number of cartoons to collect
        :param save_cartoons: if True, dump the collected cartoons as JSON
        :param max_errors: number of failed requests after which to stop
        :param filename: path of the JSON file written when saving
        :return: tuple ``(cartoons, errors)``; ``errors`` holds the failed
            responses (``None`` for timeouts)
        """
        cartoons, errors = [], []
        t0 = time.time()
        while (len(cartoons) < num_cartoons) and (len(errors) < max_errors):
            r = self.get_random_cartoon()
            batch = []
            if r is not None and r.status_code == 200:
                batch = self.format_cartoon_response(r)
            if batch:
                cartoons.extend(batch)
                print(f"({len(cartoons)}): {cartoons[-1]['caption']}\n")
            else:
                # Timeouts, non-200 responses and empty payloads all count as
                # errors, so the loop cannot spin forever without progress.
                errors.append(r)
        if save_cartoons:
            with open(filename, "w") as outfile:
                json.dump(cartoons, outfile)
        elapsed = time.time() - t0
        print(f"Time elapsed: {elapsed / 60:.2f} minutes.")
        return cartoons, errors

## Test out the scraper

In [110]:
# Create an instance of the cartoon scraper.
# sleep_time=5 is the pause (in seconds) applied after every request.
scraper = Scraper(sleep_time=5)

In [83]:
# Make a single request for a random cartoon.
# `r` is the response returned by the scraper (None if the request timed out).
r = scraper.get_random_cartoon()
r

<Response [200]>

In [85]:
# Show the first cartoon from the parsed response, with its caption HTML-unescaped
scraper.format_cartoon_response(r)[0]

{'id': '2812527',
 'pn': 'a18413',
 'date': '08/29/2014 18:16',
 'src': 'https://www.newyorker.com/wp-content/uploads/2014/09/140908_a18413-600.jpg',
 'caption': '“As you go through life, take time to monetize the roses.”',
 'cartoon_bank_url': 'http://www.condenaststore.com/-sp/_i12246451_.htm',
 'caption_raw': '&ldquo;As you go through life, take time to monetize the roses.&rdquo;'}

## Download many cartoons

In [None]:
# Collect at least 1000 cartoons (stops early after repeated failed requests)
# and write them to cartoons.json; then display the last cartoon collected.
cartoons, errors = scraper.download_cartoons(num_cartoons=1000, save_cartoons=True)
cartoons[-1]

## Read in cartoons as a DataFrame

In [113]:
import pandas as pd

In [127]:
# Load the saved cartoons, drop fully duplicated rows, and index by cartoon id
df = (
    pd.read_json("./cartoons.json")
    .drop_duplicates()
    .set_index("id")
)
print(df.shape)
df.head(3)

(733, 6)


Unnamed: 0_level_0,caption,caption_raw,cartoon_bank_url,date,pn,src
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1557301,“Keep in mind that it was put there by the sha...,&ldquo;Keep in mind that it was put there by t...,,2014-07-14 00:26:00,a18170,https://www.newyorker.com/wp-content/uploads/2...
2877815,“I'm sure you're all familiar with the concept...,&ldquo;I'm sure you're all familiar with the c...,,2014-10-27 17:06:00,aa20,https://www.newyorker.com/wp-content/uploads/1...
2710693,“Can we please go back to playing on our phones?”,&ldquo;Can we please go back to playing on our...,http://www.condenaststore.com/-sp/_i12228701_.htm,2014-08-15 17:22:00,a18471,https://www.newyorker.com/wp-content/uploads/2...


## Download the cartoon images

In [140]:
import urllib.request

In [188]:
# Inspect a single row of the cartoon table (row at position 550)
print(df.iloc[550])

caption                     “Nobody has to  me to like Jeff Bridges.”
caption_raw         &ldquo;Nobody has to  me to like Jeff Bridges....
cartoon_bank_url                                                     
date                                              2014-08-06 12:02:00
pn                                                               aa80
src                 https://www.newyorker.com/wp-content/uploads/1...
Name: 2699631, dtype: object


In [181]:
# Download the image for each cartoon to ./images/<id>.<extension>
IMAGE_DIR = "./images"
# urlretrieve does not create missing directories, so make sure the target
# folder exists before the loop (otherwise every download fails).
os.makedirs(IMAGE_DIR, exist_ok=True)

errors = []
for n, (index, row) in enumerate(df.iterrows()):
    # Report progress on every tenth cartoon
    if n % 10 == 0:
        print(f"({n}/{len(df)}): {row.caption}")
    # File name is the cartoon id plus whatever follows the last "." in the URL
    fn = f'{index}.{row.src.split(".")[-1]}'
    try:
        urllib.request.urlretrieve(row.src, f"{IMAGE_DIR}/{fn}")
    except Exception as e:
        print(f"ERROR ({n}/{len(df)}):\n{e}")
        errors.append(e)
        time.sleep(10)  # back off longer after a failure
    # Give up once more than three downloads have failed
    if len(errors) > 3:
        break
    time.sleep(3)  # pause between downloads

(0/733): “Keep in mind that it was put there by the sharks.”
(10/733): 
(20/733): 
(30/733): “I love how you play them against each other.”
(40/733): 
(50/733): “Can I ask you a personal question?”
(60/733): “It's sad, but it's not laugh-out-loud sad.”
(70/733): “Product placement in a cartoon? You're crazy! Crazy delicious, like a smooth, refreshing Pepsi-Cola.”
(80/733): “You wanna watch the Hunting Channel or the Gathering Network?”
(90/733): “I do my part.”
(100/733): “It's the downstairs neighbors again—they say you're technically proficient, but there's not enough emotion.”
(110/733): “This is what happens when ethical standards are set artificially high.”
(120/733): “That was before I found this amazing new way to earn $$$ working from home.”
(130/733): “What do you mean you 'know someone's been here'?”
(140/733): 
(150/733): “It was your first day—why not give it another twelve years?”
(160/733): 
(170/733): “Switching to drones has made having to be everywhere at once much mor