In [None]:
import base64
import html
import logging
import pickle
import re
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

In [None]:
# Root logger setup: append every record (DEBUG and above) to scraping.log,
# each line stamped with a month-day-year timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s - %(message)s',
    datefmt='%m-%d-%Y %H:%M:%S',
    filename="scraping.log",
    filemode="a",
    level=logging.DEBUG,
)

In [None]:
# On-disk pickle files backing the two in-memory caches (posts and images).
pickle_post_file, pickle_img_file = "post_cache.pkl", "img_cache.pkl"

In [None]:
# Reload previously downloaded pages so a rerun does not request them again.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load cache files that this notebook itself wrote.
try:
    # The context manager closes the file handle even if unpickling fails.
    with open(pickle_post_file, "rb") as cache_file:
        post_cache = pickle.load(cache_file)
    logging.info("Loaded post cache")
except FileNotFoundError:
    # Expected on the first run: nothing has been cached yet.
    post_cache = {}
except Exception as err:
    # Best effort: an unreadable or corrupt cache must not stop the scrape,
    # but record why it was discarded instead of swallowing the error.
    logging.warning("Could not load post cache (%s); starting empty", err)
    post_cache = {}

In [None]:
# Reload previously downloaded image bytes so a rerun does not request them again.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load cache files that this notebook itself wrote.
try:
    # The context manager closes the file handle even if unpickling fails.
    with open(pickle_img_file, "rb") as cache_file:
        img_cache = pickle.load(cache_file)
    logging.info("Loaded image cache")
except FileNotFoundError:
    # Expected on the first run: nothing has been cached yet.
    img_cache = {}
except Exception as err:
    # Best effort: an unreadable or corrupt cache must not stop the scrape,
    # but record why it was discarded instead of swallowing the error.
    logging.warning("Could not load image cache (%s); starting empty", err)
    img_cache = {}

In [None]:
def scrape_scc(url):
    """Fetch one post and turn it into a table-of-contents link plus body HTML.

    The raw page is taken from ``post_cache`` when the URL is already there;
    otherwise it is downloaded, stored in ``post_cache``, and the function
    sleeps for one second before continuing.

    Args:
        url: Address of the post to scrape.

    Returns:
        dict with three keys:
            "url":  the input URL,
            "toc":  an ``<a>`` tag linking to the post's heading anchor,
            "body": heading + metadata block + content, with images inlined
                    by ``embed_images``.

    Raises:
        ValueError: if the page has no ``pjgm-posttitle`` heading or no
            ``pjgm-postcontent`` block.
        Download errors from ``urllib.request.urlopen`` are not caught here.
    """
    # Get html
    if url in post_cache:
        logging.info("Loading from cache: {}".format(url))
        page_html = post_cache[url]
    else:
        logging.info("Requesting: {}".format(url))
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        # Close the HTTP response deterministically instead of relying on GC.
        with urllib.request.urlopen(req) as response:
            page_html = response.read()
        # Save to cache
        post_cache[url] = page_html
        time.sleep(1)
    soup = BeautifulSoup(page_html, "lxml")

    # Locate the pieces of the post; fail with a clear message when the page
    # does not have the expected structure (instead of an AttributeError on None).
    title = soup.find("h1", attrs={"class": "pjgm-posttitle"})
    metadata = soup.find("div", attrs={"class": "pjgm-postmeta"})
    content = soup.find("div", attrs={"class": "pjgm-postcontent"})
    if title is None or content is None:
        raise ValueError("Could not find post title/content in: {}".format(url))

    # Build heading and TOC link. title.text is decoded text, so it has to be
    # escaped again before it is placed back into markup or attribute values.
    title_text = title.text
    anchor = html.escape(title_text.lower().replace(" ", "_"))
    safe_title = html.escape(title_text)
    new_title = '<h1 id="{}">{}</h1>'.format(anchor, safe_title)
    toc_link = '<a href="#{}">{}</a>'.format(anchor, safe_title)

    # A missing metadata block is simply omitted rather than rendered as "None".
    metadata_html = str(metadata) if metadata is not None else ""

    # Convert images to base64
    content_with_images = embed_images(content)

    post = {"url": url,
            "toc": toc_link,
            "body": new_title + metadata_html + content_with_images}
    return post

In [None]:
def embed_images(content):
    """Return ``content`` as an HTML string with its images inlined as base64.

    For every ``<img>`` found in ``content`` whose ``src`` ends in a file
    extension, the image bytes are taken from ``img_cache`` or downloaded
    (and then cached), and the tag is replaced by an ``<img>`` carrying a
    ``data:`` URI. Images that cannot be fetched are replaced by an
    "Image unavailable" note. Tags without ``src`` or without an extension
    are left untouched.

    Args:
        content: Parsed element offering ``find_all("img")`` and ``str()``
            (a BeautifulSoup tag in this notebook).

    Returns:
        str: the serialized content with image tags substituted.
    """
    image_tags = content.find_all("img")
    content = str(content)
    for tag in image_tags:
        # Get image url; an <img> without src has nothing to embed.
        img_url = tag.get("src")
        if not img_url:
            continue

        # Get filename extension. If no extension, do nothing
        ext_match = re.search(r"\.(\w+)$", img_url)
        if ext_match is None:
            continue
        img_type = ext_match.group(1)

        # Get image data
        if img_url in img_cache:
            logging.info("Loading image from cache: {}".format(img_url))
            img_data = img_cache[img_url]
        else:
            logging.info("Requesting: {}".format(img_url))
            try:
                req = urllib.request.Request(img_url, headers={"User-Agent": "Mozilla/5.0"})
                # Close the HTTP response deterministically.
                with urllib.request.urlopen(req) as response:
                    img_data = response.read()
                # Save to cache
                img_cache[img_url] = img_data
            except (urllib.error.URLError, ValueError) as err:
                # URLError covers HTTPError and connection failures; ValueError
                # is raised for non-absolute URLs. One bad image should not
                # abort the whole book.
                logging.warning("Could not fetch image {}: {}".format(img_url, err))
                img_data = None

        if img_data:
            # Convert to base64
            b64_img = base64.b64encode(img_data)
            # Create new img tag
            new_img_tag = '<img src="data:image/{};base64,{}"/>'.format(
                img_type, b64_img.decode(encoding="ascii"))
        else:
            new_img_tag = '<i>Image unavailable.</i>'

        # Replace old tag. Plain string replacement: the tag markup is literal
        # text, not a regex, and URLs routinely contain regex metacharacters.
        content = content.replace(str(tag), new_img_tag)
    return content

In [None]:
# Posts to scrape; the scraping loop visits them in this order, so this is
# also the order of the table of contents and of the bodies in the output.
url_list = [
    "http://slatestarcodex.com/2014/07/30/meditations-on-moloch/",
    "http://slatestarcodex.com/2014/09/30/i-can-tolerate-anything-except-the-outgroup/",
    "http://slatestarcodex.com/2016/04/27/book-review-albions-seed/",
    "http://slatestarcodex.com/2014/12/17/the-toxoplasma-of-rage/",
    "http://slatestarcodex.com/2014/12/19/nobody-is-perfect-everything-is-commensurable/",
    "http://slatestarcodex.com/2014/11/21/the-categories-were-made-for-man-not-man-for-the-categories/",
    "http://slatestarcodex.com/2013/07/17/who-by-very-slow-decay/",
    "http://slatestarcodex.com/2014/08/16/burdens/",
    "http://slatestarcodex.com/2015/01/31/the-parable-of-the-talents/",
    "http://slatestarcodex.com/2017/02/22/repost-the-non-libertarian-faq/",
    "http://slatestarcodex.com/2013/03/03/reactionary-philosophy-in-an-enormous-planet-sized-nutshell/",
    "http://slatestarcodex.com/2013/10/20/the-anti-reactionary-faq/"
]

In [None]:
# Accumulator for the output document: table-of-contents entries (seeded with
# the section heading) and the rendered post bodies.
book = dict(toc=["<h1>Table of Contents</h1>"], body=[])

In [None]:
# Scrape every post in order and file its TOC link and body into the book.
for url in url_list:
    post = scrape_scc(url)
    toc_entry, body_html = post["toc"], post["body"]
    book["toc"].append(toc_entry)
    book["body"].append(body_html)

In [None]:
# Final document: TOC entries separated by line breaks, then a newline,
# then the post bodies one after another.
book_html = "{}\n{}".format("\n<br>".join(book["toc"]), "\n".join(book["body"]))

In [None]:
# Write the assembled book. The encoding is pinned to UTF-8 so the result does
# not depend on the platform default, which can fail on non-ASCII characters.
with open("ssc_moloch.html", "w", encoding="utf-8") as f:
    logging.info("Writing html file")
    f.write(book_html)

In [None]:
# Persist both caches so the next run can skip already-downloaded resources.
# Context managers make sure each file is flushed and closed right away.
logging.info("Saving pickle files")
with open(pickle_post_file, "wb") as post_cache_file:
    pickle.dump(post_cache, post_cache_file)
with open(pickle_img_file, "wb") as img_cache_file:
    pickle.dump(img_cache, img_cache_file)