In [29]:
#!/usr/bin/env python
import logging
import csv
from selenium import webdriver
from urllib.parse import urldefrag, urljoin
from collections import deque
from bs4 import BeautifulSoup
from string import ascii_uppercase
import os




class SeleniumCrawler(object):
    """Breadth-first crawler that drives Chrome through Selenium and logs page text to CSV.

    Starting from ``start_url``, every queued page is loaded in the browser,
    parsed with BeautifulSoup, scanned for further links (see ``get_links``)
    and the text of its ``div.post-content`` element is appended to
    ``output_file`` together with the page URL.
    """

    def __init__(self, base_url, exclusion_list, inclusion_list, output_file='example.csv', start_url=None):
        """Start a Chrome session and seed the crawl queue.

        Args:
            base_url: URL prefix. Relative links are resolved against it and
                only URLs that start with it are queued.
            exclusion_list: list of substrings; a link containing any of them
                is skipped.
            inclusion_list: list of substrings. NOTE: it is stored on the
                instance but ``get_links`` does not consult it.
            output_file: path of the CSV file rows are appended to.
            start_url: first URL to crawl; falls back to ``base_url``.

        Raises:
            AssertionError: if either list argument is not a ``list``.
        """
        assert isinstance(exclusion_list, list), 'Exclusion list - needs to be a list'
        assert isinstance(inclusion_list, list), 'Inclusion list - needs to be a list'

        # The chromedriver binary is expected in the current working directory.
        # (Avoid naming the local ``dir``: that shadows the builtin.)
        chrome_driver_path = os.path.join(os.getcwd(), 'chromedriver')
        self.browser = webdriver.Chrome(chrome_driver_path)

        self.base = base_url
        self.start = start_url if start_url else base_url  # no start URL -> begin at base_url
        self.exclusions = exclusion_list  # URL patterns we want to exclude
        self.inclusions = inclusion_list  # URL patterns we want to include (currently unused)
        self.crawled_urls = []  # URLs we have already visited
        self.url_queue = deque([self.start])  # URLs still to crawl, seeded with the start URL
        self.output_file = output_file

    def close(self):
        """Quit the browser session so the Chrome/chromedriver processes are released."""
        self.browser.quit()

    def get_page(self, url):
        """Load ``url`` in the browser and return its HTML, or ``None`` if loading failed.

        Any exception raised by the browser is logged with its traceback and
        swallowed so that one bad page does not abort the whole crawl.
        """
        try:
            self.browser.get(url)
            return self.browser.page_source
        except Exception as e:
            logging.exception(e)
            return None

    def get_soup(self, html):
        """Parse ``html`` with the lxml parser; return ``None`` when ``html`` is ``None``."""
        if html is None:
            return None
        return BeautifulSoup(html, 'lxml')

    def get_links(self, soup):
        """Queue every not-yet-seen transcript link found in ``soup``.

        A link is queued only if the part of its href after the last ``-`` is
        ``transcript/``, it contains none of the exclusion substrings, it
        resolves (fragment stripped) to a URL under ``self.base`` and it is
        neither queued nor crawled already.
        """
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if href.split('-')[-1] != 'transcript/':
                continue
            if any(e in href for e in self.exclusions):
                continue
            url = urljoin(self.base, urldefrag(href)[0])
            if url in self.url_queue or url in self.crawled_urls:
                continue
            if url.startswith(self.base):
                print('url made it!: ', url)
                self.url_queue.append(url)

    def get_data(self, soup):
        """Return the text of the page's ``div.post-content`` element, or ``None`` if absent."""
        # The transcript body is expected inside <div class="post-content">.
        content = soup.find("div", {"class": "post-content"})
        if content is None:
            return None
        return content.text

    def csv_output(self, url, title):
        """Append one ``[url, title]`` row to ``self.output_file`` (UTF-8)."""
        # newline='' is required by the csv module so that newlines embedded in
        # a field are written verbatim and no extra blank rows appear on
        # platforms that translate line endings.
        with open(self.output_file, 'a', encoding='utf-8', newline='') as outputfile:
            writer = csv.writer(outputfile)
            writer.writerow([url, title])

    def run_crawler(self):
        """Crawl until the queue is empty, writing one CSV row per parsed page."""
        while self.url_queue:  # keep going while there are URLs to crawl
            current_url = self.url_queue.popleft()  # take the oldest queued URL
            self.crawled_urls.append(current_url)  # mark it as visited
            html = self.get_page(current_url)
            if html is None:
                # Load failed: the browser's current_url would be stale, and
                # there is nothing to parse.
                continue
            # After a redirect, also remember the URL we ended up on so the
            # redirect target is not crawled again under its final address.
            final_url = self.browser.current_url
            if final_url != current_url and final_url not in self.crawled_urls:
                self.crawled_urls.append(final_url)
            soup = self.get_soup(html)
            if soup is not None:  # parsed successfully -> harvest links and write the row
                self.get_links(soup)
                print('curr_url: ', current_url)
                transcript = self.get_data(soup)
                self.csv_output(current_url, transcript)



In [31]:
if __name__ == '__main__':
    # Input parameters for the crawler: stay on the site rooted at base_url,
    # begin at the stand-up transcripts tag page, and append results to
    # transcripts.csv.
    base_url = 'https://scrapsfromtheloft.com'
    start_url = 'https://scrapsfromtheloft.com/tag/stand-up-transcripts/'
    output_file = 'transcripts.csv'
    # Exclude links containing '?'; the inclusion list [''] is passed through as-is.
    a = SeleniumCrawler(base_url, ['?'], [''], output_file, start_url)
    try:
        a.run_crawler()
    finally:
        # Always shut the browser down, even if the crawl is interrupted or
        # fails, so no Chrome/chromedriver process is left running.
        a.browser.quit()

url made it!:  https://scrapsfromtheloft.com/2019/09/12/george-carlin-dumb-americans-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/09/10/bill-burr-paper-tiger-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/08/29/dave-chappelle-sticks-stones-epilogue-punchline-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/08/26/dave-chappelle-sticks-stones-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/08/25/emily-heller-ice-thickeners-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/08/13/david-cross-oh-come-on-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/08/08/kevin-hart-gun-compartment-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/08/01/whitney-cummings-can-i-touch-it-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/07/07/ralphie-may-filthy-animal-tour-transcript/
url made it!:  https://scrapsfromtheloft.com/2019/06/27/mike-epps-only-one-mike-transcript/
url made it!:  https://scra

curr_url:  https://scrapsfromtheloft.com/2019/09/12/george-carlin-dumb-americans-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/09/10/bill-burr-paper-tiger-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/08/29/dave-chappelle-sticks-stones-epilogue-punchline-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/08/26/dave-chappelle-sticks-stones-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/08/25/emily-heller-ice-thickeners-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/08/13/david-cross-oh-come-on-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/08/08/kevin-hart-gun-compartment-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/08/01/whitney-cummings-can-i-touch-it-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/07/07/ralphie-may-filthy-animal-tour-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/06/27/mike-epps-only-one-mike-transcript/
curr_url:  https://scrapsfromtheloft.com/2019/06/22/adam-devine-bes

ERROR:root:Message: disconnected: Unable to receive message from renderer
  (Session info: chrome=77.0.3865.75)
  (Driver info: chromedriver=2.42.591059 (a3d9684d10d61aa0c45f6723b327283be1ebaad8),platform=Mac OS X 10.14.6 x86_64)
Traceback (most recent call last):
  File "<ipython-input-29-811c970ac113>", line 44, in get_page
    self.browser.get(url)
  File "//anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "//anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "//anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: disconnected: Unable to receive message from renderer
  (Session info: chrome=77.0.3865.75)
  (Driver info:

curr_url:  https://scrapsfromtheloft.com/2018/09/13/iliza-shlesinger-war-paint-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/08/25/bert-kreischer-secret-time-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/08/11/demetri-martin-overthinker-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/08/04/bill-maher-live-from-oklahoma-2018-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/08/04/rowan-atkinson-live-1992-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/07/31/iliza-shlesinger-confirmed-kills-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/07/30/iliza-shlesinger-elder-millennial-2018-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/07/26/jim-gaffigan-noble-ape-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/07/23/jim-norton-contextually-inadequate-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/07/21/hannah-gadsby-nanette-transcript/
curr_url:  https://scrapsfromtheloft.com/2018/0

curr_url:  https://scrapsfromtheloft.com/2017/12/07/aziz-ansari-buried-alive-2013-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/12/01/paul-mooney-piece-mind-godbless-america-2014-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/26/patton-oswalt-annihilation-2017-full-transcript/
url made it!:  https://scrapsfromtheloft.com/2018/08/25/real-time-with-bill-maher-august-24-2018-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/23/bill-maher-live-from-d-c-2014-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/10/sarah-silverman-jesus-is-magic-2005-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/10/sarah-silverman-we-are-miracles-2013-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/09/pablo-francisco-ouch-live-san-jose-2006-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/08/bill-maher-but-im-not-wrong-2010-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/11/07/russ

curr_url:  https://scrapsfromtheloft.com/2017/05/12/eddie-griffin-can-tell-em-i-said-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/05/10/bill-burr-walk-way-2017-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/05/04/louis-c-k-live-at-the-comedy-store-2015-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/05/03/bill-hicks-relentless-1992-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/05/02/hasan-minhaj-white-house-correspondents-dinner-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/04/30/daniel-tosh-happy-thoughts-2011-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/04/28/patrice-oneal-elephant-in-the-room-2011-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/04/27/katt-williams-live-2006-full-transcript/
curr_url:  https://scrapsfromtheloft.com/2017/04/27/richard-pryor-live-concert-1979-full-transcript/
curr_url:  https: