In [79]:
import logging
import csv
from selenium import webdriver
from urllib.parse import urldefrag, urljoin
from collections import deque
from bs4 import BeautifulSoup
from string import ascii_uppercase

In [65]:
class SeleniumCrawler(object):

    def __init__(self, base_url, exclusion_list, inclusion_list, output_file='example.csv', start_url=None):

        assert isinstance(exclusion_list, list), 'Exclusion list - needs to be a list'
        assert isinstance(inclusion_list, list), 'Inclusion list - needs to be a list'

        
        dir = os.getcwd()           #get driver

        chrome_driver_path = dir + '/chromedriver'
        self.browser = webdriver.Chrome(chrome_driver_path)

        self.base = base_url

        self.start = start_url if start_url else base_url  #If no start URL is passed use the base_url

        self.exclusions = exclusion_list  #List of URL patterns we want to exclude

        self.inclusions = inclusion_list #List of URL patterns we want to include
        
        self.crawled_urls = []  #List to keep track of URLs we have already visited

        self.url_queue = deque([self.start])  #Add the start URL to our list of URLs to crawl

        self.output_file = output_file


    def get_page(self, url):
        try:
            self.browser.get(url)
            return self.browser.page_source
        except Exception as e:
            logging.exception(e)
            return

    def get_soup(self, html):
        if html is not None:
            soup = BeautifulSoup(html, 'lxml')
            return soup
        else:
            return

    def get_links(self, soup):

        for link in soup.find_all('a', href=True):
            link = link['href']
            if any(e in link for e in self.exclusions):
                continue
            url = urljoin(self.base, urldefrag(link)[0])
            if url not in self.url_queue and url not in self.crawled_urls:
                if url.startswith(self.base):
                    self.url_queue.append(url)

    def get_data(self, soup):

        try:
            result = soup.find("div", {"class":"comedian-desc"}).text  #trying to extract comedian bios
        except:
            result = None
        return result

    def csv_output(self, url, title):

        with open(self.output_file, 'a', encoding='utf-8') as outputfile:

            writer = csv.writer(outputfile)
            writer.writerow([url, title])

    def run_crawler(self):
        while len(self.url_queue): #If we have URLs to crawl - we crawl
            current_url = self.url_queue.popleft() #We grab a URL from the left of the list
            self.crawled_urls.append(current_url) #We then add this URL to our crawled list
            html = self.get_page(current_url)
            if self.browser.current_url != current_url: #If the end URL is different from requested URL - add URL to crawled list
                self.crawled_urls.append(current_url)
            soup = self.get_soup(html)
            if soup is not None:  #If we have soup - parse and write to our csv file
                self.get_links(soup)
                print('curr_url: ', current_url)
                title = self.get_data(soup)
                self.csv_output(current_url, title)


In [86]:
if __name__ == '__main__':
    base_url = 'https://www.comedyworks.com/comedians/'
    start_url = 'https://www.comedyworks.com/comedians?page='

    for i in range(1,51):
        curr_url = start_url + str(i)
        a = SeleniumCrawler(test_url, ['?'], [''], start_url = curr_url)
        a.run_crawler()
        


curr_url:  https://www.comedyworks.com/comedians?page=1
curr_url:  https://www.comedyworks.com/comedians
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_url:  https://www.comedyworks.com/comedians/adam-carolla-is-unprepared
curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-parasites-live-with-adam-conover
curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aisha-tyler
curr_url:  https://www.comedyworks.com/comedians/aj-finney
curr_url:  https://www.comedyworks.com/comedians/al-goodwin
curr_url:  https://www.comedyworks.com/comedians/al-jackson
curr_url:  https://www.comedyworks.com/comedians?page=2
curr_url:  https://www.comedyworks.com/comedians
curr_url:  https://www.comedyworks.com/comedians/alex-edelman
curr_url:  https://www.comedyworks.com/co

curr_url:  https://www.comedyworks.com/comedians/brian-posehn
curr_url:  https://www.comedyworks.com/comedians/brian-redban
curr_url:  https://www.comedyworks.com/comedians/brian-regan
curr_url:  https://www.comedyworks.com/comedians/brooks-wheelan
curr_url:  https://www.comedyworks.com/comedians/bryan-callen
curr_url:  https://www.comedyworks.com/comedians/bryan-kellen
curr_url:  https://www.comedyworks.com/comedians/building-hope
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_url:  https://www.comedyworks.com/comedians/adam-carolla-is-unprepared
curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-parasites-live-with-adam-conover
curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aisha-tyler
curr_url:  https://www.comedyworks.com/comedians/aj-finn

curr_url:  https://www.comedyworks.com/comedians/aj-finney
curr_url:  https://www.comedyworks.com/comedians/al-goodwin
curr_url:  https://www.comedyworks.com/comedians/al-jackson
curr_url:  https://www.comedyworks.com/comedians?page=13
curr_url:  https://www.comedyworks.com/comedians
curr_url:  https://www.comedyworks.com/comedians/dave-chappelle
curr_url:  https://www.comedyworks.com/comedians/dave-coulier
curr_url:  https://www.comedyworks.com/comedians/dave-foley
curr_url:  https://www.comedyworks.com/comedians/david-alan-grier
curr_url:  https://www.comedyworks.com/comedians/david-koechner
curr_url:  https://www.comedyworks.com/comedians/david-spade
curr_url:  https://www.comedyworks.com/comedians/david-cross-at-paramount-theatre
curr_url:  https://www.comedyworks.com/comedians/deacon-gray
curr_url:  https://www.comedyworks.com/comedians/dean-delray
curr_url:  https://www.comedyworks.com/comedians/demetri-martin
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_

curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-parasites-live-with-adam-conover
curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aisha-tyler
curr_url:  https://www.comedyworks.com/comedians/aj-finney
curr_url:  https://www.comedyworks.com/comedians/al-goodwin
curr_url:  https://www.comedyworks.com/comedians/al-jackson
curr_url:  https://www.comedyworks.com/comedians?page=19
curr_url:  https://www.comedyworks.com/comedians
curr_url:  https://www.comedyworks.com/comedians/henry-cho
curr_url:  https://www.comedyworks.com/comedians/hippieman
curr_url:  https://www.comedyworks.com/comedians/holiday-roast
curr_url:  https://www.comedyworks.com/comedians/how-did-this-get-made
curr_url:  https://www.comedyworks.com/comedians/howie-mandel
curr_url:  https://www.comedyworks

curr_url:  https://www.comedyworks.com/comedians/john-caparulo
curr_url:  https://www.comedyworks.com/comedians/john-crist-up-close-tour
curr_url:  https://www.comedyworks.com/comedians/john-cusack
curr_url:  https://www.comedyworks.com/comedians/john-heffron
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_url:  https://www.comedyworks.com/comedians/adam-carolla-is-unprepared
curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-parasites-live-with-adam-conover
curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aisha-tyler
curr_url:  https://www.comedyworks.com/comedians/aj-finney
curr_url:  https://www.comedyworks.com/comedians/al-goodwin
curr_url:  https://www.comedyworks.com/comedians/al-jackson
curr_url:  https://www.comedyworks.com/comedians?page

curr_url:  https://www.comedyworks.com/comedians/louie-anderson
curr_url:  https://www.comedyworks.com/comedians/louis-ck
curr_url:  https://www.comedyworks.com/comedians/louis-johnson
curr_url:  https://www.comedyworks.com/comedians/lucas-brothers
curr_url:  https://www.comedyworks.com/comedians/marc-maron
curr_url:  https://www.comedyworks.com/comedians/marcella-arguello
curr_url:  https://www.comedyworks.com/comedians/margaret-cho
curr_url:  https://www.comedyworks.com/comedians/maria-bamford
curr_url:  https://www.comedyworks.com/comedians/maria-bamford-aparna-nancherla
curr_url:  https://www.comedyworks.com/comedians/mark-normand
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_url:  https://www.comedyworks.com/comedians/adam-carolla-is-unprepared
curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-para

curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aisha-tyler
curr_url:  https://www.comedyworks.com/comedians/aj-finney
curr_url:  https://www.comedyworks.com/comedians/al-goodwin
curr_url:  https://www.comedyworks.com/comedians/al-jackson
curr_url:  https://www.comedyworks.com/comedians?page=36
curr_url:  https://www.comedyworks.com/comedians
curr_url:  https://www.comedyworks.com/comedians/new-faces-contest-finals
curr_url:  https://www.comedyworks.com/comedians/new-faces-contest-rd-1
curr_url:  https://www.comedyworks.com/comedians/new-faces-contest-rd-2
curr_url:  https://www.comedyworks.com/comedians/new-faces-contest-wild-card
curr_url:  https://www.comedyworks.com/comedians/new-talent-night
curr_url:  https://www.comedyworks.com/comedians/nick-guerra
curr_url:  https://www.comedyworks.com/comedians/nick-kroll
curr_url:  https://www.comedyworks.com/comedians/nick-swardson
curr_url:  https://www.comedyworks.com/comedians

curr_url:  https://www.comedyworks.com/comedians/sal-vulcano-paramount
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_url:  https://www.comedyworks.com/comedians/adam-carolla-is-unprepared
curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-parasites-live-with-adam-conover
curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aisha-tyler
curr_url:  https://www.comedyworks.com/comedians/aj-finney
curr_url:  https://www.comedyworks.com/comedians/al-goodwin
curr_url:  https://www.comedyworks.com/comedians/al-jackson
curr_url:  https://www.comedyworks.com/comedians?page=42
curr_url:  https://www.comedyworks.com/comedians
curr_url:  https://www.comedyworks.com/comedians/sam-adams
curr_url:  https://www.comedyworks.com/comedians/sam-jay
curr_url:  https://w

curr_url:  https://www.comedyworks.com/comedians/todd-glass
curr_url:  https://www.comedyworks.com/comedians/todd-johnson
curr_url:  https://www.comedyworks.com/comedians/tom-cotter
curr_url:  https://www.comedyworks.com/comedians/tom-green
curr_url:  https://www.comedyworks.com/comedians/tom-papa
curr_url:  https://www.comedyworks.com/comedians/tom-segrua-tour
curr_url:  https://www.comedyworks.com/comedians/tommy-johnagin
curr_url:  https://www.comedyworks.com/comedians/tommy-chong-and-shelby
curr_url:  https://www.comedyworks.com/comedians/about-lastnight
curr_url:  https://www.comedyworks.com/comedians/adam-carolla-is-unprepared
curr_url:  https://www.comedyworks.com/comedians/adam-cayton-holland
curr_url:  https://www.comedyworks.com/comedians/adam-ray
curr_url:  https://www.comedyworks.com/comedians/adam-ruins-everything-presents-mind-parasites-live-with-adam-conover
curr_url:  https://www.comedyworks.com/comedians/adrian-mesa
curr_url:  https://www.comedyworks.com/comedians/aish