In [1]:
# imports
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

In [2]:
# Start a Chrome session driven by the chromedriver binary at this path.
# NOTE(review): the path is specific to one machine, and the way the driver
# location is passed differs between Selenium releases -- confirm both against
# the environment this notebook runs in.
CHROMEDRIVER_PATH = "/Applications/chromedriver"
driver = webdriver.Chrome(CHROMEDRIVER_PATH)

In [3]:
# Open the "All Things" setlist index on umphreys.com in the browser session.
# `url` is reused further down as the prefix of every per-year page URL.
url = "https://allthings.umphreys.com/setlists/"
driver.get(url)

In [4]:
# Parse the rendered index page into `soup`.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser is installed (and warns about guessing), so the resulting tree could
# differ from one machine to the next.
content = driver.page_source
soup = BeautifulSoup(content, "html.parser")

In [5]:
# Scrape every show of every year into `shows`, keyed by the show's date string.
#
# NOTE(review): the key is the date alone, so if the archive lists two shows
# under the same date the later one replaces the earlier one -- confirm whether
# that happens before relying on the row count.

# Parser named explicitly so the tree does not depend on which optional parsers
# are installed (and so BeautifulSoup does not warn about guessing one).
HTML_PARSER = "html.parser"

# Text shown in place of a setlist when none was recorded.
UNKNOWN_SETLIST = 'Setlist Unknown (but it was probably awesome)'

# A bracketed number plus the single character after it, e.g. "[1] ".
# Raw string with escaped brackets: matches exactly what '([[]\d+[]].)' did,
# without the invalid-escape and "possible nested set" warnings.
TAG_MARKER = re.compile(r'(\[\d+\].)')


def _split_off(text, marker):
    """Cut ``text`` at ``marker``.

    Returns ``(before, after)``. ``before`` is the stripped text ahead of the
    first ``marker``. ``after`` is the raw piece that follows it (up to the next
    occurrence of ``marker``, if there is one), or ``None`` when ``marker`` does
    not occur in ``text``.
    """
    pieces = text.split(marker)
    after = pieces[1] if len(pieces) > 1 else None
    return pieces[0].strip(), after


def _parse_sets(show):
    """Return ``{set label: song text}`` for one show, or ``None`` if the setlist is unknown."""
    if UNKNOWN_SETLIST in str(show.find_all('p', recursive=False, string=True)):
        return None

    sets = {}
    for paragraph in show.find_all('p', recursive=False):
        label_tag = paragraph.find('b', attrs={'class': 'setlabel'})
        if label_tag is None or not label_tag.contents:
            # No label to key this paragraph by; skip it instead of crashing.
            continue
        label = label_tag.contents[0].strip().rstrip(':')
        # The first text node is dropped -- assumes it is the label itself.
        sets[label] = ''.join(paragraph.find_all(string=True)[1:]).strip()
    return sets


def _parse_meta(meta_text):
    """Split the show-meta text into reviews, support, notes and tag captions.

    Sections are peeled off from the end of the text towards the front; whatever
    remains is treated as the numbered tag captions.
    """
    remainder, reviews = _split_off(meta_text, 'Reviews:')
    if reviews is not None:
        reviews = reviews.strip()

    # Discard the purchase blurb and anything that follows it.
    remainder = remainder.split('Purchase via UMLive')[0]

    remainder, support = _split_off(remainder, 'Support:')
    if support is not None:
        support = support.strip(' \n')

    remainder, notes = _split_off(remainder, 'Notes:')
    if notes is not None:
        notes = notes.split('\n')

    # Splitting on a capturing pattern yields [lead-in, marker, caption, marker,
    # caption, ...]; after dropping the lead-in, even slots are markers and odd
    # slots are their captions.
    parts = TAG_MARKER.split(remainder)[1:]
    tags = {marker.strip(' ]['): caption
            for marker, caption in zip(parts[0::2], parts[1::2])}

    return dict(reviews=reviews, support=support, notes=notes, tags=tags)


def _parse_show(show):
    """Return ``(date, record)`` for one ``<section class="setlist">`` element."""
    # get show date and title of show
    header = show.find('h3', attrs={'class': 'splashtitle'})
    date_link = header.find('a', attrs={'class': 'setlistdate'})
    date = date_link.find_all(string=True)[0].strip()
    # Everything in the header after its first text node is the title.
    title = ''.join(header.find_all(string=True)[1:]).strip()

    # get setlist meta data (empty when the show has no meta block)
    meta_div = show.find('div', attrs={'class': 'showmeta'})
    meta_text = '' if meta_div is None else ''.join(meta_div.find_all(string=True))

    record = dict(title=title, sets=_parse_sets(show))
    record.update(_parse_meta(meta_text))
    return date, record


shows = {}

# iterate through the years
year_links = soup.find_all('a', attrs={'class': 'setlistbutton btn btn-small btn-default'})
for year_link in year_links:
    year_url = url + str(year_link.get('rel')[0]) + ".html"

    # go to year url; parsed into its own name so the index page in `soup`
    # is left intact and this cell can be re-run
    driver.get(year_url)
    year_soup = BeautifulSoup(driver.page_source, HTML_PARSER)

    # scrape every show of this year
    for show in year_soup.find_all('section', attrs={'class': 'setlist'}):
        date, record = _parse_show(show)
        # add show to show dictionary
        shows[date] = record

In [6]:
# Create pandas dataframe: one row per show, one column per set label.
#
# Rows are collected in a list and the frame is built once. DataFrame.append
# was removed in pandas 2.0, and calling it in a loop copied the whole frame on
# every iteration. The frame now has a 0..n-1 index rather than 0 on every row.
rows = []
for date, show in shows.items():
    row = {
        'date': datetime.strptime(date, "%m.%d.%Y").date(),
        'title': show['title'],
    }
    # One column per set label; shows with an unknown setlist add none.
    row.update(show['sets'] or {})
    row['reviews'] = show['reviews']
    row['support'] = show['support']
    # Stored as their string form, so a show without notes gets the text 'None'.
    row['notes'] = str(show['notes'])
    row['tags'] = str(show['tags'])
    rows.append(row)

df = pd.DataFrame(rows)

In [7]:
# Write the scraped shows to a CSV file in the current working directory.
OUTPUT_CSV = 'scrape_09_25_2020.csv'
df.to_csv(OUTPUT_CSV)