In [1]:
# imports
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

## Scrape Data

In [2]:
# set driver to chrome
driver = webdriver.Chrome("/Applications/chromedriver")

In [3]:
# go to the umphreys all things URL
url = "https://allthings.umphreys.com/setlists/"
driver.get(url)

In [4]:
# load the main page
content = driver.page_source
soup = BeautifulSoup(content)

In [5]:
shows = {}

# iterate through the years
for year in soup.findAll('a', attrs={'class':'setlistbutton btn btn-small btn-default'}):
    year_url = url + str(year.get('rel')[0]) + ".html"
    # go to year url
    driver.get(year_url)
    content = driver.page_source
    soup = BeautifulSoup(content)
    
    # scrape every show of this year
    for show in soup.findAll('section', attrs={'class':'setlist'}):
        
        # get show date and title of show
        header = show.find('h3', attrs={'class':'splashtitle'})
        date = header.find('a', attrs={'class':'setlistdate'}).findAll(text=True)[0].strip()
        title = ''.join(header.findAll(text=True)[1:]).strip()
        
        # get dictionary of setlabel to setlist for every set
        if 'Setlist Unknown (but it was probably awesome)' in str(show.findAll('p',recursive=False, text=True)):
            sets = None
        else:
            sets = {}
            for setlist in show.findAll('p',recursive=False):
                song_titles = []
                setlabel = ''.join(setlist.find('b', attrs={'class':'setlabel'}).findAll(text=True))
                setlabel = setlabel.strip().rstrip(':')
                song_titles = ''.join(''.join(setlist.findAll(text=True)).strip().split(':')[1:]).strip()
                sets[setlabel] = song_titles
        
        # get setlist meta date
        meta = ''.join(show.find('div', attrs={'class':'showmeta'}).findAll(text=True))
        
        # get the reviews (if any)
        tmp = meta.split('Reviews:')
        reviews = None if len(tmp) == 1 else tmp[1].strip()
        meta = tmp[0].strip()
        
        # remove the purchase via UMLive info (also removes reviews)
        meta = meta.split('Purchase via UMLive')[0]
        
        # get the support (if any)
        tmp = meta.split('Support:')
        support = None if len(tmp) == 1 else tmp[1].strip(' \n')
        meta = tmp[0].strip()
        
        # get the notes (if any)
        tmp = meta.split('Notes:')
        notes = None if len(tmp) == 1 else tmp[1].split('\n')
        meta = tmp[0].strip()
        
        # get the tag captions
        tags = re.split('([[]\d+[]].)',meta)[1:]
        tmp = {}
        for i in range(int(len(tags)/2)):
            tmp[tags[2*i].strip(' ][')] = tags[2*i+1]
        tags = tmp  
        
        # add show to show dictionary
        shows[date] = dict(title=title,
                           sets=sets,
                           reviews=reviews,
                           support=support,
                           notes=notes,
                           tags=tags)

In [16]:
# Create pandas dataframe
df = pd.DataFrame()
for date in shows:
    row = {}
    row['date'] = datetime.strptime(date, "%m.%d.%Y").date()
    row['title'] = shows[date]['title']
    if shows[date]['sets'] is not None:
        for setlist in shows[date]['sets']:
            row[setlist] = shows[date]['sets'][setlist]
    row['reviews'] = shows[date]['reviews']
    row['support'] = shows[date]['support']
    row['notes'] = str(shows[date]['notes'])
    row['tags'] = str(shows[date]['tags'])
    df = df.append(pd.DataFrame(row, index=[0]))

In [20]:
df.columns

Index(['date', 'title', 'Set 1', 'Set 2', 'Encore', 'reviews', 'support',
       'notes', 'tags', 'One Set', 'Set 3', '2nd Encore', 'Quarter 1',
       'Quarter 2', 'Quarter 3', 'Quarter 4', 'Overtime', '3rd Encore'],
      dtype='object')

In [21]:
df = df[['date', 'title', 'support', 'notes', 'tags', 'reviews', 'One Set','Set 1', 'Set 2', 'Set 3', 'Encore', 
         '2nd Encore','3rd Encore', 'Quarter 1', 'Quarter 2', 'Quarter 3', 'Quarter 4', 'Overtime', ]]
df

Unnamed: 0,date,title,support,notes,tags,reviews,One Set,Set 1,Set 2,Set 3,Encore,2nd Encore,3rd Encore,Quarter 1,Quarter 2,Quarter 3,Quarter 4,Overtime
0,2020-01-17,"Beacon Theatre, New York, NY, USA",,['Anthem was in remembrance of Neil Peart'],{'1': 'began with More Than Words (Extreme) te...,,,"Unsung Hero > JaJunk > Ocean Billy, Speak U...","Suxity > JaJunk, Breaker, Wappy Sprayberry[...",,The Weight Around > The Silent Type[4],,,,,,,
0,2020-01-18,"Beacon Theatre, New York, NY, USA",,['Limelight was in remembrance of Neil Peart'],"{'1': 'with Bright Lights, Big City jam', '2':...",,,"End of the Road > 1348 > Crucial Taunt, Syn...","Limelight[4], Der Bluten Kat > Higgins > De...",,Last Train Home > 1348,,,,,,,
0,2020-01-19,"Brooklyn Bowl, Brooklyn, NY, USA",,,"{'1': 'began with YYZ (Rush) tease', '2': 'wit...",,,"Looks, The Fussy Dutchman[1], Maybe Someday ...","Front Porch > Resolution > Front Porch[2], ...",,Phil's Farm > Suxity > Phil's Farm,,,,,,,
0,2020-01-23,"Penn's Peak, Jim Thorpe, PA, USA",The New Deal,,"{'1': 'with Rhiannon (Fleetwood Mac) teases', ...",,,"Plunger > Glory, Roctopus, Similar Skin, M...","Night Nurse > Ringo, Example 1[1], Hurt Bir...",,Kashmir,,,,,,,
0,2020-01-24,"Stage AE, Pittsburgh, PA, USA",The New Deal,['last Tom Sawyer 02.18.2017 (264 shows)'],"{'1': 'debut, original; with Jake on acoustic'...",,,"Depth Charge > Mad Love > Rocker Part 2, Re...","Slacker, The Triple Wide > In The Black, Sy...",,Conduit,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1998-11-27,"Elbo Room, Chicago, IL, USA",,['there may have been a second set at this show'],{},,"Front Porch, Hangover, Phil's Farm, Rosanna...",,,,,,,,,,,
0,1998-12-09,"The Factory, South Bend, IN, USA",,,"{'1': 'debut, Pink Floyd', '2': 'with Divided ...",,,"Divisions, Red Baron, Muff II The Revenge, ...","G-Song, In the Flesh?[1], Jessica[2], Thin ...",,,,,,,,,
0,1998-12-11,"Martyrs', Chicago, IL, USA",,,{},,"Red Baron, All In Time, Down Under, G-Song,...",,,,,,,,,,,
0,1998-12-17,"The Factory, South Bend, IN, USA",,,{},,,,,,,,,,,,,


In [7]:
# create CSV
df.to_csv('scrape_09_25_2020.csv')

## Analyze Scrape

In [8]:
df = pd.read_csv('scrape_09_25_2020.csv',index_col=0)

In [9]:
df[df.date == datetime.strptime("11.03.2006", "%m.%d.%Y").date()]

Unnamed: 0,date,title,Set 1,Set 2,Encore,reviews,support,notes,tags,One Set,Set 3,2nd Encore,Quarter 1,Quarter 2,Quarter 3,Quarter 4,Overtime,3rd Encore


In [10]:
df[df.title == 'The Barrymore Theatre, Madison, WI, USA']

Unnamed: 0,date,title,Set 1,Set 2,Encore,reviews,support,notes,tags,One Set,Set 3,2nd Encore,Quarter 1,Quarter 2,Quarter 3,Quarter 4,Overtime,3rd Encore
0,2011-03-02,"The Barrymore Theatre, Madison, WI, USA",Phil's Farm > Words > Miss Tinkle's Overture...,"Prowler > Syncopated Strangers, I Am the Wal...","Walletsworth, Hot for Teacher",1 review(s),,,{'1': 'with Blues for Los Angeles (Bill Frisel...,,,,,,,,,
0,2011-03-03,"The Barrymore Theatre, Madison, WI, USA","The Fussy Dutchman, Intentions Clear[1] > Ge...","Divisions, Der Bluten Kat > Atmosfarag > De...",Wizard Burial Ground,1 review(s),,"[""last It's So Easy 04.13.2008 (338 shows)""]","{'1': '""Jimmy Stewart"" with lyrics'}",,,,,,,,,
0,2009-02-12,"The Barrymore Theatre, Madison, WI, USA","Slacker > Ringo > Slacker, Atmosfarag[1] ->...","The Bottom Half, End of the Road, Red Tape, ...",Soul Food I[4] > Morning Song,,,,"{'1': 'unfinished', '2': 'with Jeff Coffin on ...",,,,,,,,,
0,2009-02-13,"The Barrymore Theatre, Madison, WI, USA","The Haunt > Hajimemashite, Prowler > 2nd Se...",Blue Echo > Living On a Farm[1] > Blue Echo ...,2x2,,,['last Memphis Underground 01.07.2006 (372 sho...,"{'1': ""debut, Ali Baba's Tahini; two verses"", ...",,,,,,,,,
0,2009-02-14,"The Barrymore Theatre, Madison, WI, USA","August[1], Professor Wormbog, Syncopated Str...","All In Time, Out Of Order > Gulf Stream, Mu...",Preamble > Mantis,1 review(s),,,"{'1': 'with Nothing Too Fancy tease', '2': 'wi...",,,,,,,,,
0,2009-02-15,"The Barrymore Theatre, Madison, WI, USA","Resolution[1] > Great American[1], Rocker[2]...","Bridgeless > Prophecy Now[4] > Bridgeless, ...","Honey Bunch[7], A Fifth of Beethoven",,,"['last Rocker 09.22.2006 (276 shows)', 'last W...",{'1': 'with Jake and Brendan on acoustic guita...,,,,,,,,,
0,2006-11-02,"The Barrymore Theatre, Madison, WI, USA",Intro > Higgins > Hurt Bird Bath > Sweetnes...,Much Obliged -> The Fuzz > Tribute to the Spi...,Thin Air,1 review(s),,,"{'1': 'with Kat\'s Tune tease; ""Jimmy Stewart""...",,,,,,,,,
0,2006-11-03,"The Barrymore Theatre, Madison, WI, USA","Plunger, Got Your Milk (Right Here) > Wallet...","All In Time -> Smell the Mitten, Wife Soup, ...",Prowler > Glory > All In Time[4],2 review(s),,,"{'1': 'with Brendan and Jake on acoustics', '2...",,,That's the Way[5],,,,,,
0,2005-02-18,"The Barrymore Theatre, Madison, WI, USA","Hangover -> Who Knows > Hangover, Tribute to...","Atmosfarag[2] > The Bottom Half, Der Bluten ...",Great American > Walletsworth,1 review(s),,"[""with The Woody Woodpecker theme (George Tibb...",{'1': 'with Top Gun Anthem (Harold Faltermeyer...,,,,,,,,,
0,2005-06-03,"The Barrymore Theatre, Madison, WI, USA",Andy's Last Beer > Uncle Wally > Andy's Last...,"The Triple Wide -> 2x2, Black Sabbath > War ...",Soul Food I[3] > In the Hall of the Mountain ...,,,,"{'1': 'with Muffburger Sandwich, Music for a F...",,,,,,,,,
