In [1]:
# imports
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import dill as pickle
from datetime import datetime
from writer import date_stamp, time_stamp
import numpy as np
import re

In [2]:
# Start a Chrome browser session through the chromedriver binary at this path
chromedriver_path = "/Applications/chromedriver"
driver = webdriver.Chrome(chromedriver_path)

### Scrape Setlists

In [3]:
# Navigate the browser to the setlist index page.
# `url` is kept as a module-level name: the scrape loop below appends
# "<year>.html" to it to build each per-year page address.
url = "https://allthings.umphreys.com/setlists/"
driver.get(url)

In [4]:
# Parse the HTML currently loaded in the browser (the setlist index page).
# NOTE(review): no parser is named in the BeautifulSoup call, so the parser
# used presumably depends on what is installed -- consider pinning one.
content = driver.page_source
soup = BeautifulSoup(content)

In [5]:
def offset(a, b):
    """Count how many elements of ``a`` are skipped before each element of ``b``.

    ``b`` is expected to be an in-order subsequence of ``a``. Both lists are
    walked in parallel: every element of ``a`` that does not match the current
    element of ``b`` bumps a running skip counter, and the counter's value is
    recorded each time a match is found. As a result ``b[j]`` is located at
    index ``j + result[j]`` in ``a``.

    Parameters:
        a: the full list (here: every song name of a set).
        b: the subsequence to locate (here: the names of the linked songs).

    Returns:
        A list with one skip count per element of ``b``.

    Raises:
        IndexError: if an element of ``b`` is not found in the rest of ``a``.
    """
    off = 0
    ret = []
    a_i = 0
    b_i = 0
    while b_i < len(b):
        if a[a_i] == b[b_i]:
            a_i += 1
            b_i += 1
            ret.append(off)
        else:
            a_i += 1
            off += 1
    return ret


# One dict per scraped show: date, title, sets, reviews, support, notes, tags
shows = []

# iterate through the years
# (the iterable below is evaluated once, before the loop body rebinds `soup`,
# so reusing the name inside the loop does not disturb this iteration)
for year in soup.findAll('a', attrs={'class':'setlistbutton btn btn-small btn-default'}):
    year_url = url + year['rel'][0] + ".html"
    # go to year url
    driver.get(year_url)
    content = driver.page_source
    soup = BeautifulSoup(content)
    
    # scrape every show of this year
    for show in soup.findAll('section', attrs={'class':'setlist'}):
        
        # get show date and title of show
        header = show.find('h3', attrs={'class':'splashtitle'})
        date = header.find('a', attrs={'class':'setlistdate'}).findAll(text=True)[0].strip()
        # the title is every text node of the header after the first one
        title = ''.join(header.findAll(text=True)[1:]).strip()
        
        # Ignore Brendan and Jake shows
        if 'Brendan and Jake' in title:
            continue
        
        # get dictionary of setlabel to setlist for every set
        if 'Setlist Unknown (but it was probably awesome)' in str(show.findAll('p',recursive=False, text=True)):
            sets = None
        else:
            sets = {}
            for setlist in show.findAll('p',recursive=False):
                setlabel = ''.join(setlist.find('b', attrs={'class':'setlabel'}).findAll(text=True))
                setlabel = setlabel.strip().rstrip(':')
                
                # Create useful objects representing the setlist.
                # The leading text nodes belong to the set label and are skipped:
                # three for the encore labels listed here, one otherwise.
                setlist_str = setlist.findAll(text=True)
                set_label_parts = 3 if setlabel in ['1st Encore', '2nd Encore', '3rd Encore'] else 1
                songs_transitions_tags = [x.strip() for x in setlist_str[set_label_parts:] if x.strip() != '']
                
                # Manually add missing transitions on this date
                # (.index raises ValueError if either song is absent from the set being processed)
                if date == '02.21.2004':
                    i = songs_transitions_tags.index('All In Time')
                    songs_transitions_tags[i:i+1] = ['All In Time', ',']
                    i = songs_transitions_tags.index('Waiting Room')
                    songs_transitions_tags[i:i+1] = [',', 'Waiting Room']
                
                # Extract transitions from unlinked to songs.
                # A token that still carries a transition marker is split in place and
                # then re-examined (hence `continue` without advancing i), so a token
                # with markers on both ends is split twice.
                i = 0
                while i < len(songs_transitions_tags):
                    x = songs_transitions_tags[i]
                    x_split = x.split(' ')
                    if len(x_split) > 1:
                        out_trans = x_split[-1]
                        if out_trans in [',', '>', '->']:
                            songs_transitions_tags[i:i+1] = [x.rstrip(out_trans).strip(), out_trans]
                            continue
                        in_trans = x_split[0]
                        if in_trans in [',', '>', '->']:
                            songs_transitions_tags[i:i+1] = [in_trans, x.lstrip(in_trans).strip()]
                            continue
                    # trailing comma glued to the name without a separating space
                    if len(x) > 1 and x[-1] == ',':
                        songs_transitions_tags[i:i+1] = [x.rstrip(',').strip(), ',']
                        continue
                    i += 1
                
                # Manual change: split a token holding two unknown songs, when present
                try:
                    i = songs_transitions_tags.index('unknown song,  unknown song')
                    songs_transitions_tags[i:i+1] = ['unknown song', ',', 'unknown song']
                except ValueError:
                    # .index found no such token in this set -- nothing to fix
                    pass
                
                # Remove tags from setlist object maintaining the index of the corresponding song.
                # Each "[n]" token is removed and remembered as (n, index of the token before it).
                i = 0
                tags = []
                songs_transitions = songs_transitions_tags.copy()
                while i < len(songs_transitions):
                    if re.match(r"\[\d+\]", songs_transitions[i]):
                        tag_num = int(songs_transitions[i].strip('[]'))
                        prev_index = i-1
                        tags.append((tag_num, prev_index))
                        songs_transitions.pop(i)
                    else:
                        i += 1
                
                # Create blank dictionary of songs.
                # With the tags removed, even positions are treated as song names
                # and odd positions as the transitions between them.
                songs = []
                for i in range(0, len(songs_transitions), 2):
                    songs.append({'name' : songs_transitions[i],
                                  'stewart' : None,
                                  'in_transition' : None,
                                  'out_transition' : None,
                                  'tag' : None})
                
                # Get Jimmy Stewarts for each song; update dictionary
                linked_songs_elements = setlist.findAll('a', recursive=False)
                linked_songs = [x.findAll(text=True)[0].strip() for x in linked_songs_elements]
                all_songs = [x['name'] for x in songs]
                if len(linked_songs) > 0:
                    stewart_offset = offset(all_songs, linked_songs)
                    for i in range(len(linked_songs_elements)):
                        try:
                            stewart = '_'.join(linked_songs_elements[i]['class'])
                        except KeyError:
                            # link carries no class attribute
                            stewart = None
                        # The i-th linked song is preceded by stewart_offset[i] unlinked
                        # songs, so its position among all songs is i PLUS that count.
                        songs[i + stewart_offset[i]]['stewart'] = stewart
                
                # Get tags for each song; update dictionary
                # (int(x/2) rather than x//2 so that a prev_index of -1 still maps to song 0)
                for tag in tags:
                    song_index = int(tag[1]/2)
                    songs[song_index]['tag'] = tag[0]
                
                # Get transitions for each song; update dictionary.
                # The transition at odd position i leaves the song before it and enters the song after it.
                for i in range(1, len(songs_transitions), 2):
                    out_song_index = (i - 1) // 2
                    songs[out_song_index]['out_transition'] = songs_transitions[i]
                    in_song_index = (i + 1) // 2
                    songs[in_song_index]['in_transition'] = songs_transitions[i]
                
                sets[setlabel] = songs
        
        # get setlist metadata
        meta = ''.join(show.find('div', attrs={'class':'showmeta'}).findAll(text=True))
        
        # get the reviews (if any)
        tmp = meta.split('Reviews:')
        reviews = None if len(tmp) == 1 else tmp[1].strip()
        meta = tmp[0].strip()
        
        # remove the purchase via UMLive info (also removes reviews)
        meta = meta.split('Purchase via UMLive')[0]
        
        # get the support (if any)
        tmp = meta.split('Support:')
        support = None if len(tmp) == 1 else tmp[1].strip(' \n')
        meta = tmp[0].strip()
        
        # get the notes (if any); notes is a list of lines
        tmp = meta.split('Notes:')
        notes = None if len(tmp) == 1 else tmp[1].split('\n')
        meta = tmp[0].strip()
        
        # get the tag captions.
        # Splitting on a captured "[n]" plus one following character yields
        # alternating marker / caption pieces after the first element.
        tags = re.split(r'(\[\d+\].)', meta)[1:]
        tmp = {}
        for i in range(int(len(tags)/2)):
            tmp[int(tags[2*i].strip(' ]['))] = tags[2*i+1]
        tags = tmp
        
        # Skip VIP-only sets. notes is a list of lines, so the phrase has to be
        # searched for inside each line rather than compared against whole lines.
        if notes is not None and any('set for those who purchased the VIP package' in line
                                     for line in notes):
            continue
        
        # add show to shows
        shows.append(dict(date=date,
                          title=title,
                          sets=sets,
                          reviews=reviews,
                          support=support,
                          notes=notes,
                          tags=tags))

In [6]:
# Serialise the scraped list of shows into a time-stamped pickle file
setlists_path = 'scrapes/setlists_%s.pickle' % (time_stamp())
with open(setlists_path, 'wb') as handle:
    pickle.dump(shows, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Scrape Songs

In [7]:
# Navigate the browser to the song index page.
# This rebinds `url`, which previously held the setlist index address.
url = "https://allthings.umphreys.com/song/"
driver.get(url)

In [8]:
# Parse the HTML currently loaded in the browser (the song index page).
# NOTE(review): no parser is named in the BeautifulSoup call, so the parser
# used presumably depends on what is installed -- consider pinning one.
content = driver.page_source
soup = BeautifulSoup(content)

In [9]:
# Build a dict mapping song name -> artist / times played / debut / last played
# from the rows of the song table on the page held in `soup`.
songs = {}
table = soup.find('table', attrs={'id' : 'songtable'})
# the first two rows are skipped (presumably header rows -- TODO confirm)
for row in table.findAll('tr')[2:]:
    tmp = row.findAll('td')
    name = tmp[0].findAll(text=True)[0]
    if name in ['Happy Birthday', 'XRT Xmas Promo Medley']:
        # These two rows get no artist and skip the second cell entirely.
        # Each is stored under its own title; previously both were written to
        # the 'Happy Birthday' key, so one row overwrote the other.
        songs[name] = {'artist' : None,
                       'times_played' : tmp[2].findAll(text=True)[0],
                       'debut_date' : tmp[3].findAll(text=True)[0].replace('-','.'),
                       'last_played' : tmp[4].findAll(text=True)[0].replace('-','.')}
    else:
        # first text node of every cell, stripped
        cells = [cell.findAll(text=True)[0].strip() for cell in tmp]
        # dates are rewritten with '.' separators, the form the setlist dates are compared in
        songs[cells[0]] = {'artist' : cells[1],
                           'times_played' : cells[2],
                           'debut_date' : cells[3].replace('-','.'),
                           'last_played' : cells[4].replace('-','.')}

In [10]:
# Create CSV object: one row per song, one column per attribute.
# NOTE: this rebinds `songs` from a dict to a DataFrame, so re-running this
# cell without first re-running the cell that builds the dict transposes the
# frame a second time.
songs = pd.DataFrame(songs).transpose()

# Deal with song titles that were too long:
# collect every distinct song name seen in the scraped setlists ...
setlist_songs = []
for show in shows:
    setlists = show['sets']
    if setlists is not None:
        for setlist in setlists:
            for song in setlists[setlist]:
                setlist_songs.append(song['name'])
# sorted so that the match picked below is the same on every run
setlist_songs = sorted(set(setlist_songs))

# ... then map each index entry ending in '...' to a setlist name that
# contains its first 32 characters.
too_long_dict = {}
for too_long in [x for x in list(songs.index) if x[-3:] == '...']:
    matches = [x for x in setlist_songs if too_long[:32] in x]
    if matches:
        too_long_dict[too_long] = matches[0]
    # with no match in any scraped setlist the truncated title is kept as is

songs = songs.rename(index=too_long_dict)

songs.to_csv('scrapes/songs_%s.csv' % (time_stamp()))