<img src="header.png">


    

In [1]:
import pandas as pd
import numpy as np
import regex as re
import selenium

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm_notebook

## Let's have a look at the structure of the web content on metallica.com and work out the best strategy to scrape the data we want

#### Key Observations: 

There are c. 100 pages that cover all gigs from 1982 to 2018. Each of these pages has its own unique URL. Let's call these 'page URLs'. 
    
Each of these 100 page URLs leads to a page containing links to 21 gigs (each accessible via a 'more information' button on that page). 

Therefore it seems that the most efficient strategy is to first scrape the 100 page URLs and then, for each page URL, scrape the 21 gig URLs it contains. The ultimate aim is to create a list of *all* gig URLs - which should be about 2,100 in total. 

Once we have all the gig URLs we can run a script to scrape the relevant gig data from each of them sequentially in a browser.

## Obtaining page URLs

In [7]:
# Accumulator for the paginated tour-listing URLs ("page URLs").
page_urls = list()

In [8]:
# Crawl the paginated "past tours" listing by following each page's "next" link.
FIRST_PAGE_URL = 'https://www.metallica.com/tour/past/?sz=21&start=0'
MAX_PAGES = 1000  # safety cap on the number of pages followed

# Start from a fresh list so that re-running this cell does not duplicate entries.
page_urls = [FIRST_PAGE_URL]

driver = webdriver.Chrome('./chromedriver')
try:
    for _ in range(MAX_PAGES):
        try:
            driver.get(page_urls[-1])
            next_link = driver.find_element_by_class_name('page-next')
            next_url = next_link.get_attribute('href')
        except Exception:
            # No "next" link on this page (or the page failed to load):
            # there is nothing further to follow.
            break

        # A missing/empty href, or a link back to a page already visited,
        # also marks the end of the listing. Checking explicitly keeps None
        # and duplicate entries out of page_urls.
        if not next_url or next_url in page_urls:
            break
        page_urls.append(next_url)
finally:
    # Always close the browser, even if the crawl is interrupted.
    driver.quit()

In [12]:
# Put the page URLs into a one-column frame and preview the first rows.
# The column is named at construction time: assigning ``.columns`` afterwards
# raises a length-mismatch error when the crawl returned no URLs at all.
page_urls_df = pd.DataFrame(page_urls, columns=['Page URLs'])
page_urls_df.head(10)

Unnamed: 0,Page URLs
0,https://www.metallica.com/tour/past/?sz=21&sta...
1,https://www.metallica.com/tour/past/?sz=21&sta...
2,https://www.metallica.com/tour/past/?sz=21&sta...
3,https://www.metallica.com/tour/past/?sz=21&sta...
4,https://www.metallica.com/tour/past/?sz=21&sta...
5,https://www.metallica.com/tour/past/?sz=21&sta...
6,https://www.metallica.com/tour/past/?sz=21&sta...
7,https://www.metallica.com/tour/past/?sz=21&sta...
8,https://www.metallica.com/tour/past/?sz=21&sta...
9,https://www.metallica.com/tour/past/?sz=21&sta...


In [13]:
# Keep a copy of the page URLs on disk in case they are needed again later.
page_urls_df.to_csv(path_or_buf='Metallica_Page_URLS.csv', index=False)

## Obtaining gig URLs

In [None]:
# Accumulator for the individual gig (event) URLs.
gig_url = list()

In [None]:
# Visit every listing page and build the event URL for each gig shown on it.
# Start from a fresh list so that re-running this cell does not duplicate entries.
gig_url = []

driver = webdriver.Chrome('./chromedriver')
try:
    for page_url in page_urls:
        driver.get(page_url)

        for show in driver.find_elements_by_class_name('show'):
            show_id = show.get_attribute('data-show-id')
            # Formatting the URL can never raise, so a missing id has to be
            # checked explicitly; otherwise it would silently produce a bogus
            # ".../events/None.html" entry. Such elements are skipped.
            if not show_id:
                continue
            gig_url.append(f'https://www.metallica.com/events/{show_id}.html')
finally:
    # Always close the browser, even if the crawl is interrupted.
    driver.quit()

In [None]:
# Sanity check: wrap the gig URLs in a frame and preview the first rows.
gig_urls_df = pd.DataFrame(data=gig_url)
gig_urls_df.head(n=5)

In [None]:
# Keep a copy of the gig URLs on disk in case they are needed again later.
gig_urls_df.to_csv(path_or_buf='Metallica_Gig_URLS', index=False)

In [234]:
# One independent empty list per field collected from each gig URL.
(Date, Venue, City_Country, Tour, Set,
 Encores, Number_of_Encores, Set_Length, URL) = ([] for _ in range(9))

In [369]:
def removenumbers(x):
    """Return ``x`` with every digit character removed.

    Parameters
    ----------
    x : str
        Text to strip digits from.

    Returns
    -------
    str
        ``x`` without any characters matched by the ``\\d`` pattern.
    """
    # Raw string: "\d" inside a plain string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"\d", "", x)

def location(x):
    """Return the first line of ``x`` (everything before the first newline).

    The scraping loop uses this on the gig header text to get the gig location.
    """
    first_line, _newline, _remainder = x.partition('\n')
    return first_line

def venue(x):
    """Return the part of the second line of ``x`` that precedes the first ' / '.

    The scraping loop uses this on the gig header text to get the venue.
    Raises IndexError when ``x`` has no second line.
    """
    second_line = x.split('\n')[1]
    name, _separator, _rest = second_line.partition(' / ')
    return name

def date(x):
    """Return the second ' / '-separated field on the second line of ``x``.

    The scraping loop uses this on the gig header text to get the gig date.
    Raises IndexError when ``x`` has no second line or that line has no ' / '.
    """
    return x.split('\n')[1].split(' / ')[1]

def tour_clean(x):
    """Return the second line of ``x``.

    Raises IndexError when ``x`` contains no newline.
    """
    # Two splits are enough to isolate the second line from whatever follows.
    pieces = x.split('\n', 2)
    return pieces[1]

In [370]:
def set_extractor(x):
    """Return the digit-free text of ``x`` that precedes the first 'ENCORE'.

    Digits are stripped with ``removenumbers`` first; if 'ENCORE' never
    appears, the whole digit-free text is returned.
    """
    main_set, _marker, _encores = removenumbers(x).partition('ENCORE')
    return main_set

def set_clean(x):
    """Split ``x`` on blank lines into a list of entries.

    The first chunk (the text before the first blank line) is dropped and any
    remaining newline characters are removed from each kept entry.
    """
    chunks = x.split('\n\n')
    return [chunk.replace('\n', '') for chunk in chunks[1:]]

In [371]:
def encore_extractor(x):
    """Return, as a string, the list of digit-free chunks following each 'ENCORE'.

    Digits are stripped with ``removenumbers``, the text is split on 'ENCORE',
    the leading (pre-encore) chunk is discarded and the rest is passed through
    ``str()`` - so the result is the printed form of a Python list.
    """
    _main_set, *encore_chunks = removenumbers(x).split('ENCORE')
    return str(encore_chunks)

def encore_clean(x):
    """Turn the stringified encore list from ``encore_extractor`` into entries.

    Removes ' #', the list brackets/quotes and the element separators from the
    printed list, splits on the escaped blank-line marker (a literal
    backslash-n pair, twice) and drops the first piece.
    """
    stripped = x
    # Order matters: each token is removed from the result of the previous step.
    for token in (' #', "['", "']", "\\n', '"):
        stripped = stripped.replace(token, '')
    entries = stripped.split('\\n\\n')
    return entries[1:]

In [372]:
def tour_finder(x):
    """Return the line that follows the 'TOUR NAME' heading in ``x``.

    Parameters
    ----------
    x : str
        Text to search for the 'TOUR NAME' heading.

    Returns
    -------
    str or float
        The line after the heading, or ``np.nan`` when the heading (or the
        line after it) is missing, or when ``x`` is not a string.
    """
    try:
        after_heading = x.split('TOUR NAME')[1]
        return after_heading.split('\n')[1]
    # Only the "not text" / "heading not found" failures are treated as
    # missing data; a bare ``except`` would also swallow KeyboardInterrupt.
    except (AttributeError, IndexError, TypeError):
        return np.nan

In [373]:
def other_acts(x):
    """Return the line that follows the 'OTHER ACTS' heading in ``x``.

    Parameters
    ----------
    x : str
        Text to search for the 'OTHER ACTS' heading.

    Returns
    -------
    str or float
        The line after the heading, or ``np.nan`` when the heading (or the
        line after it) is missing, or when ``x`` is not a string.
    """
    try:
        after_heading = x.split('OTHER ACTS')[1]
        return after_heading.split('\n')[1]
    # Only the "not text" / "heading not found" failures are treated as
    # missing data; a bare ``except`` would also swallow KeyboardInterrupt.
    except (AttributeError, IndexError, TypeError):
        return np.nan
       
def other_act_split(x):
    """Split a comma-separated string of acts into a list.

    Parameters
    ----------
    x : str or iterable
        Comma-separated act names. Values without a usable ``split`` method
        are passed to ``list()`` instead.

    Returns
    -------
    list
        The comma-separated pieces of ``x`` (whitespace is not stripped), or
        ``list(x)`` for non-string iterables.

    Raises
    ------
    TypeError
        If ``x`` is neither a string nor iterable (e.g. ``np.nan``); callers
        that may pass missing values need to handle this.
    """
    try:
        return x.split(',')
    # Narrowed from a bare ``except`` so that KeyboardInterrupt and similar
    # are no longer swallowed by the fallback.
    except (AttributeError, TypeError):
        return list(x)

In [374]:
# Load the gig URLs saved by the URL-acquisition step.
gig_urls = pd.read_csv(filepath_or_buffer='../1.1_Data_Acquisition_URLs/Metallica_Gig_URLS')

In [375]:
# Number of gigs to scrape (one row per gig URL) - 2070 at the time of the run.
gig_urls.shape[0]

2070

In [376]:
# Plain Python list of the gig URLs (stored in the CSV under the column named '0').
all_gigs = gig_urls['0'].tolist()

In [378]:
#set up empty lists - one per output column, rebuilt from scratch on every run
Date = []
Venue = []
City_Country = []
Tour = []
Set = []
Encores = []
Encores_Count = []
Set_Length = []
Other_Acts = []
URL = []

# Where each piece of information sits on a gig page.
HEADER_XPATH = '//*[@id="primary"]/div[1]/div/div'
CONTENT_XPATH = '//*[@id="primary"]/div[2]'
# The set list appears in one of two places across the gig pages; tried in order.
SETLIST_XPATHS = (
    '//*[@id="primary"]/div[2]/div/div/div[1]/div[1]/div[2]',
    '//*[@id="primary"]/div[2]/div/div/div[1]/div[1]/div',
)


def _first_text(driver, xpaths):
    """Return the text of the first element matched by any of ``xpaths``, else None."""
    for xpath in xpaths:
        try:
            return driver.find_element_by_xpath(xpath).text
        except Exception:
            # Element not present at this location: try the next candidate.
            continue
    return None


def _or_nan(extract):
    """Call ``extract()`` and return its result, or NaN if it raises."""
    try:
        return extract()
    except Exception:
        return np.nan


def _parse_setlist(text):
    """Return ``(set, encores, encore_count, set_length)`` parsed from ``text``.

    ``set_length`` is the number of set entries plus the number of encore
    entries. All four values are NaN when ``text`` is None or cannot be parsed.
    """
    try:
        main_set = set_clean(set_extractor(text))
        encores = encore_clean(encore_extractor(text))
        encore_count = len(str(text).split('ENCORE')) - 1
        return main_set, encores, encore_count, len(main_set) + len(encores)
    except Exception:
        return np.nan, np.nan, np.nan, np.nan


driver = webdriver.Chrome('./chromedriver')
try:
    for gig in tqdm_notebook(all_gigs):
        driver.get(gig)

        # Each page element is read once; the fields below are derived from the text.
        header_text = _first_text(driver, [HEADER_XPATH])
        content_text = _first_text(driver, [CONTENT_XPATH])
        gig_set, gig_encores, gig_encore_count, gig_set_length = _parse_setlist(
            _first_text(driver, SETLIST_XPATHS))

        # Every list receives exactly one entry per gig, all in one place, so
        # the columns can never drift out of step with each other.

        #=== URL, location and date data - some of the cleaning functions above built in so less to clean afterwards
        #=== URL appending will be good for spot checks to see if any NaN's are genuine
        URL.append(gig)
        City_Country.append(_or_nan(lambda: location(header_text).title()))
        Date.append(_or_nan(lambda: date(header_text).title()))
        Venue.append(_or_nan(lambda: venue(header_text).title()))

        #=== Tour name data
        Tour.append(_or_nan(lambda: tour_finder(content_text)))

        #=== other act data
        Other_Acts.append(_or_nan(
            lambda: [act.strip() for act in other_act_split(other_acts(content_text))]))

        #=== Set and Encore data - NaN if the set list could not be found or parsed
        Set.append(gig_set)
        Encores.append(gig_encores)
        Encores_Count.append(gig_encore_count)
        Set_Length.append(gig_set_length)
finally:
    # Always close the browser, even if the scrape is interrupted part-way.
    driver.quit()

HBox(children=(IntProgress(value=0, max=2070), HTML(value='')))

In [379]:
# Every column list should hold one entry per gig - show the lengths to confirm.
a = [Date, Venue, City_Country, Tour, Set, Encores, Encores_Count, Set_Length, Other_Acts, URL]
list(map(len, a))

[2070, 2070, 2070, 2070, 2070, 2070, 2070, 2070, 2070, 2070]

In [380]:
# Assemble the scraped columns into a single frame, in the order listed.
df = pd.DataFrame(dict(
    Date=Date,
    Venue=Venue,
    City_Country=City_Country,
    Tour=Tour,
    Set=Set,
    Encores=Encores,
    Encores_Count=Encores_Count,
    Set_Length=Set_Length,
    Other_Acts=Other_Acts,
    URL=URL,
))

## Save the DF to CSV

In [383]:
# Write the raw (not yet cleaned) scrape results to disk, without the index column.
df.to_csv(path_or_buf='Metallica_Data_Dirty', index=False)