# In [58]: (notebook cell prompt left over from the export)
#import libraries
import requests
import re
from bs4 import BeautifulSoup
import csv

#function that gets all html pages of past tour events
def pages():
    """Collect the URL of every page of the past-tours listing.

    Starts at the first listing page and keeps following the anchor with
    class ``page-next`` until a page has no such anchor.

    Returns:
        list of page URLs in the order they were visited, beginning with the
        first listing page.

    Raises:
        Whatever ``requests.get`` raises on a network failure.
    """
    url = 'https://www.metallica.com/tour/past/'
    page_lst = [url]
    #urls already queued, so a "next" link that points backwards cannot loop forever
    seen = {url}

    while True:
        result = requests.get(url)
        soup = BeautifulSoup(result.content,'lxml')

        #find next page link; a page without one is the last page
        next_link = soup.find('a',{'class':'page-next'})
        if next_link is None:
            break

        url = next_link.get('href')
        #stop on an anchor without href or on a page we have already visited
        if not url or url in seen:
            break
        seen.add(url)
        page_lst.append(url)
    return page_lst

#master list of tour event urls; events() appends to it page by page
events_lst = list()

#function to get tour events from each page of past tour events
def events(url):
    """Append the tour event links found on one listing page to events_lst.

    Args:
        url: address of a past-tours listing page.

    Returns:
        The module-level events_lst (the same list object that was extended),
        so it also holds links gathered by earlier calls.
    """
    #download and parse the listing page
    response = requests.get(url)
    page = BeautifulSoup(response.content,'lxml')

    #every event on the page is an anchor with class 'venue-city'
    anchors = page.find_all('a',{'class':'venue-city'})
    events_lst.extend(anchor.get('href') for anchor in anchors)
    return events_lst

#full state/territory name -> two letter abbreviation (built once, not on every call)
_STATE_ABBREVIATIONS = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "D.C.": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}


#helper that turns the raw location text into (location, city, state, country)
def _parse_location(raw_location):
    """Split 'City[, State], Country' text into normalized fields.

    Args:
        raw_location: location text exactly as scraped from the page.

    Returns:
        tuple (location, city, state, country). state is '' when the text has
        no state field and None when a state longer than two characters is
        not in the abbreviation table.
    """
    parts = raw_location.split(', ')
    if len(parts) >= 3:
        #last two fields are state and country; anything before them is kept as the city
        city = ', '.join(parts[:-2])
        state, country = parts[-2], parts[-1]
    elif len(parts) == 2:
        city, country = parts
        state = ''
    else:
        #single field (or no location found): keep it as the city instead of failing
        city, state, country = parts[0], '', ''

    #if country field has a state inside convert to state and make country united states (2 events have GA/FL as country)
    if len(country) == 2:
        state = country
        country = 'United States'

    #if states are not abbreviated convert to abbreviation (None when not in the table)
    if len(state) > 2:
        state = _STATE_ABBREVIATIONS.get(state)

    #concatenate city, state, and country into new single string
    if state is None:
        location = city+', '+country
    elif len(state) == 2:
        location = city+', '+state+', '+country
    else:
        #no usable state: keep the text as it was scraped
        location = raw_location
    return location, city, state, country


#create a function to input tour event url and return location, date, setlist
def tour_event(url):
    """Scrape one tour event page.

    Side effect: every song of the returned setlist is also appended to the
    module-level songs_lst.

    Args:
        url: address of the individual tour event page.

    Returns:
        list [url, date, location, city, state, country, song 1, song 2, ...]
        where each song title appears once, in page order. date is '' when no
        'Month D, YYYY' text is found in the page's h4 headings.
    """
    #get contents of individual setlist website
    result = requests.get(url)
    soup = BeautifulSoup(result.content,'lxml')

    #get location of show
    location_links = soup.find_all('a',title=re.compile(r'Search for Events in: '))
    raw_location = ''.join(link.text for link in location_links)
    location, city, state, country = _parse_location(raw_location)

    # get date of show (first 'Month D, YYYY' found in the h4 headings)
    heading_text = ''.join(heading.text for heading in soup.find_all('h4'))
    date_match = re.search(r'[\w]+ \d+, \d{4}', heading_text)
    date = date_match.group(0) if date_match else ''

    #get setlist of show (no duplicate songs, page order kept);
    #duplicates are checked against song titles only, not against the url/date/location fields
    song_links = soup.find_all('a',{'class':'songName'})
    setlist = list(dict.fromkeys(link.text for link in song_links))

    #record the songs for the overall play counts
    songs_lst.extend(setlist)

    #store tour event data in list [url, date, location, city, state, country, setlist...]
    return [url, date, location, city, state, country] + setlist

#list of every song played; tour_event() adds each show's setlist to it
songs_lst = []
#song title -> play count; seeded with the header row of the stats csv
song_counts = dict(Song='Total Plays')

#create csv with one row per tour event: url, date, location fields, then the setlist
with open('metallica_scrape.csv','w', newline = '', encoding='utf-8') as f:
    writer = csv.writer(f)

    #write headers (longest show ever played was 25 songs long--4/8/1992)
    headers = ['URL', 'Date', 'Location', 'City', 'State', 'Country']
    headers += ['Song ' + str(number) for number in range(1, 26)]
    writer.writerow(headers)

    #for every pages url, get a list of events and add to a master list of all the events...
    for url in pages():
        events(url)
    #... for every event write to csv 'date, city, state, and country and setlist'
    for url in events_lst:
        writer.writerow(tour_event(url))
#no explicit close needed: the with statement closes the file on exit

#tally the songs: one count per show in which the song appears in songs_lst
for title in songs_lst:
    plays_so_far = song_counts.get(title, 0)
    song_counts[title] = plays_so_far + 1

#get summary statistics in different csv: one row per song_counts entry (key, value)
with open('metallica_scrape_stats.csv','w', newline = '', encoding='utf-8') as f:
    writer = csv.writer(f)
    #rows come out in the dictionary's insertion order
    writer.writerows(song_counts.items())
#no explicit close needed: the with statement closes the file on exit