# In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import pandas as pd


def fetch_all_setlist_urls(artist_url, driver):
    """Return absolute URLs of the setlists linked from an artist page.

    Args:
        artist_url: URL of the artist's setlist overview page.
        driver: Selenium WebDriver used to load the page.

    Returns:
        A list of URLs, one per ``<a class="summary url">`` anchor on the
        loaded page, each resolved against ``https://www.setlist.fm``.
    """
    base_url = "https://www.setlist.fm"

    driver.get(artist_url)
    # Fixed pause before reading the HTML (presumably to let the page finish
    # loading -- confirm whether an explicit wait would be more appropriate).
    time.sleep(5)
    page = BeautifulSoup(driver.page_source, 'html.parser')

    setlist_urls = []
    for anchor in page.find_all('a', href=True, class_='summary url'):
        setlist_urls.append(urljoin(base_url, anchor['href']))
    return setlist_urls

def _is_clock_time(text):
    """Return True when *text* is a string containing "AM" or "PM".

    BeautifulSoup passes ``None`` to a ``string=`` callable for tags that do
    not have exactly one string child, so ``None`` must be rejected first.
    """
    return text is not None and ("PM" in text or "AM" in text)


def _is_duration(text):
    """Return True when *text* is a string containing both "h" and "m".

    ``None`` is rejected for the same reason as in ``_is_clock_time``.
    """
    return text is not None and "h" in text and "m" in text


def _text_or_default(element, default):
    """Return the stripped text of *element*, or *default* if it is None."""
    return element.text.strip() if element is not None else default


def fetch_setlist_data(url, driver):
    """Scrape a single setlist page into a one-row DataFrame.

    Args:
        url: URL of the setlist page to load.
        driver: Selenium WebDriver used to load the page.

    Returns:
        A ``pandas.DataFrame`` with exactly one row and the columns
        'Artist', 'Date and Location', 'Location Name', 'Start Time',
        'Tour Average Duration' and 'Songs'. The 'Songs' cell holds the list
        of song titles so that ``DataFrame.explode('Songs')`` produces one
        row per song. Fields missing from the page are filled with a
        "... not found" placeholder string ('Songs' becomes an empty list).
    """
    driver.get(url)
    # Fixed pause before reading the HTML (presumably to let the page finish
    # loading -- confirm whether an explicit wait would be more appropriate).
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    artist_element = soup.find(class_='setlistHeadline')
    if artist_element is not None:
        artist_text = artist_element.text.strip()
        print(f"Raw artist text: {artist_text}")
        # Keep only what precedes " at " (when present) and drop the word
        # "Setlist" from the headline text.
        headline = artist_text.split(' at ')[0] if ' at ' in artist_text else artist_text
        artist_name = headline.replace('Setlist', '').strip()
    else:
        artist_name = "Artist not found"

    date_location_element = soup.find('div', class_='dateBlock')
    if date_location_element is not None:
        # Collapse all runs of whitespace into single spaces.
        date_location = ' '.join(date_location_element.text.split())
    else:
        date_location = "Date and location not found"

    # select_one returns None when the link is absent; fall back to a
    # placeholder like the other fields instead of raising AttributeError.
    location_name = _text_or_default(
        soup.select_one('a[title^="More setlists from"] > span'),
        "Location name not found",
    )

    start_time = _text_or_default(
        soup.find('div', class_='mainTime', string=_is_clock_time),
        "Start time not found",
    )

    tour_avg_duration = _text_or_default(
        soup.find('div', class_='mainTime', string=_is_duration),
        "Tour duration not found",
    )

    songs_list_element = soup.find('ol', class_='songsList')
    if songs_list_element is not None:
        songs = [
            song_part.text.strip()
            for song_part in songs_list_element.find_all('div', class_='songPart')
        ]
    else:
        songs = []

    data = {
        'Artist': artist_name,
        'Date and Location': date_location,
        'Location Name': location_name,
        'Start Time': start_time,
        'Tour Average Duration': tour_avg_duration,
        # The row is built from a list of dicts, so the list itself is the
        # cell value. Wrapping it in another list would nest it one level
        # too deep for a single explode() to split it into songs.
        'Songs': songs,
    }

    return pd.DataFrame([data])



import os

def main(
    artist_main_url='https://www.setlist.fm/setlists/olivia-rodrigo-bc9194e.html',
    csv_file_path='setlist_data_detailed.csv',
    chromedriver_path='/Users/kater/Downloads/chromedriver-mac-x64/chromedriver',
):
    """Scrape every setlist linked from an artist page and append it to a CSV.

    Args:
        artist_main_url: setlist.fm artist page to scrape; change this to the
            artist whose data is wanted.
        csv_file_path: Output CSV path. Rows are appended (without a header)
            when the file already exists; otherwise the file is created with
            a header row.
        chromedriver_path: Path to the chromedriver executable.

    The browser session is always closed, even if scraping fails part-way.
    If no setlist links are found, the CSV file is left untouched.
    """
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service)

    all_setlists_data = []
    try:
        setlist_urls = fetch_all_setlist_urls(artist_main_url, driver)
        for url in setlist_urls:
            print(f"Processing setlist URL: {url}")
            all_setlists_data.append(fetch_setlist_data(url, driver))
    finally:
        # Release the browser/driver process regardless of scraping errors.
        driver.quit()

    if not all_setlists_data:
        # pd.concat raises ValueError on an empty list; report and stop.
        print("No setlists were found; the CSV file was not modified.")
        return

    combined_df = pd.concat(all_setlists_data, ignore_index=True)
    exploded_df = combined_df.explode('Songs')

    # Append to an existing file without repeating the header row.
    file_exists = os.path.exists(csv_file_path)
    exploded_df.to_csv(
        csv_file_path,
        mode='a' if file_exists else 'w',
        header=not file_exists,
        index=False,
    )

    print("CSV file has been updated with the latest detailed data for all setlists.")

# Run the scraper only when executed as a script. main() prints its own
# success message, so no unconditional module-level print is needed here.
if __name__ == "__main__":
    main()


# Output of the cell above (notebook traceback):
#
# SessionNotCreatedException: Message: session not created: DevToolsActivePort file doesn't exist
# Stacktrace:
# 0   chromedriver                        0x00000001044ee188 chromedriver + 4596104
# 1   chromedriver                        0x00000001044e5ef3 chromedriver + 4562675
# 2   chromedriver                        0x00000001040e939a chromedriver + 381850
# 3   chromedriver                        0x00000001041215c4 chromedriver + 611780
# 4   chromedriver                        0x000000010411cec3 chromedriver + 593603
# 5   chromedriver                        0x00000001041191a3 chromedriver + 577955
# 6   chromedriver                        0x0000000104160ea9 chromedriver + 872105
# 7   chromedriver                        0x0000000104154ee3 chromedriver + 823011
# 8   chromedriver                        0x0000000104125be4 chromedriver + 629732
# 9   chromedriver                        0x000000010412679e chromedriver + 632734
# 10  chromedriver                        0x00000001044b40a2 chromedriver + 4358306
# 11  chromedriver                        0x00000001044b8ced chromedriver + 4377837
# 12  chromedriver                        0x00000001044b8663 chromedriver + 4376163
# 13  chromedriver                        0x00000001044b8f95 chromedriver + 4378517
# 14  chromedriver                        0x000000010449dac5 chromedriver + 4266693
# 15  chromedriver                        0x00000001044b931d chromedriver + 4379421
# 16  chromedriver                        0x0000000104490110 chromedriver + 4210960
# 17  chromedriver                        0x00000001044d6b58 chromedriver + 4500312
# 18  chromedriver                        0x00000001044d6cd1 chromedriver + 4500689
# 19  chromedriver                        0x00000001044e5b33 chromedriver + 4561715
# 20  libsystem_pthread.dylib             0x00007ff8081e54f4 _pthread_start + 125
# 21  libsystem_pthread.dylib             0x00007ff8081e100f thread_start + 15
