In [1]:
import os
import re
import time
from datetime import datetime, timedelta
from getpass import getpass

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [2]:
def clean_string(s):
    """Return a lowercased copy of ``s`` with disallowed characters removed.

    After lowercasing, only the characters ``a-z``, ``0-9``, comma,
    ampersand and space are kept; every other character is deleted
    (not replaced), so neighbouring characters close up.
    """
    # Negated character class: matches exactly the characters to delete
    disallowed = re.compile(r'[^a-z0-9,& ]')
    return disallowed.sub('', s.lower())

In [20]:
def scrape_chart(url, date):
    """Scrape one chart page into a DataFrame with one row per chart entry.

    The page is loaded in the module-level Selenium ``driver``, which must
    already exist (and be logged in, if the page requires it) before this
    function is called.

    Parameters
    ----------
    url : str
        Address of the chart page to load.
    date : str
        Chart date in ``YYYY-MM-DD`` form. It is stored on every row and
        parsed to a datetime; values that do not parse become ``NaT``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Rank``, ``Change``, ``Track``, ``Artist``, ``Peak``,
        ``Prev``, ``Streak``, ``City``, ``Date`` and ``Chart``. ``Rank`` is
        an int; ``Peak``, ``Prev`` and ``Streak`` are kept as the raw cell
        text. The frame is empty (but has all columns) when no usable rows
        are found.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the page has no ``h1`` or no ``[data-encore-id="table"]`` element.
    ValueError
        If the first line of a row's rank cell is not an integer.
    """
    driver.get(url)

    # Page heading; once the "Local Pulse " prefix is removed, what is left
    # is stored in the City column.
    chart_title = driver.find_element(By.CSS_SELECTOR, 'h1').text

    # Strip BEFORE substituting so stray leading whitespace cannot stop the
    # "^Local Pulse " anchor from matching.
    territory = re.sub(r"^Local Pulse ", "", chart_title.strip())

    # Locate the table element using 'data-encore-id'
    table_element = driver.find_element(By.CSS_SELECTOR, '[data-encore-id="table"]')
    rows = table_element.find_elements(By.TAG_NAME, "tr")

    table_data = []

    for row in rows:
        cells = row.find_elements(By.TAG_NAME, "td")

        # Cells 1-5 are all read below, so any row with fewer than six <td>
        # cells (e.g. a row without data cells) is skipped instead of
        # raising IndexError part-way through.
        if len(cells) < 6:
            continue

        # Cell 1: rank on the first line, change indicator (if any) on the second
        rank_and_chg = cells[1].text.split("\n")
        rank = int(rank_and_chg[0].strip())
        chg = rank_and_chg[1].strip() if len(rank_and_chg) >= 2 else ''

        # Cell 2: track title on the first line, artist on the second.
        # When only one line is present it is treated as the artist and the
        # track is left blank.
        subject = cells[2].text.split("\n")
        if len(subject) >= 2:
            # Read the artist first: `subject` is overwritten right after
            artist = subject[1].strip()
            subject = subject[0].strip()
        else:
            artist = subject[0].strip()
            subject = ''

        # Remaining columns are kept as raw text
        peak = cells[3].text
        prev = cells[4].text
        streak = cells[5].text

        # The change text carries no sign, so derive it from prev vs. rank:
        # a larger previous position means the entry moved up ("+").
        if chg not in ('–', 'Re-Entry', 'New'):
            try:
                movement = int(prev) - rank
            except ValueError:
                # Previous position is not numeric: direction is unknown,
                # so leave the change text unsigned rather than abort.
                movement = 0
            if movement > 0:
                chg = f"+{chg}"
            elif movement < 0:
                chg = f"-{chg}"

        table_data.append([rank, chg, subject, artist, peak, prev, streak, territory, date])

    df = pd.DataFrame(table_data,
            columns=["Rank", "Change", "Track", "Artist", "Peak", "Prev", "Streak", "City", "Date"])

    df['Chart'] = 'City Weekly Pulse Songs'
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')

    return df

In [16]:
def get_most_recent_thursday(today=None, weeks_back=1):
    """Return a recent Thursday formatted as ``YYYY-MM-DD``.

    Despite the name, the default result is NOT the latest Thursday: it is
    the Thursday ``weeks_back`` (default 1) weeks before it. If the reference
    day is itself a Thursday, that day counts as the latest Thursday.
    NOTE(review): the one-week lag is presumably there because the newest
    chart is not yet published -- confirm.

    Parameters
    ----------
    today : datetime.date or datetime.datetime, optional
        Reference day. Defaults to ``datetime.today()``.
    weeks_back : int, optional
        Whole weeks to step back from the latest Thursday on or before
        ``today``. ``0`` gives that Thursday itself; the default ``1``
        reproduces the original hard-coded 7-day offset.

    Returns
    -------
    str
        The selected Thursday as ``YYYY-MM-DD``.
    """
    if today is None:
        today = datetime.today()

    # weekday(): Monday == 0 ... Thursday == 3; the modulo wraps Mon-Wed
    # back to the previous week's Thursday.
    days_since_thursday = (today.weekday() - 3) % 7
    target = today - timedelta(days=days_since_thursday + 7 * weeks_back)
    return target.strftime("%Y-%m-%d")

In [17]:
# City slugs, in alphabetical order. Each one is spliced into the chart URL
# as "...citypulsetrack-<slug>-weekly/<date>".
cities = [
    'anaheim', 'atlanta', 'austin',
    'charlotte', 'chicago', 'cleveland',
    'dallas', 'denver', 'detroit',
    'houston',
    'indianapolis',
    'lasvegas', 'losangeles',
    'memphis', 'miami', 'minneapolis',
    'nashville', 'neworleans', 'newyorkcity',
    'omaha',
    'philadelphia', 'phoenix', 'pittsburgh', 'portland',
    'sacramento', 'saltlakecity', 'sanantonio', 'sandiego',
    'sanfrancisco', 'sanjuan', 'seattle', 'stlouis',
    'tampa',
    'washington',
]

In [34]:
# Path to the chromedriver executable
chromedriver_path = "path/to/file"
service = Service(executable_path=chromedriver_path)

options = Options()
# options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Credentials are read from the environment so they never get saved in the
# notebook; when a variable is unset the user is prompted instead (the
# password prompt does not echo). Non-interactive runs must set both variables.
spotify_username = os.environ.get("SPOTIFY_USERNAME") or input("Spotify username: ")
spotify_password = os.environ.get("SPOTIFY_PASSWORD") or getpass("Spotify password: ")

# Initialize the WebDriver
# NOTE(review): the browser session is left open after this cell, as before;
# call driver.quit() once nothing else needs `driver`.
driver = webdriver.Chrome(service=service, options=options)

# Set the implicit wait BEFORE the first element lookup so the login fields
# get up to 10 seconds to appear.
driver.implicitly_wait(10)

# Open the Spotify login page
driver.get("https://accounts.spotify.com/en/login")

# Maximize window (optional)
driver.maximize_window()

# Locate the username and password fields and fill them in
username_field = driver.find_element(By.ID, "login-username")
password_field = driver.find_element(By.ID, "login-password")

username_field.send_keys(spotify_username)
password_field.send_keys(spotify_password)

login_button = driver.find_element(By.ID, "login-button")
login_button.click()

# Fixed pause after submitting the login form, before the first chart page
# is requested.
time.sleep(10)

url = "https://charts.spotify.com/charts/view/citypulsetrack"
date = get_most_recent_thursday()

# Collect one frame per city and concatenate once at the end; calling
# pd.concat inside the loop would re-copy every earlier row each iteration.
city_frames = []
for city in cities:
    print(city)
    city_url = url + f"-{city}-weekly/{date}"
    df = scrape_chart(city_url, date)
    city_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame
all_cities_df = pd.concat(city_frames, ignore_index=True) if city_frames else pd.DataFrame()

anaheim
atlanta
austin
charlotte
chicago
cleveland
dallas
denver
detroit
houston
indianapolis
lasvegas
losangeles
memphis
miami
minneapolis
nashville
neworleans
newyorkcity
omaha
philadelphia
phoenix
pittsburgh
portland
sacramento
saltlakecity
sanantonio
sandiego
sanfrancisco
sanjuan
seattle
stlouis
tampa
washington


In [54]:
# Destination of the combined CSV export
download_file = "path/to/file"

# index=True is the pandas default, spelled out here: the frame's index is
# written as the first, unnamed CSV column.
all_cities_df.to_csv(download_file, index=True)